In [1]:
import os

In [None]:
os.environ["OPENAI_API_KEY"] = input("Paste your OpenAI key here and hit enter:")

In [None]:
#Install Packages
!pip install llama-index==0.5.6
!pip install langchain==0.0.148
!pip install PyPDF2
!pip install streamlit
!pip install openai
# !pip install top2vec
!pip install pdfplumber

In [None]:
!git clone https://github.com/MedX1736/QuizGPT.git

## **Data Extraction From Pdf Files**

In [5]:
import pdfplumber
try :   
  os.mkdir("context")
except :
  print("Data Not Available")
# creating a pdf file object
files = os.listdir("/content/QuizGPT/pdfData/TPRO")
topics =[]
i = 0 
for file in files :
  if file.endswith('.pdf'):
    pdf = pdfplumber.open("/content/QuizGPT/pdfData/TPRO/" + file)
    context= ""
    # Extract and concatenate each page's content
    for page in pdf.pages:
        # creating a page object
        # extracting text  and Titles from pdf
        context += page.extract_text()
        topics.append(page.filter(lambda obj: obj["object_type"] == "char" and "Bold" in obj["fontname"]).extract_text())
    text_output_path = "context/text{}.txt".format(i)
    with open(text_output_path, 'w', encoding='utf-8') as output_file:
          output_file.write(context)
    i = i+1

# **Topics Extraction**

In [6]:
import re
def clean_topics(topics):
  result = []
  for text in topics :
    splitted = text.split("\n")
    for topic in splitted :
      # Remove Greek numbers
      x = re.sub(r'\b[IVX]+\b', '', text)
      x = re.sub(r'^[\d\W_]+\s*', '', topic)
      if x != "" :
        result.append(x.strip())
  result = [topic for topic in result if len(topic) > 4]
  result = [ topic for topic in result if not ("exemple" in topic.lower())]
  result = [ topic for topic in result if not ("chapitre" in topic.lower())] 
  return result

In [7]:
topics = clean_topics(topics)

In [None]:
from top2vec import Top2Vec
import multiprocessing
multiprocessing.cpu_count()

In [None]:
model = Top2Vec(documents=text, speed='learn',workers=multiprocessing.cpu_count())

In [None]:
topics_words, word_scores, topic_nums = model.get_topics()
print(topics_words) # you can directly do model.topic_words[3] too
print(word_scores)

In [8]:
topics

['Concepts préliminaires',
 'I.1. Mesure des algorithmes : O-notation',
 'I.1.1 Introduction',
 'n 40 42 44 45 48 50 52 100 2 . 105 4 . 105 106',
 'I.1.2 Définition',
 'Comportement asymptotique :',
 'Principe d’invariance :',
 'Propriétés de la O-notation',
 'Quelques complexités usuelles',
 'Autres notations',
 'Opérations élémentaires',
 'I.1.3 Règles de calcul de la complexité',
 'I.1.4 Complexité des algorithmes récursifs',
 'a) Application de la première approche « par substitution »',
 'b) Application de la deuxième approche « par vérification »',
 'c) Application de la troisième approche « par identification à des équations connues »',
 'Cas des équations homogènes',
 'Cas des équations non homogènes:',
 'I.1.5 Analyse hybride',
 'Tri par sélection Tri par insertion Tri par fusion Tri de Hoare (qsort)',
 'n O(n2) O(n2) O(n*log n) O(n*log n)',
 'jours 4.23 s 4.56 s',
 'mois 48.51 s 39.45 s',
 'Temps mesuré Temps estimé par la formule',
 'n par les tests sur PC ( 5.65 10-9 * 1.62

In [9]:
len(topics)

231

# **Constructing Index**

In [11]:
import sys
import os
from IPython.display import Markdown, display
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper, ServiceContext
from langchain import OpenAI

def construct_index(directory_path):
    max_input_size = 4096
    num_outputs = 2000
    max_chunk_overlap = 20
    chunk_size_limit = 600 

    # define prompt helpera
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    # define LLM
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.5, model_name="text-davinci-003", max_tokens=num_outputs,language='fr'))
    
    # Get documents
    documents = SimpleDirectoryReader(directory_path).load_data()
    
    # Creating the index
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
    index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)
    index.save_to_disk('index.json')
    return index

In [12]:
index  = construct_index("context")

KeyboardInterrupt: ignored

In [None]:
def generate_question():
  questions = []
  index = GPTSimpleVectorIndex.load_from_disk('index.json')
  for topic in topics :
    query = "Poser une Question sur "+ topic + "Et Donner la sous la forme Question : ... Reponse : ..."
    quiz = index.query(query)
    questions.append(quiz)
  return questions

In [14]:
import random
def generate_question_choix_multp():
  questions = []
  index = GPTSimpleVectorIndex.load_from_disk('index.json')
  random_topics = random.sample(topics, 25)
  for topic in random_topics :
    query = "Poser Une Question de choix multiple sur "+ topic + "Et Donner la reponse, sous la forme Question : A.choix1 B.choix2 C.choix3 ...  Reponse : ..."
    quiz = index.query(query)
    questions.append(quiz)
  return questions

In [None]:
questions =  generate_question()

In [15]:
quizs = generate_question_choix_multp()

In [None]:
with open("questions.txt", "w") as file:
    for question in questions:
        file.write(question.response + "\n")

In [17]:
with open("quizs.txt", "w") as file:
    for quiz in quizs:
        file.write(quiz.response + "\n")

In [None]:
def generate_question_choix_multp_topic(top):
  index = GPTSimpleVectorIndex.load_from_disk('index.json')
  query = "Poser Une Question de choix multiple sur "+ top + "Et Donner la sous la forme Question : ... Reponse : ..."
  quiz = index.query(query)
  questions = quiz 
  return questions

In [None]:
def generate_question_topic(top):
  index = GPTSimpleVectorIndex.load_from_disk('index.json')
  query = "Poser une Question sur "+ top + "Et Donner la sous la forme Question : ... Reponse : ..."
  quiz = index.query(query)
  questions = quiz 
  return questions