In [1]:
!pip install openai pinecone-client pdfplumber sentence-transformers




In [7]:
def summarize_text(text):
    response = openai.Completion.create(
        engine="gpt-3.5-turbo",
        prompt=f"Summarize this: {text}",
        max_tokens=100
    )
    return response['choices'][0]['text'].strip()

In [8]:
import os
import pdfplumber

# Define the folder where your PDFs are located
folder_path = 'C:/Users/Ishtiyak/Desktop/chatbot/documents'

# Function to extract text from all PDFs in a folder
def extract_text_from_all_pdfs(folder_path):
    documents = {}
    
    # Loop through each file in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing file: {filename}")
            with pdfplumber.open(file_path) as pdf:
                full_text = ""
                for page in pdf.pages:
                    full_text += page.extract_text()
            documents[filename] = full_text
    return documents

# Extract text from all PDFs
documents = extract_text_from_all_pdfs(folder_path)

# Check the extracted text from one of the documents
for filename, text in documents.items():
    print(f"First 500 characters of {filename}:")
    print(text[:500])


Processing file: AI-Based_Personalized_E-Learning_Systems_Issues_Challenges_and_Solutions.pdf
Processing file: coursera report.pdf
Processing file: coursera.pdf
Processing file: e-learning-in-theory-practice-and-research.pdf
Processing file: education-13-01216-v2.pdf
Processing file: ELearning-and-a-Case-Study-of-Coursera-and-edX-Online-Platforms.pdf
Processing file: E_learning_Concept_Trends.pdf
Processing file: HOW E-LEARNING PROGRAMS CAN BE MORE.pdf
Processing file: Paper_54-A_Systematic_Literature_Review_on_AI_Algorithms_and_Techniques.pdf
Processing file: The Use of AI in ELearning Recommender Systems.pdf
Processing file: The_Comparison_of_MOOC_Massive_Open_Online_Course_Platforms_of_edX_and_Coursera_Study_Case_Student_of_Programming_Courses.pdf
First 500 characters of AI-Based_Personalized_E-Learning_Systems_Issues_Challenges_and_Solutions.pdf:
Received22June2022,accepted12July2022,dateofpublication26July2022,dateofcurrentversion8August2022.
DigitalObjectIdentifier10.1109/ACCESS.

In [9]:
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Function to chunk text for all documents
def chunk_text_for_all_docs(documents, max_tokens=500):
    all_chunks = {}
    
    for filename, text in documents.items():
        sentences = sent_tokenize(text)
        chunks = []
        chunk = []
        tokens_count = 0
        
        for sentence in sentences:
            tokens = len(sentence.split())
            if tokens_count + tokens > max_tokens:
                chunks.append(" ".join(chunk))
                chunk = []
                tokens_count = 0
            chunk.append(sentence)
            tokens_count += tokens
        
        if chunk:
            chunks.append(" ".join(chunk))
        
        all_chunks[filename] = chunks
    return all_chunks

# Chunk the text for all documents
all_chunks = chunk_text_for_all_docs(documents)

# Example: Print the number of chunks for one document
for filename, chunks in all_chunks.items():
    print(f"{filename} has {len(chunks)} chunks.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ishtiyak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


AI-Based_Personalized_E-Learning_Systems_Issues_Challenges_and_Solutions.pdf has 16 chunks.
coursera report.pdf has 3 chunks.
coursera.pdf has 11 chunks.
e-learning-in-theory-practice-and-research.pdf has 12 chunks.
education-13-01216-v2.pdf has 8 chunks.
ELearning-and-a-Case-Study-of-Coursera-and-edX-Online-Platforms.pdf has 21 chunks.
E_learning_Concept_Trends.pdf has 9 chunks.
HOW E-LEARNING PROGRAMS CAN BE MORE.pdf has 16 chunks.
Paper_54-A_Systematic_Literature_Review_on_AI_Algorithms_and_Techniques.pdf has 20 chunks.
The Use of AI in ELearning Recommender Systems.pdf has 8 chunks.
The_Comparison_of_MOOC_Massive_Open_Online_Course_Platforms_of_edX_and_Coursera_Study_Case_Student_of_Programming_Courses.pdf has 8 chunks.


In [10]:
from sentence_transformers import SentenceTransformer

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all documents
def generate_embeddings_for_all_docs(all_chunks):
    all_embeddings = {}
    
    for filename, chunks in all_chunks.items():
        print(f"Generating embeddings for {filename}")
        embeddings = model.encode(chunks, convert_to_tensor=True)
        all_embeddings[filename] = embeddings
    return all_embeddings

# Generate embeddings
all_embeddings = generate_embeddings_for_all_docs(all_chunks)


Generating embeddings for AI-Based_Personalized_E-Learning_Systems_Issues_Challenges_and_Solutions.pdf
Generating embeddings for coursera report.pdf
Generating embeddings for coursera.pdf
Generating embeddings for e-learning-in-theory-practice-and-research.pdf
Generating embeddings for education-13-01216-v2.pdf
Generating embeddings for ELearning-and-a-Case-Study-of-Coursera-and-edX-Online-Platforms.pdf
Generating embeddings for E_learning_Concept_Trends.pdf
Generating embeddings for HOW E-LEARNING PROGRAMS CAN BE MORE.pdf
Generating embeddings for Paper_54-A_Systematic_Literature_Review_on_AI_Algorithms_and_Techniques.pdf
Generating embeddings for The Use of AI in ELearning Recommender Systems.pdf
Generating embeddings for The_Comparison_of_MOOC_Massive_Open_Online_Course_Platforms_of_edX_and_Coursera_Study_Case_Student_of_Programming_Courses.pdf


In [11]:
!pip install faiss-cpu




In [12]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to retrieve the most relevant chunks using FAISS
def query_faiss(query, top_k=3):
    # Generate embedding for the query using SentenceTransformer
    query_embedding = model.encode([query])
    
    # Perform the FAISS search
    D, I = index.search(np.array(query_embedding), top_k)  # Ensure the embedding is passed as a numpy array
    
    # Retrieve the matching text chunks
    retrieved_chunks = [all_chunks[embedding_ids[i].split('-chunk-')[0]][int(embedding_ids[i].split('-chunk-')[1])] for i in I[0]]
    
    return retrieved_chunks

# Function to generate response with context
import openai

openai.api_key = "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"

def generate_response_with_context(query, retrieved_chunks):
    prompt = f"User query: {query}\n\nRelevant information from documents:\n{retrieved_chunks}\n\nAnswer:"
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers based on the provided information."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200  # Adjust the max_tokens if needed
    )
    
    return response['choices'][0]['message']['content'].strip()

# Main function to interact with the chatbot
def rag_chatbot():
    # Take query as input from the user
    user_query = input("Please enter your query: ")
    
    # Retrieve the most relevant chunks using FAISS
    retrieved_chunks = query_faiss(user_query)
    
    # Generate the response using GPT-3.5 Turbo
    response = generate_response_with_context(user_query, retrieved_chunks)
    
    return response

# Example usage
response = rag_chatbot()
print(response)


Please enter your query:what is Ai


NameError: name 'index' is not defined

In [43]:
!pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     -------------------------------------- 981.5/981.5 kB 2.8 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=d8c13c1406697fab38594f8d3d628cc47baf998188f42c2ad51b724c7a8a3238
  Stored in directory: c:\users\ishtiyak\appdata\local\pip\cache\wheels\d1\c1\d9\7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [13]:
def translate_text_to_italian(text):
    # Detect the language of the input text
    detected_language = detect(text)
    
    # If the text is already in Italian, return it
    if detected_language == "it":
        return text
    
    # Otherwise, translate to Italian
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that translates text into Italian."},
            {"role": "user", "content": f"Translate this text to Italian: {text}"}
        ],
        max_tokens=100
    )
    
    return response['choices'][0]['message']['content'].strip()

# Example usage
user_query = input("Please enter your query: ")
translated_query = translate_text_to_italian(response)
print(translated_query)

Please enter your query:what is AI


NameError: name 'response' is not defined

In [15]:
import os
import pdfplumber
from sentence_transformers import SentenceTransformer
import openai
import numpy as np
from langdetect import detect
from nltk.tokenize import sent_tokenize
import faiss

# Your OpenAI API Key
openai.api_key = "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"

# Initialize the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from PDFs
def extract_text_from_all_pdfs(folder_path):
    documents = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            with pdfplumber.open(file_path) as pdf:
                full_text = ""
                for page in pdf.pages:
                    full_text += page.extract_text()
                documents[filename] = full_text
    return documents

# Function to chunk text
def chunk_text_for_all_docs(documents, max_tokens=500):
    all_chunks = {}
    for filename, text in documents.items():
        sentences = sent_tokenize(text)
        chunks = []
        chunk = []
        tokens_count = 0
        for sentence in sentences:
            tokens = len(sentence.split())
            if tokens_count + tokens > max_tokens:
                chunks.append(" ".join(chunk))
                chunk = []
                tokens_count = 0
            chunk.append(sentence)
            tokens_count += tokens
        if chunk:
            chunks.append(" ".join(chunk))
        all_chunks[filename] = chunks
    return all_chunks

# Function to generate embeddings
def generate_embeddings_for_all_docs(all_chunks):
    all_embeddings = {}
    embedding_ids = []
    chunk_count = 0
    for filename, chunks in all_chunks.items():
        embeddings = model.encode(chunks, convert_to_tensor=False)
        all_embeddings[filename] = embeddings
        for i, chunk in enumerate(chunks):
            embedding_ids.append(f"{filename}-chunk-{i}")
            chunk_count += 1
    return all_embeddings, embedding_ids

# Function to create FAISS index
def create_faiss_index(embeddings):
    dimension = embeddings[next(iter(embeddings))][0].shape[0]  # Embedding size
    index = faiss.IndexFlatL2(dimension)  # L2 distance index
    all_embedding_list = []
    for embedding_list in embeddings.values():
        all_embedding_list.extend(embedding_list)
    index.add(np.array(all_embedding_list))
    return index

# Function to perform FAISS query
def query_faiss(query, all_chunks, index, embedding_ids, top_k=3):
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding), top_k)
    retrieved_chunks = [all_chunks[embedding_ids[i].split('-chunk-')[0]][int(embedding_ids[i].split('-chunk-')[-1])] for i in I[0]]
    return retrieved_chunks

# Function to generate response with GPT-3.5
def generate_response_with_context(query, retrieved_chunks):
    prompt = f"User query: {query}\n\nRelevant information from documents:\n{retrieved_chunks}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200
    )
    return response['choices'][0]['message']['content'].strip()

# Function to translate the text
def translate_text(text, target_language):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"Translate this text to {target_language}."},
            {"role": "user", "content": text}
        ],
        max_tokens=100
    )
    return response['choices'][0]['message']['content'].strip()

# Main chatbot function
def rag_chatbot(all_chunks, index, embedding_ids):
    user_query = input("Please enter your query: ")
    
    # Retrieve relevant chunks
    retrieved_chunks = query_faiss(user_query, all_chunks, index, embedding_ids)
    
    # Generate response
    response = generate_response_with_context(user_query, retrieved_chunks)
    print(f"Response in English: {response}")
    
    # Ask user for language preference
    translate_option = input("Do you want to translate the response? (yes/no): ")
    if translate_option.lower() == 'yes':
        target_language = input("Enter target language (e.g., 'French', 'Spanish', 'German'): ").lower()
        translated_response = translate_text(response, target_language)
        print(f"Translated Response in {target_language}: {translated_response}")

# Example usage
folder_path = 'C:/Users/Ishtiyak/Desktop/chatbot/documents'  # Update this with your actual folder path
documents = extract_text_from_all_pdfs(folder_path)
all_chunks = chunk_text_for_all_docs(documents)
all_embeddings, embedding_ids = generate_embeddings_for_all_docs(all_chunks)
index = create_faiss_index(all_embeddings)

# Run the chatbot
rag_chatbot(all_chunks, index, embedding_ids)


Please enter your query:what is Ai


AuthenticationError: Incorrect API key provided: sk-proj-********************************************************************************************************************************************************d1sA. You can find your API key at https://platform.openai.com/account/api-keys.