In [1]:
import os
import pdfplumber
from sentence_transformers import SentenceTransformer
import openai
import numpy as np
from langdetect import detect
from nltk.tokenize import sent_tokenize
import faiss

# OpenAI API key: read from the environment instead of embedding the secret in
# the source, where it leaks through version control and shared notebooks.
# Any key that was ever stored in this file should be revoked and replaced.
# If OPENAI_API_KEY is unset this is None and the first API call will fail
# with an authentication error.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Initialize the embedding model (shared by indexing and querying so that
# document and query vectors live in the same embedding space).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to extract text from PDFs
def extract_text_from_all_pdfs(folder_path):
    """Extract the text of every PDF found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive). Files are selected by
            a case-insensitive ``.pdf`` extension.

    Returns:
        dict mapping each PDF filename to the text of all its pages, with
        pages separated by a newline. Files are processed in sorted order so
        the resulting dict order is reproducible.
    """
    documents = {}
    # sorted(): os.listdir order is arbitrary, and downstream index positions
    # depend on document order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        with pdfplumber.open(file_path) as pdf:
            # `or ""` guards against pages with no extractable text, for which
            # extract_text() may return None (which would break concatenation).
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next.
        documents[filename] = "\n".join(page_texts)
    return documents

# Function to chunk text
def chunk_text_for_all_docs(documents, max_tokens=500):
    """Split each document into chunks of whole sentences.

    Sentences are packed greedily: a chunk is closed as soon as adding the
    next sentence would push it past ``max_tokens``. "Tokens" here are
    whitespace-separated words, not model tokens.

    Args:
        documents: dict mapping filename -> full document text.
        max_tokens: Soft upper bound on words per chunk. A single sentence
            longer than this is kept intact as its own oversized chunk.

    Returns:
        dict mapping filename -> list of chunk strings (sentences joined by a
        single space). Never contains empty-string chunks.
    """
    all_chunks = {}
    for filename, text in documents.items():
        chunks = []
        current = []
        current_size = 0
        for sentence in sent_tokenize(text):
            size = len(sentence.split())
            # Only flush a non-empty chunk: without the `current` check, a
            # sentence that alone exceeds max_tokens would emit an empty
            # chunk ahead of it.
            if current and current_size + size > max_tokens:
                chunks.append(" ".join(current))
                current = []
                current_size = 0
            current.append(sentence)
            current_size += size
        if current:
            chunks.append(" ".join(current))
        all_chunks[filename] = chunks
    return all_chunks

# Function to generate embeddings
def generate_embeddings_for_all_docs(all_chunks):
    """Embed every chunk of every document with the module-level ``model``.

    Args:
        all_chunks: dict mapping filename -> list of chunk strings.

    Returns:
        Tuple ``(all_embeddings, embedding_ids)`` where ``all_embeddings``
        maps filename -> the result of ``model.encode`` for that file's
        chunks, and ``embedding_ids`` is a flat list of
        ``"<filename>-chunk-<i>"`` strings in the same document/chunk order,
        so position ``n`` in the list names the ``n``-th embedding overall.
    """
    all_embeddings = {}
    embedding_ids = []
    for filename, chunks in all_chunks.items():
        all_embeddings[filename] = model.encode(chunks, convert_to_tensor=False)
        embedding_ids.extend(
            f"{filename}-chunk-{i}" for i in range(len(chunks))
        )
    return all_embeddings, embedding_ids

# Function to create FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over all document embeddings.

    Args:
        embeddings: dict mapping filename -> sequence of embedding vectors.
            Vectors are added in dict order, then chunk order, which must
            match the order of the ids used to look results up.

    Returns:
        A ``faiss.IndexFlatL2`` containing every vector.

    Raises:
        ValueError: If there is not a single embedding to index (empty dict,
            or every document produced zero chunks).
    """
    all_embedding_list = []
    for embedding_list in embeddings.values():
        all_embedding_list.extend(embedding_list)
    if not all_embedding_list:
        raise ValueError("No embeddings to index: no document produced any chunks.")

    # FAISS expects a 2-D float32 matrix; force the dtype rather than relying
    # on whatever the encoder happened to return.
    matrix = np.array(all_embedding_list, dtype="float32")
    # Take the dimension from the stacked matrix so a first document with no
    # chunks cannot break the lookup.
    index = faiss.IndexFlatL2(matrix.shape[1])  # L2 distance index
    index.add(matrix)
    return index

# Function to perform FAISS query
def query_faiss(query, all_chunks, index, embedding_ids, top_k=3):
    """Return the text chunks whose embeddings are nearest to ``query``.

    Args:
        query: Query string; embedded with the module-level ``model``.
        all_chunks: dict mapping filename -> list of chunk strings.
        index: FAISS index to search.
        embedding_ids: Flat list of ``"<filename>-chunk-<i>"`` ids, where
            position ``n`` names the ``n``-th vector in ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        List of up to ``top_k`` chunk strings, nearest first. Fewer are
        returned when the index holds fewer than ``top_k`` vectors.
    """
    query_embedding = np.array(model.encode([query]), dtype="float32")
    _distances, neighbours = index.search(query_embedding, top_k)

    retrieved_chunks = []
    for position in neighbours[0]:
        # FAISS pads the result with -1 when fewer than top_k neighbours
        # exist; indexing with -1 would silently return the last chunk.
        if position < 0:
            continue
        # rpartition splits on the LAST '-chunk-', so filenames that contain
        # '-chunk-' themselves are still parsed correctly.
        filename, _, chunk_number = embedding_ids[position].rpartition('-chunk-')
        retrieved_chunks.append(all_chunks[filename][int(chunk_number)])
    return retrieved_chunks

# Function to generate response with GPT-3.5
def generate_response_with_context(query, retrieved_chunks, max_tokens=200):
    """Ask gpt-3.5-turbo to answer ``query`` using the retrieved chunks.

    Args:
        query: The user's question.
        retrieved_chunks: List of context strings to include in the prompt.
        max_tokens: Cap on the length of the generated answer. Answers that
            hit the cap are cut off mid-sentence, so raise it for longer
            replies.

    Returns:
        The assistant's reply text, stripped of surrounding whitespace.
    """
    # Join the chunks as plain text; interpolating the list directly would
    # send its Python repr (brackets, quotes, escaped newlines) to the model.
    context = "\n\n".join(retrieved_chunks)
    prompt = f"User query: {query}\n\nRelevant information from documents:\n{context}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )
    return response['choices'][0]['message']['content'].strip()

# Function to translate the text
def translate_text(text, target_language, max_tokens=400):
    """Translate ``text`` into ``target_language`` using gpt-3.5-turbo.

    Args:
        text: Text to translate.
        target_language: Name of the language to translate into; it is
            inserted verbatim into the system instruction.
        max_tokens: Cap on the length of the translation. The default is
            well above the 200-token cap used for generated answers, so a
            full answer can be translated without being cut off (the
            previous fixed cap of 100 was smaller than the text it had to
            translate).

    Returns:
        The translated text, stripped of surrounding whitespace.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"Translate this text to {target_language}."},
            {"role": "user", "content": text}
        ],
        max_tokens=max_tokens
    )
    return response['choices'][0]['message']['content'].strip()

# Main chatbot function
def rag_chatbot(all_chunks, index, embedding_ids):
    """Run one interactive question/answer round on the console.

    Prompts for a query, retrieves the closest chunks from ``index``, prints
    the generated answer, and then offers to translate that answer into a
    language chosen by the user.

    Args:
        all_chunks: dict mapping filename -> list of chunk strings.
        index: FAISS index over the chunk embeddings.
        embedding_ids: Flat list of chunk ids aligned with ``index``.
    """
    question = input("Please enter your query: ")

    # Retrieval followed by generation
    context = query_faiss(question, all_chunks, index, embedding_ids)
    answer = generate_response_with_context(question, context)
    print(f"Response in English: {answer}")

    # Translation is opt-in: anything other than "yes" ends the round
    choice = input("Do you want to translate the response? (yes/no): ")
    if choice.lower() != 'yes':
        return

    language = input("Enter target language (e.g., 'French', 'Spanish', 'German'): ").lower()
    translation = translate_text(answer, language)
    print(f"Translated Response in {language}: {translation}")

# Example usage: build the whole retrieval pipeline once, then run one chat round.
# NOTE: these statements execute as soon as the cell/module runs, and the final
# call blocks waiting on input().
folder_path = 'C:/Users/Ishtiyak/Desktop/chatbot/documents'  # Update this with your actual folder path
# filename -> full text of each PDF in the folder
documents = extract_text_from_all_pdfs(folder_path)
# filename -> list of sentence-aligned text chunks
all_chunks = chunk_text_for_all_docs(documents)
# per-file embeddings plus the flat "<filename>-chunk-<i>" id list
all_embeddings, embedding_ids = generate_embeddings_for_all_docs(all_chunks)
# L2 index over every chunk embedding
index = create_faiss_index(all_embeddings)

# Run the chatbot (single query, optional translation)
rag_chatbot(all_chunks, index, embedding_ids)


  from tqdm.autonotebook import tqdm, trange







Please enter your query: give me coursera overview
Response in English: Coursera is a prominent online learning platform founded in 2012 by Stanford professors Andrew Ng and Daphne Koller. It collaborates with universities and organizations globally to provide courses, certifications, and degrees in various fields. Coursera aims to offer accessible, high-quality education to learners worldwide for personal and professional development.

Here are some key statistics:
- Total Users: 100+ million learners
- Course Offerings: 7,000+ courses
- Partner Institutions: 275+ leading universities and companies
- Specializations: 600+ programs for in-depth learning
- Professional Certificates: 100+ offerings tailored to job-ready skills
- Degrees Offered: Fully accredited Bachelor's and Master's degrees

Coursera offers individual courses, specializations, professional certificates, MasterTrack™ certificates contributing to Master's degrees, online degrees, and guided projects. The platform featur

In [2]:
def summarize_text(text, max_tokens=100):
    """Summarize ``text`` with gpt-3.5-turbo.

    Uses the chat endpoint, like the other OpenAI helpers in this file:
    gpt-3.5-turbo is a chat model and is rejected by the legacy
    ``Completion`` endpoint.

    Args:
        text: Text to summarize.
        max_tokens: Cap on the length of the summary.

    Returns:
        The summary text, stripped of surrounding whitespace.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": f"Summarize this: {text}"}
        ],
        max_tokens=max_tokens
    )
    # Chat responses carry the text under message.content, not under "text".
    return response['choices'][0]['message']['content'].strip()
