In [14]:
import os
import streamlit as st
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import openai
from langdetect import detect
from nltk.tokenize import sent_tokenize

# Initialize OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable. The previous code
# passed a literal secret key to os.getenv() as the variable *name*, which both
# exposed the credential in the notebook and made the lookup return None.
# NOTE(review): the key that was pasted here should be revoked and rotated.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Streamlit UI for the chatbot
st.set_page_config(page_title="Document-Based Chatbot", layout="wide")

st.title("Document-Based Question Answering Chatbot")
st.markdown("Upload your documents and ask questions!")

# Surface a missing key up front instead of failing on the first query.
if not openai.api_key:
    st.warning("The OPENAI_API_KEY environment variable is not set; answers and translations will fail until it is configured.")

# Sidebar for translation option
with st.sidebar:
    st.header("Translation Options")
    translate_option = st.radio("Would you like to translate responses?", ('No', 'Yes'))
    target_language = st.selectbox("Select target language", ['French', 'Spanish', 'German']) if translate_option == 'Yes' else None

# Initialize session state for the chat history
if 'chat_history' not in st.session_state:
    st.session_state['chat_history'] = []

# File Uploader
uploaded_files = st.file_uploader("Upload PDF documents", type=["pdf"], accept_multiple_files=True)

# Function to extract text from PDFs
# NOTE(review): st.cache is deprecated in recent Streamlit releases in favour of
# st.cache_data / st.cache_resource — confirm the installed version before switching.
@st.cache
def extract_text_from_pdfs(files):
    """Extract the text of every uploaded PDF.

    Args:
        files: Iterable of file-like objects that ``pdfplumber.open`` accepts.
            Each object must expose a ``name`` attribute, which becomes the
            key of the returned dictionary.

    Returns:
        dict mapping ``file.name`` to the text of all pages joined by newlines.
    """
    documents = {}
    for file in files:
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for a page without a text layer
            # (e.g. a scanned image); treat that as an empty page instead of
            # letting string concatenation raise TypeError.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join pages with a newline so the last word of one page is not glued
        # to the first word of the next.
        documents[file.name] = "\n".join(page_texts)
    return documents

# Function to chunk the text into smaller segments
def chunk_text_for_all_docs(documents, max_tokens=500):
    """Split each document into chunks of whole sentences.

    Sentences come from ``sent_tokenize``; a sentence's size is its number of
    whitespace-separated words. Sentences are accumulated until adding the next
    one would push the running word count above ``max_tokens``, at which point
    the accumulated sentences are emitted as one chunk.

    Args:
        documents: dict mapping a document name to its text (``None`` or empty
            text yields an empty chunk list).
        max_tokens: Word budget per chunk. A single sentence longer than the
            budget is kept whole as its own chunk.

    Returns:
        dict mapping each document name to its list of chunk strings.
    """
    all_chunks = {}
    for filename, text in documents.items():
        chunks = []
        current_sentences = []
        current_size = 0
        for sentence in sent_tokenize(text or ""):
            sentence_size = len(sentence.split())
            # Only flush when something has been accumulated; otherwise an
            # over-long first sentence would emit an empty-string chunk.
            if current_sentences and current_size + sentence_size > max_tokens:
                chunks.append(" ".join(current_sentences))
                current_sentences = []
                current_size = 0
            current_sentences.append(sentence)
            current_size += sentence_size
        if current_sentences:
            chunks.append(" ".join(current_sentences))
        all_chunks[filename] = chunks
    return all_chunks

# Function to generate embeddings
def generate_embeddings_for_all_docs(all_chunks):
    """Embed every chunk of every document with the module-level ``model``.

    Args:
        all_chunks: dict mapping a document name to its list of chunk strings.

    Returns:
        Tuple ``(all_embeddings, embedding_ids)`` where ``all_embeddings`` maps
        each document name to the result of ``model.encode`` for its chunks,
        and ``embedding_ids`` is a flat list of ``"<name>-chunk-<i>"`` strings
        in the same document-then-chunk order.
    """
    all_embeddings = {}
    embedding_ids = []
    for filename, chunks in all_chunks.items():
        all_embeddings[filename] = model.encode(chunks, convert_to_tensor=False)
        # One id per chunk; the position in this list is what the FAISS row
        # number is later mapped back through.
        embedding_ids.extend(f"{filename}-chunk-{i}" for i in range(len(chunks)))
    return all_embeddings, embedding_ids

# Function to create FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over all document embeddings.

    Args:
        embeddings: dict mapping a document name to a 2-D array-like of
            embedding vectors (one row per chunk). Rows are added in dict
            iteration order.

    Returns:
        A ``faiss.IndexFlatL2`` containing every embedding row.

    Raises:
        ValueError: If no document contributes any embedding.
    """
    # Skip documents without chunks: they add no rows, and probing the first
    # document for the dimension used to fail with IndexError when it was empty.
    per_document = [
        np.asarray(doc_embeddings, dtype="float32")
        for doc_embeddings in embeddings.values()
        if len(doc_embeddings) > 0
    ]
    if not per_document:
        raise ValueError("Cannot build a FAISS index: no embeddings were provided.")

    # FAISS expects a C-contiguous float32 matrix.
    matrix = np.ascontiguousarray(np.vstack(per_document))
    index = faiss.IndexFlatL2(matrix.shape[1])  # L2 distance index
    index.add(matrix)
    return index

# Function to query FAISS
def query_faiss(query, all_chunks, index, embedding_ids, top_k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: The user's question.
        all_chunks: dict mapping a document name to its list of chunk strings.
        index: FAISS index whose rows line up with ``embedding_ids``.
        embedding_ids: List of ``"<name>-chunk-<i>"`` ids, one per index row.
        top_k: Maximum number of chunks to return.

    Returns:
        List of up to ``top_k`` chunk strings, nearest first.
    """
    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    _, neighbor_rows = index.search(query_embedding, top_k)

    retrieved_chunks = []
    for row in neighbor_rows[0]:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; indexing with -1 would silently return the last chunk.
        if row < 0:
            continue
        # Split on the last separator so filenames that themselves contain
        # "-chunk-" still resolve to the right document.
        filename, chunk_number = embedding_ids[row].rsplit('-chunk-', 1)
        retrieved_chunks.append(all_chunks[filename][int(chunk_number)])
    return retrieved_chunks

# Function to generate a response from GPT-3.5
def generate_response_with_context(query, retrieved_chunks):
    """Ask gpt-3.5-turbo to answer ``query`` using the retrieved chunks.

    Args:
        query: The user's question.
        retrieved_chunks: List of chunk strings (a single string is also
            accepted and used as-is).

    Returns:
        The assistant's reply text with surrounding whitespace stripped.
    """
    # Join the chunks into plain text; interpolating the list directly put its
    # Python repr (brackets, quotes, escaped newlines) into the prompt.
    if isinstance(retrieved_chunks, str):
        context = retrieved_chunks
    else:
        context = "\n\n".join(str(chunk) for chunk in retrieved_chunks)

    prompt = f"User query: {query}\n\nRelevant information from documents:\n{context}"
    # NOTE(review): openai.ChatCompletion is the pre-1.0 openai client interface.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200
    )
    return response['choices'][0]['message']['content'].strip()

# Function to translate the response
def translate_text(text, target_language, max_tokens=400):
    """Translate ``text`` into ``target_language`` with gpt-3.5-turbo.

    Args:
        text: Text to translate.
        target_language: Name of the target language, inserted into the
            system prompt (e.g. ``'French'``).
        max_tokens: Completion token limit. The answers being translated may
            be up to 200 tokens long, so the earlier fixed limit of 100 cut
            translations short.

    Returns:
        The translated text with surrounding whitespace stripped, or an empty
        string when ``text`` is empty or whitespace-only.
    """
    # Nothing to translate; avoid a pointless API call.
    if not text or not text.strip():
        return ""

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"Translate this text to {target_language}."},
            {"role": "user", "content": text}
        ],
        max_tokens=max_tokens
    )
    return response['choices'][0]['message']['content'].strip()

# Main section for query and responses
if uploaded_files:
    with st.spinner('Processing documents...'):
        documents = extract_text_from_pdfs(uploaded_files)
        # Drop documents that produced no chunks (e.g. scanned PDFs without a
        # text layer) so the index is only built from real content.
        all_chunks = {
            name: chunks
            for name, chunks in chunk_text_for_all_docs(documents).items()
            if chunks
        }
        if all_chunks:
            all_embeddings, embedding_ids = generate_embeddings_for_all_docs(all_chunks)
            index = create_faiss_index(all_embeddings)

    if not all_chunks:
        # Without any text there is nothing to index or query; stop this run
        # with a readable message instead of an exception from index creation.
        st.error("No extractable text was found in the uploaded PDF documents.")
        st.stop()

    st.success('Documents processed successfully!')

    # Continuous Chat Interface
    user_query = st.text_input("Enter your query:")

    if st.button("Submit Query") and user_query:
        retrieved_chunks = query_faiss(user_query, all_chunks, index, embedding_ids)
        response = generate_response_with_context(user_query, retrieved_chunks)

        # Store the chat history
        st.session_state.chat_history.append(f"You: {user_query}")
        st.session_state.chat_history.append(f"Chatbot: {response}")

        # Handle translation if enabled; keep it in the history so it is still
        # visible after the next rerun.
        if translate_option == 'Yes' and target_language:
            translated_response = translate_text(response, target_language)
            st.session_state.chat_history.append(f"Translated Response ({target_language}): {translated_response}")

    # Display chat history on every rerun, not only right after a submit;
    # Streamlit re-executes the script on each widget interaction.
    for msg in st.session_state.chat_history:
        st.write(msg)



NameError: name '_C' is not defined

In [8]:
pip install torch torchvision torchaudio


Collecting torchvision
  Downloading torchvision-0.19.1-cp39-cp39-win_amd64.whl (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 11.6 MB/s eta 0:00:00
Collecting torchaudio
  Downloading torchaudio-2.4.1-cp39-cp39-win_amd64.whl (2.4 MB)
     ---------------------------------------- 2.4/2.4 MB 25.5 MB/s eta 0:00:00
Installing collected packages: torchvision, torchaudio
Successfully installed torchaudio-2.4.1 torchvision-0.19.1
Note: you may need to restart the kernel to use updated packages.


In [12]:
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia


^C

Note: you may need to restart the kernel to use updated packages.


In [13]:
import torch

# Environment sanity check: show the installed PyTorch version first, then
# whether PyTorch reports a usable CUDA device.
torch_version = torch.__version__
print(torch_version)
cuda_ready = torch.cuda.is_available()
print(cuda_ready)


NameError: name '_C' is not defined

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Ishtiyak\anaconda3

  added / updated specs:
    - pytorch
    - pytorch-cuda=11.8
    - torchaudio
    - torchvision


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.9.24  |       haa95532_0         131 KB
    certifi-2024.8.30          |   py39haa95532_0         163 KB
    colorama-0.4.6             |   py39haa95532_0          32 KB
    cuda-cccl-12.6.77          |                0          16 KB  nvidia
    cuda-cccl_win


The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/win-64::anaconda-client==1.11.0=py39haa95532_0
  - defaults/win-64::anaconda==custom=py39_2
  - defaults/win-64::anaconda-navigator==2.4.2=py39haa95532_0
  - defaults/win-64::anaconda-project==0.11.1=py39haa95532_0
  - defaults/noarch::argon2-cffi==21.3.0=pyhd3eb1b0_0
  - defaults/noarch::arrow==1.2.2=pyhd3eb1b0_0
  - defaults/win-64::astroid==2.11.7=py39haa95532_0
  - defaults/win-64::astropy==5.1=py39h080aedc_0
  - defaults/noarch::babel==2.9.1=pyhd3eb1b0_0
  - defaults/win-64::bkcharts==0.2=py39haa95532_1
  - defaults/win-64::black==22.6.0=py39haa95532_0
  - defaults/win-64::bokeh==2.4.3=py39haa95532_0
  - defaults/win-64::bottleneck==1.3.5=py39h080aedc_0
  - defaults/win-64::cartopy==0.18.0=py39h80a4efb_1
  - defaults/win-64::click==8.0.4=py39haa95532_0
  - defaults/noarch::click-plugins==1.1.1=pyhd3eb1b0_0
  - defaults/win-64::cligj==0.7.2=p

In [9]:
import os
import streamlit as st
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import openai
from langdetect import detect
from nltk.tokenize import sent_tokenize

# Initialize OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable. The previous code
# passed a literal secret key to os.getenv() as the variable *name*, which both
# exposed the credential in the notebook and made the lookup return None.
# NOTE(review): the key that was pasted here should be revoked and rotated.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Streamlit UI for the chatbot
st.set_page_config(page_title="Document-Based Chatbot", layout="wide")
st.title("Document-Based Question Answering Chatbot")
st.markdown("Ask your question, and the chatbot will retrieve answers from stored documents.")

# Surface a missing key up front instead of failing on the first query.
if not openai.api_key:
    st.warning("The OPENAI_API_KEY environment variable is not set; answers and translations will fail until it is configured.")

# Sidebar for translation option
with st.sidebar:
    st.header("Translation Options")
    translate_option = st.radio("Would you like to translate responses?", ('No', 'Yes'))
    target_language = st.selectbox("Select target language", ['French', 'Spanish', 'German']) if translate_option == 'Yes' else None

# Path to folder where your PDFs are stored.
# The CHATBOT_DOCS_DIR environment variable overrides the machine-specific
# default so the app can run on other machines without editing the code.
folder_path = os.getenv('CHATBOT_DOCS_DIR', 'C:/Users/Ishtiyak/Desktop/chatbot/documents')

# Function to extract text from PDFs in the folder
# NOTE(review): st.cache is deprecated in recent Streamlit releases in favour of
# st.cache_data / st.cache_resource — confirm the installed version before switching.
@st.cache
def extract_text_from_pdfs(folder_path):
    """Extract the text of every PDF file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping each PDF filename to the text of all its pages joined by
        newlines. Files are processed in sorted filename order.
    """
    documents = {}
    # sorted(): os.listdir order is arbitrary, and the document order decides
    # the order of rows in the search index.
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive match so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for a page without a text layer
            # (e.g. a scanned image); treat that as an empty page instead of
            # letting string concatenation raise TypeError.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join pages with a newline so the last word of one page is not glued
        # to the first word of the next.
        documents[filename] = "\n".join(page_texts)
    return documents

# Function to chunk the text into smaller segments
def chunk_text_for_all_docs(documents, max_tokens=500):
    """Split each document into chunks of whole sentences.

    Sentences come from ``sent_tokenize``; a sentence's size is its number of
    whitespace-separated words. Sentences are accumulated until adding the next
    one would push the running word count above ``max_tokens``, at which point
    the accumulated sentences are emitted as one chunk.

    Args:
        documents: dict mapping a document name to its text (``None`` or empty
            text yields an empty chunk list).
        max_tokens: Word budget per chunk. A single sentence longer than the
            budget is kept whole as its own chunk.

    Returns:
        dict mapping each document name to its list of chunk strings.
    """
    all_chunks = {}
    for filename, text in documents.items():
        chunks = []
        current_sentences = []
        current_size = 0
        for sentence in sent_tokenize(text or ""):
            sentence_size = len(sentence.split())
            # Only flush when something has been accumulated; otherwise an
            # over-long first sentence would emit an empty-string chunk.
            if current_sentences and current_size + sentence_size > max_tokens:
                chunks.append(" ".join(current_sentences))
                current_sentences = []
                current_size = 0
            current_sentences.append(sentence)
            current_size += sentence_size
        if current_sentences:
            chunks.append(" ".join(current_sentences))
        all_chunks[filename] = chunks
    return all_chunks

# Function to generate embeddings
def generate_embeddings_for_all_docs(all_chunks):
    """Embed every chunk of every document with the module-level ``model``.

    Args:
        all_chunks: dict mapping a document name to its list of chunk strings.

    Returns:
        Tuple ``(all_embeddings, embedding_ids)`` where ``all_embeddings`` maps
        each document name to the result of ``model.encode`` for its chunks,
        and ``embedding_ids`` is a flat list of ``"<name>-chunk-<i>"`` strings
        in the same document-then-chunk order.
    """
    all_embeddings = {}
    embedding_ids = []
    for filename, chunks in all_chunks.items():
        all_embeddings[filename] = model.encode(chunks, convert_to_tensor=False)
        # One id per chunk; the position in this list is what the FAISS row
        # number is later mapped back through.
        embedding_ids.extend(f"{filename}-chunk-{i}" for i in range(len(chunks)))
    return all_embeddings, embedding_ids

# Function to create FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over all document embeddings.

    Args:
        embeddings: dict mapping a document name to a 2-D array-like of
            embedding vectors (one row per chunk). Rows are added in dict
            iteration order.

    Returns:
        A ``faiss.IndexFlatL2`` containing every embedding row.

    Raises:
        ValueError: If no document contributes any embedding.
    """
    # Skip documents without chunks: they add no rows, and probing the first
    # document for the dimension used to fail with IndexError when it was empty.
    per_document = [
        np.asarray(doc_embeddings, dtype="float32")
        for doc_embeddings in embeddings.values()
        if len(doc_embeddings) > 0
    ]
    if not per_document:
        raise ValueError("Cannot build a FAISS index: no embeddings were provided.")

    # FAISS expects a C-contiguous float32 matrix.
    matrix = np.ascontiguousarray(np.vstack(per_document))
    index = faiss.IndexFlatL2(matrix.shape[1])  # L2 distance index
    index.add(matrix)
    return index

# Function to query FAISS
def query_faiss(query, all_chunks, index, embedding_ids, top_k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: The user's question.
        all_chunks: dict mapping a document name to its list of chunk strings.
        index: FAISS index whose rows line up with ``embedding_ids``.
        embedding_ids: List of ``"<name>-chunk-<i>"`` ids, one per index row.
        top_k: Maximum number of chunks to return.

    Returns:
        List of up to ``top_k`` chunk strings, nearest first.
    """
    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    _, neighbor_rows = index.search(query_embedding, top_k)

    retrieved_chunks = []
    for row in neighbor_rows[0]:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; indexing with -1 would silently return the last chunk.
        if row < 0:
            continue
        # Split on the last separator so filenames that themselves contain
        # "-chunk-" still resolve to the right document.
        filename, chunk_number = embedding_ids[row].rsplit('-chunk-', 1)
        retrieved_chunks.append(all_chunks[filename][int(chunk_number)])
    return retrieved_chunks

# Function to generate a response from GPT-3.5
def generate_response_with_context(query, retrieved_chunks):
    """Ask gpt-3.5-turbo to answer ``query`` using the retrieved chunks.

    Args:
        query: The user's question.
        retrieved_chunks: List of chunk strings (a single string is also
            accepted and used as-is).

    Returns:
        The assistant's reply text with surrounding whitespace stripped.
    """
    # Join the chunks into plain text; interpolating the list directly put its
    # Python repr (brackets, quotes, escaped newlines) into the prompt.
    if isinstance(retrieved_chunks, str):
        context = retrieved_chunks
    else:
        context = "\n\n".join(str(chunk) for chunk in retrieved_chunks)

    prompt = f"User query: {query}\n\nRelevant information from documents:\n{context}"
    # NOTE(review): openai.ChatCompletion is the pre-1.0 openai client interface.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200
    )
    return response['choices'][0]['message']['content'].strip()

# Function to translate the response
def translate_text(text, target_language, max_tokens=400):
    """Translate ``text`` into ``target_language`` with gpt-3.5-turbo.

    Args:
        text: Text to translate.
        target_language: Name of the target language, inserted into the
            system prompt (e.g. ``'French'``).
        max_tokens: Completion token limit. The answers being translated may
            be up to 200 tokens long, so the earlier fixed limit of 100 cut
            translations short.

    Returns:
        The translated text with surrounding whitespace stripped, or an empty
        string when ``text`` is empty or whitespace-only.
    """
    # Nothing to translate; avoid a pointless API call.
    if not text or not text.strip():
        return ""

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": f"Translate this text to {target_language}."},
            {"role": "user", "content": text}
        ],
        max_tokens=max_tokens
    )
    return response['choices'][0]['message']['content'].strip()

# Main section to process PDFs and answer questions
if not os.path.isdir(folder_path):
    # os.listdir would raise on a missing folder; show a readable message and
    # end this script run instead.
    st.error(f"Document folder not found: {folder_path}")
    st.stop()

with st.spinner('Processing documents...'):
    documents = extract_text_from_pdfs(folder_path)
    # Drop documents that produced no chunks (e.g. scanned PDFs without a text
    # layer) so the index is only built from real content.
    all_chunks = {
        name: chunks
        for name, chunks in chunk_text_for_all_docs(documents).items()
        if chunks
    }
    if all_chunks:
        all_embeddings, embedding_ids = generate_embeddings_for_all_docs(all_chunks)
        index = create_faiss_index(all_embeddings)

if not all_chunks:
    # Without any text there is nothing to index or query.
    st.error(f"No extractable text was found in PDF files under: {folder_path}")
    st.stop()

st.success('Documents processed successfully!')

# Continuous Chat Interface
user_query = st.text_input("Enter your query:")

if st.button("Submit Query") and user_query:
    retrieved_chunks = query_faiss(user_query, all_chunks, index, embedding_ids)
    response = generate_response_with_context(user_query, retrieved_chunks)

    st.write("Response:")
    st.write(response)

    # Handle translation if enabled
    if translate_option == 'Yes' and target_language:
        translated_response = translate_text(response, target_language)
        st.write(f"Translated Response ({target_language}):")
        st.write(translated_response)



NameError: name '_C' is not defined

In [None]:
!streamlit run app.py

