<a href="https://colab.research.google.com/github/Lalitha-DS/Capstone/blob/main/Streamlit_with_Retrieval_Augmented_Generation_(RAG)_Using_Project_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas numpy faiss-cpu sentence-transformers streamlit openai langchain chromadb




In [4]:
import zipfile

# Path to the zip file
zip_path = "/content/archive (9).zip"

# Extract to a specific folder (optional)
extract_path = "/content/sample_data/amazon_reviews"

# Unzip the file
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset extracted successfully!")


Dataset extracted successfully!


In [5]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/sample_data/amazon_reviews/Reviews.csv")

# Display the first few rows
print(df.head())


   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [6]:
import re

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
        return text.strip()
    return ""

# Keep only relevant columns
df = df[["ProductId", "Summary", "Text"]].dropna()

# Apply preprocessing
df["Processed_Summary"] = df["Summary"].apply(preprocess_text)
df["Processed_Text"] = df["Text"].apply(preprocess_text)

# Display cleaned data
print(df.head())


    ProductId                Summary  \
0  B001E4KFG0  Good Quality Dog Food   
1  B00813GRG4      Not as Advertised   
2  B000LQOCH0  "Delight" says it all   
3  B000UA0QIQ         Cough Medicine   
4  B006K2ZZ7K            Great taffy   

                                                Text      Processed_Summary  \
0  I have bought several of the Vitality canned d...  good quality dog food   
1  Product arrived labeled as Jumbo Salted Peanut...      not as advertised   
2  This is a confection that has been around a fe...    delight says it all   
3  If you are looking for the secret ingredient i...         cough medicine   
4  Great taffy at a great price.  There was a wid...            great taffy   

                                      Processed_Text  
0  i have bought several of the vitality canned d...  
1  product arrived labeled as jumbo salted peanut...  
2  this is a confection that has been around a fe...  
3  if you are looking for the secret ingredient i...  
4  great 

In [7]:
from sentence_transformers import SentenceTransformer

# Load pre-trained Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("Model loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully!


In [8]:
import numpy as np
from tqdm import tqdm

# Function to encode in batches
def batch_encode(texts, batch_size=1000):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        batch_embeddings = model.encode(batch, convert_to_numpy=True)
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

# Encode reviews in batches
embeddings = batch_encode(df["Text"].tolist(), batch_size=500)

# Save embeddings for reuse
np.save("review_embeddings.npy", embeddings)


100%|██████████| 1137/1137 [11:50<00:00,  1.60it/s]


In [9]:
import faiss

# Define vector dimension (should match model output)
embedding_dim = embeddings.shape[1]

# Create a FAISS index
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance (Euclidean)
index.add(embeddings)  # Add embeddings to the index

print(f"FAISS index created with {index.ntotal} entries!")


FAISS index created with 568427 entries!


In [10]:
def retrieve_similar_reviews(query, top_k=5):
    # Convert query to an embedding
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Fetch corresponding reviews
    results = df.iloc[indices[0]]

    return results[["ProductId", "Summary", "Text"]]

# Example query test
query = "delicious food"
results = retrieve_similar_reviews(query)

print("Top Matches:")
print(results)



Top Matches:
         ProductId                   Summary  \
119735  B0006GSXS4        Excellent Delicacy   
481233  B0047CKHO6  Good Food = Healthy Baby   
299051  B000JZYM8C        These are AWESOME.   
299156  B000JZYM8W        These are AWESOME.   
376521  B0018KNGDY        These are AWESOME.   

                                                     Text  
119735  Delicious without exception.  These taste good...  
481233  Excellent food for an excellent price....my so...  
299051  Insanely delicious.  Texture and taste are phe...  
299156  Insanely delicious.  Texture and taste are phe...  
376521  Insanely delicious.  Texture and taste are phe...  


In [11]:
print(retrieve_similar_reviews("Amazing coffee flavor"))
print(retrieve_similar_reviews("Terrible packaging and damaged product"))


         ProductId                Summary  \
24994   B0078Y6OX8  Great aroma and taste   
197884  B0078Y6CN0  Great aroma and taste   
297031  B00395DVQS  Great aroma and taste   
57316   B00817GPWQ  Great aroma and taste   
201134  B00395570G  Great aroma and taste   

                                                     Text  
24994   Great flavor, and the aroma is fantastic. If y...  
197884  Great flavor, and the aroma is fantastic. If y...  
297031  Great flavor, and the aroma is fantastic. If y...  
57316   Great flavor, and the aroma is fantastic. If y...  
201134  Great flavor, and the aroma is fantastic. If y...  
         ProductId           Summary  \
159056  B000ETTFOY    poor packaging   
95860   B001E4S86E  packaging sucked   
291561  B0016512RI   Factory Rejects   
468082  B001TXRT0Q           damaged   
379340  B000EGZ98S          couscous   

                                                     Text  
159056  Amazon's packaging really bad, Frst time I ord...  
95860   

In [12]:
!pip install streamlit




In [14]:
%%writefile app.py
import torch
import streamlit as st
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pandas as pd

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-processed dataset and FAISS index
df = pd.read_csv("/content/sample_data/amazon_reviews/Reviews.csv")  # Replace with your file path
df = df[["ProductId", "Summary", "Text"]].dropna()
embeddings = np.load("review_embeddings.npy")  # Pre-generated embeddings
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Function to retrieve similar reviews
def retrieve_similar_reviews(query, top_k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    results = df.iloc[indices[0]]
    return results[["ProductId", "Summary", "Text"]]

# Streamlit Interface

st.title("Amazon Fine Food Reviews Search")

# User Query Input
query = st.text_input("Enter your query:", "")
if query:
    # Display the search button
    if st.button("Search"):
        st.write("Searching for related reviews...")

        # Perform search and retrieve results
        results = retrieve_similar_reviews(query, top_k=5)

        # Display the results
        if not results.empty:
            st.subheader("Top Matches:")
            for idx, row in results.iterrows():
                st.write(f"**Product ID:** {row['ProductId']}")
                st.write(f"**Summary:** {row['Summary']}")
                st.write(f"**Review:** {row['Text']}")
                st.write("---")
        else:
            st.write("No results found.")

# Optional Interactive Filters

st.sidebar.header("Filter Options")

# Dropdown for categories (optional)
category = st.sidebar.selectbox("Select Product Category:", options=["All", "Food", "Drinks", "Others"])

# Checkboxes for customizing search parameters (e.g., min score)
min_score = st.sidebar.checkbox("Filter by Minimum Score?", value=False)
if min_score:
    score_filter = st.sidebar.slider("Select Minimum Rating:", min_value=1, max_value=5, value=4)
else:
    score_filter = 1

# Filter data based on user input (Optional)
if category != "All":
    filtered_df = df[df["Summary"].str.contains(category, case=False)]
else:
    filtered_df = df

if min_score:
    filtered_df = filtered_df[filtered_df["Score"] >= score_filter]

# Explore related content button
if st.button("Explore Related Content"):
    related_content = retrieve_similar_reviews(query, top_k=10)
    st.write("Exploring related reviews...")
    for idx, row in related_content.iterrows():
        st.write(f"**Product ID:** {row['ProductId']}")
        st.write(f"**Summary:** {row['Summary']}")
        st.write(f"**Review:** {row['Text']}")
        st.write("---")

# Display the layout neatly
st.sidebar.write("Adjust search filters and explore reviews based on your preferences.")



Overwriting app.py


In [20]:
!python -c "import torch; print(torch.__version__, torch.cuda.is_available())"


2.5.1+cu124 True


In [24]:
!nvcc --version



nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [1]:
!pip uninstall torch streamlit -y
!pip install torch torchvision torchaudio streamlit


Found existing installation: torch 2.5.1
Uninstalling torch-2.5.1:
  Successfully uninstalled torch-2.5.1
Found existing installation: streamlit 1.42.0
Uninstalling streamlit-1.42.0:
  Successfully uninstalled streamlit-1.42.0
Collecting torch
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting streamlit
  Using cached streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting nvidia-cusparselt-cu12==0.6.2 (from torch)
  Using cached nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting triton==3.2.0 (from torch)
  Using cached triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting torch
  Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Using cached torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl (906.5 MB)
Using cached streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
Installing collected packages: torch, streamlit
Successfully install

In [2]:
!streamlit run app.py --server.headless true



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.11.29:8501[0m
[0m
[34m  Stopping...[0m
^C


In [16]:
!pip install pyngrok
from pyngrok import ngrok
# Set up ngrok
!ngrok authtoken 2sjdzhgNpvO7N6Ob9gvXgBRZNXc_66mhgbzjCVNAcSxDmtWZY  # Replace with your actual ngrok token

public_url = ngrok.connect(8501)
print(f"Public URL: {public_url}")


Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Downloading pyngrok-7.2.3-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.3
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Public URL: NgrokTunnel: "https://f331-34-125-11-29.ngrok-free.app" -> "http://localhost:8501"
