### Import and paths

In [50]:
#required libraries imported from the same carpet
import sys, os, numpy as np
import google.generativeai as genai
import pandas as pd
from src.dataio import load_reviews, build_corpus
from src.retriever import Retriever
from src.qa import build_answer
from src.aspects import tag_aspects, aggregate_aspects
from src.embed_index import build_embeddings, load_embedding_model, VectorIndex

In [29]:
#add the path to the file
csv_path="data/1429_1.csv"  

#------------------------------or use this ---------------------
# import os
# # path (full dataset  that user must place)

# PATH = "data/1429_1.csv"        # <- User use this by default

# if os.path.exists(PATH):
#     CSV_PATH = PATH
#     print(f"Using full dataset : {PATH}")
# else:
#     raise FileNotFoundError(
#         "Didn't find 'data/1429_1.csv' "
#         "Place your CSV on data/ and named '1429_1.csv', or add a sample."
#     )

### Load and build corpus

In [30]:
df=load_reviews(csv_path)
print(df.columns.tolist())
df.head(2)

['id', 'product', 'asins', 'brand', 'categories', 'keys', 'manufacturer', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen', 'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id', 'reviews.numHelpful', 'rating', 'reviews.sourceURLs', 'text', 'reviews.title', 'reviews.userCity', 'reviews.userProvince', 'reviews.username']


  df=pd.read_csv(csv_path)


Unnamed: 0,id,product,asins,brand,categories,keys,manufacturer,reviews.date,reviews.dateAdded,reviews.dateSeen,...,reviews.doRecommend,reviews.id,reviews.numHelpful,rating,reviews.sourceURLs,text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username
0,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,This product so far has not disappointed. My c...,Kindle,,,Adapter
1,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,great for beginner or experienced person. Boug...,very fast,,,truman


In [62]:
corpus=build_corpus(df, max_chars=500)
print(f"len of corpus: {len(corpus)}, shape: {corpus.shape}")
corpus.head(5)


len of corpus: 29107, shape: (29107, 3)


Unnamed: 0,product,rating,chunk
0,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",5.0,This product so far has not disappointed. My c...
1,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",5.0,great for beginner or experienced person. Boug...
2,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",5.0,Inexpensive tablet for him to use and learn on...
3,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",4.0,I've had my Fire HD 8 two weeks now and I love...
4,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",4.0,glossy feel on the back it is really amazing t...


### Create retriever

In [32]:
retriever = Retriever(corpus, model_name="sentence-transformers/all-MiniLM-L6-v2")

Batches: 100%|███████████████████████████████████████████████████████████████████████| 910/910 [04:24<00:00,  3.44it/s]


In [40]:
print(len(corpus))

29107


### Q&A and demo

In [33]:
product_demo = corpus["product"].mode().iloc[0] #More frequent product
question="Does the battery long all day?"
chunks=retriever.query(question, product=product_demo, top_k=6)
print(build_answer(question, chunks)["answer"][:1500])

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.84it/s]

question: Does the battery long all day?

Answer (based on recovered reviews):
- My son loves it stays on it all day wish the battery would last longer

-  whatever page you've been browsing lately (but might make for a good accidental parental watch/ control to see what the kids surfed last)-standby battery lasts several days, active use all day, but not like my ipad, where I could leave it on standby for weeks, shorter standby battery life for the FireI use mostly for web browsing, social media, and Kindle books. no other cons noted.

- Birthday gift for wife. Her other tablet battery only lasts a hour. She wanted another one. Great tablet for the price.

- Great for my 5 year old, but battery only lasts 7 hours if it's not being used!

- I got this as part of the Black Friday deals and my wife really loves it for reading books on. The battery lasts several days with moderate use.

- Keeps my grandkids busy all day I recharge it as they take a nap and it's ready to keep them busy all




### Aspects and aggregates, Aspect coverage/polarity

In [34]:
tagged = tag_aspects(corpus)
agg_all=aggregate_aspects(tagged)                        #global
agg_prod=aggregate_aspects(tagged, product_demo)         #per product demo
display(agg_all)
display(agg_prod)

Unnamed: 0,aspect,coverage,mean_polarity,n
0,price,15.48,0.912,4506
1,battery,7.36,0.839,2143
2,quality,3.81,0.891,1109
3,durability,3.56,0.863,1036
4,shipping,0.14,0.976,41


Unnamed: 0,aspect,coverage,mean_polarity,n
0,price,25.11,0.91,2842
1,battery,6.77,0.804,766
2,quality,3.82,0.854,432
3,durability,3.47,0.837,393
4,shipping,0.18,0.95,20


In [39]:
#aspects metrics summary(global)
display(agg_all)

# Optional: normalize coverage to fraction and compute simple "balance"
agg_all["coverage_frac"] = agg_all["coverage"] / 100.0
agg_all["abs_polarity"] = agg_all["mean_polarity"].abs()
print("Aspect coverage (sum):", round(agg_all["coverage_frac"].sum(), 3))
print("Most mentioned aspect:", agg_all.sort_values("coverage", ascending=False).iloc[0].to_dict())

Unnamed: 0,aspect,coverage,mean_polarity,n,coverage_frac,abs_polarity
0,price,15.48,0.912,4506,0.1548,0.912
1,battery,7.36,0.839,2143,0.0736,0.839
2,quality,3.81,0.891,1109,0.0381,0.891
3,durability,3.56,0.863,1036,0.0356,0.863
4,shipping,0.14,0.976,41,0.0014,0.976


Aspect coverage (sum): 0.304
Most mentioned aspect: {'aspect': 'price', 'coverage': 15.48, 'mean_polarity': 0.912, 'n': 4506, 'coverage_frac': 0.1548, 'abs_polarity': 0.912}


### Retrieval quality- Recall@k and MRR@k(Mean Reciprocal Rank) (proxy)

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

E = retriever.embeddings        # <-- here the it is the embeddings (N, d)
model = retriever.model         # <-- and model of embeddings
def retrieval_metrics(E_all, corpus_df, model, k_list=(1,3,5), n_probe=200, seed=42, exclude_self=True, probe_index=None):
    #E_all embeddings of corpus (n,d)
    #corpus 
    #model: SentenceTransformer (for embed queries)
    #k_list: the top k you want to evaluate (top-1,top-2,top-3)
    #n_probe: How many random queries use (subset corpus)
    
    rng=np.random.default_rng(seed)
    n=len(corpus_df)
    probe_idx=rng.choice(n, size=min(n_probe, n), replace=False)
    #probe_idx: index(rows) random of corpus that you'll use as "queries"
    
    
    #1) embed the probe text(queries)
    Q=build_embeddings(corpus_df["chunk"].iloc[probe_idx].tolist(), model) #matrix of embed queries
    
    #2)
    S=cosine_similarity(Q,E_all)  #shape: (n_probe, n) 
    #S[i,j]: similiratity between the query i and document j of corpus
    
    
    #3) ground-truth per probe: product id string
    gt_ground=corpus_df["product"].iloc[probe_idx].values
    #gt_ground[i]: truth product of the query i
    
    #4)initialize counters
    recalls={k: 0 for k in k_list}
    mrrs={k: 0.0 for k in k_list}
    
    #5) loop through each query of test
    for i, prod_q in enumerate(gt_ground):
        #order by similarity
        order=np.argsort(-S[i])  
        
        #exclude the exact same row if you want(optional)
        if exclude_self:
            order=order[order!=probe_idx[i]] #avoids automatch
        
        #products of the documents order by similarity
        ranked_prods=corpus_df["product"].iloc[order].values
        
        #6) search the first position where products matches with query
        match_pos=np.where(ranked_prods==prod_q)[0] #arr positions where there are matches
        first_hit=int(match_pos[0]) if len(match_pos) else None #first position (rank 0-based)
        
        #7) Update Recall@k and MRR@k for each k ordered 
        for k in k_list:
            #Recall@k: ¿does apppear the truth product in the first k´s?
            if prod_q in ranked_prods[:k]:
                recalls[k]+=1
            
            #MRR@k: if the first match its inside the top-k, sum 1/(range+1)
            if first_hit is not None and first_hit<k:
                mrrs[k] += 1.0/(first_hit+1) #ranks are 0-based here
    
    #8) mean over the query numbers
    nQ=len(probe_idx)
    recall_k={f"Recall@{k}": round(recalls[k]/nQ, 3) for k in k_list}
    mrr_k={f"MRR@{k}": round(mrrs[k]/ nQ, 3) for k in k_list}
    return recall_k, mrr_k

recall_k, mrr_k = retrieval_metrics(E, corpus, model, k_list=(1,3,5))
print("Retrieval quality (same-product proxy)\n")
print(recall_k)
print(mrr_k)
            
    

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  3.39it/s]


Retrieval quality (same-product proxy)

{'Recall@1': 0.465, 'Recall@3': 0.76, 'Recall@5': 0.84}
{'MRR@1': 0.465, 'MRR@3': 0.595, 'MRR@5': 0.613}
