# Loading the data

In [17]:
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

# Data loading
# The data directory can be overridden with the DATA_PATH environment variable
# (for example from the .env file read by load_dotenv() above). The original
# machine-specific location is kept as the fallback so existing setups still work.
data_path = os.getenv(
    "DATA_PATH",
    "/Users/deveshsurve/UNIVERSITY/INFO/7374/Final-Project-SmartCommerce/data_pipeline/data_files/",
)


def _load_csv(file_name):
    """Read one CSV file located in ``data_path`` and return it as a DataFrame."""
    # os.path.join copes with data_path given with or without a trailing slash.
    return pd.read_csv(os.path.join(data_path, file_name))


customers_df = _load_csv('olist_customers_dataset.csv')
orders_df = _load_csv('olist_orders_dataset.csv')
order_items_df = _load_csv('olist_order_items_dataset.csv')
products_df = _load_csv('olist_products_dataset.csv')
product_category_df = _load_csv('product_category_name_translation.csv')
geolocation_df = _load_csv('olist_geolocation_dataset.csv')

In [24]:
# Show the geolocation table as the cell output (the saved output is an
# abbreviated first/last-rows view of the frame).
geolocation_df

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.644820,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS
1000159,99900,-27.877125,-52.224882,getulio vargas,RS
1000160,99950,-28.071855,-52.014716,tapejara,RS
1000161,99980,-28.388932,-51.846871,david canabarro,RS


# RecSys - #1 - Collaborative Filtering

Libraries to consider : 
- We read about the scikit-surprise library. Scikit-Surprise is a Python library specifically designed for building and analyzing recommender systems. 
- It includes a wide range of algorithms for collaborative filtering, such as Singular Value Decomposition (SVD), k-Nearest Neighbors (k-NN), and baseline algorithms. 
- Scikit-Surprise provides functions to evaluate the performance of different algorithms using cross-validation and other metrics like RMSE (Root Mean Squared Error) and MAE (Mean Absolute Error). 



Concern : 
- Ideally, this library works best with explicit ratings and with overlapping data (users who interacted with several items, and items shared by several users).

In [2]:
# !pip install scikit-surprise

In [3]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy


In [4]:
# Join each order with its line items (inner join on order_id), then keep only
# the three columns used to build the user-item interaction table.
# NOTE(review): customer_id is used as the user key here; presumably the
# customers table offers a per-person identifier that would give more overlap
# between users — confirm against customers_df before relying on these results.
interaction_columns = ['customer_id', 'product_id', 'order_item_id']
merged_orders_df = orders_df.merge(order_items_df, on='order_id')
order_product_customer_data = merged_orders_df[interaction_columns]


In [5]:
# Rename columns to match Surprise's requirements.
# Work on an explicit copy so the column selection made earlier is not a view
# of the merged frame when it is modified below.
order_product_customer_data = order_product_customer_data.copy()
order_product_customer_data.columns = ['userID', 'itemID', 'rating']

# The third column originally holds order_item_id, which is an identifier
# (presumably the position of the item within its order — confirm), not a
# rating, and it can fall outside the (1, 1) scale declared below. We only have
# implicit interactions, so every observed (user, item) pair gets rating 1.
order_product_customer_data['rating'] = 1

# Use a reader to parse the dataframe
reader = Reader(rating_scale=(1, 1))  # Since we only have interaction, rating scale is 1 to 1

# Load the data
surprise_data = Dataset.load_from_df(order_product_customer_data[['userID', 'itemID', 'rating']], reader)

In [6]:
# Hold out 20% of the interactions for testing. The seed is fixed so that a
# Restart & Run All reproduces the same split.
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=42)

We face an issue here: the kernel crashes, so the model-fitting and evaluation cells below are left commented out.


In [7]:
# algo = KNNBasic()

# # Train the algorithm on the trainset
# algo.fit(trainset)


In [25]:
# predictions = algo.test(testset)

In [9]:
# # Compute and print RMSE
# rmse = accuracy.rmse(predictions)
# print(f'RMSE: {rmse}')

# Recsys - #2 - Content Based Filtering

While reading about content-based filtering, I had an idea: isn't content-based filtering similar to RAG (retrieval-augmented generation), where we retrieve similar items based on a user's search query? Here, the query would instead be built from the user's item history.

Explanation
- Data Preparation: Combine product features into a single string for each product to create a comprehensive representation.
- Document Creation: Convert the product data into Document objects required by the LlamaIndex library.
- Indexing: Build a vector store index from the documents.
- Querying: Define a function to query the index for similar products based on a given product's combined features.
- Evaluation: Use metrics like Precision@K and user feedback to evaluate the recommendations.


In [10]:
# !pip install llama-index-embeddings-openai

In [11]:
import pandas as pd
from llama_index.core import Document, VectorStoreIndex

In [12]:

# Merge to get English product category names.
# Guarded so that re-running this cell does not merge a second time: a repeated
# merge would yield suffixed *_x / *_y columns instead of
# 'product_category_name_english', which the feature-building cell reads.
if 'product_category_name_english' not in products_df.columns:
    products_df = pd.merge(products_df, product_category_df, on='product_category_name', how='left')

# Fill NaN values
# NOTE(review): this fills every column, so numeric columns that contain
# missing values end up holding '' alongside numbers.
products_df = products_df.fillna('')

In [15]:
# Build one text string per product for indexing: the English category name
# followed by three numeric descriptors, separated by single spaces.
feature_columns = [
    'product_category_name_english',
    'product_name_length',
    'product_description_length',
    'product_photos_qty',
]


def _combine_features(row):
    """Join the selected feature values of one product row with spaces."""
    return ' '.join(f"{row[column]}" for column in feature_columns)


products_df['combined_features'] = products_df.apply(_combine_features, axis=1)


In [28]:
# Preview the first rows of the products table.
# NOTE(review): the saved output below shows neither 'product_category_name_english'
# nor 'combined_features', so it was presumably produced on freshly loaded data
# before the merge and feature cells were run — re-run top to bottom to refresh.
products_df.head()

Unnamed: 0,product_id,product_category_name,product_name_length,product_description_length,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [26]:

# # Convert product data into documents
# documents = []
# for _, row in products_df.iterrows():
#     doc = Document(
#         text=row['combined_features'],
#         metadata={'product_id': row['product_id']}
#     )
#     documents.append(doc)


# # Load documents and build index
# index = VectorStoreIndex.from_documents(documents)

In [27]:
# # Define query function
# def get_similar_products(query_text, index):
#     query_doc = Document(text=query_text)
#     results = index.query(query_doc)
#     return results

In [None]:
# # Example usage
# query_text = "electronics 40 287 1"
# similar_products = get_similar_products(query_text, index)
# for result in similar_products:
#     print(result.metadata['product_id'])

We face an issue here: the product info contains only the product category name in Portuguese and its English translation, so there is very little text to embed and search on.