In [6]:
# Data preprocessing, math and plotting
import os
import numpy as np 
import pandas as pd 
from scipy import stats 
import matplotlib.pyplot as plt
import seaborn as sns 
# ML 
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.metrics import r2_score, confusion_matrix

In [9]:
# Load the raw Olist CSV exports into one DataFrame per table.
# Paths are built with os.path.join so the cell runs on any OS; the original
# mixed a Windows-only backslash path with forward-slash paths.
DATA_DIR = "data"  # directory containing the CSV files, relative to the working directory


def _read_olist_csv(filename):
    """Read the CSV file named `filename` from DATA_DIR and return it as a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename))


customers_df = _read_olist_csv("olist_customers_dataset.csv")
geolocation_df = _read_olist_csv("olist_geolocation_dataset.csv")  # marked "drop" by the original author
orders_df = _read_olist_csv("olist_orders_dataset.csv")
order_items_df = _read_olist_csv("olist_order_items_dataset.csv")
order_payments_df = _read_olist_csv("olist_order_payments_dataset.csv")
reviews_df = _read_olist_csv("olist_order_reviews_dataset.csv")
products_df = _read_olist_csv("olist_products_dataset.csv")
sellers_df = _read_olist_csv("olist_sellers_dataset.csv")
category_names_translated_df = _read_olist_csv("product_category_name_translation.csv")

# Get product category translation: attach each product to its row in the
# translation table, matching on the original-language category name.
# NOTE(review): the translation table is the left frame of a left join, so
# products whose category does not appear in that table are not carried into
# products_translated_df - confirm this is intended.
products_translated_df = category_names_translated_df.merge(
    products_df, on='product_category_name', how='left'
)
# Drop the first column of the merged frame (presumably the untranslated
# 'product_category_name' key - confirm against the CSV header). Reassigning
# instead of inplace=True keeps the step free of in-place mutation.
products_translated_df = products_translated_df.drop(
    columns=products_translated_df.columns[[0]]
)

In [10]:
# Build one wide table by joining every Olist table onto the orders table.
# Order items are left-joined onto orders; every later join is an outer join,
# so rows without a match on either side are kept with NaN in the other
# side's columns.
merged_df = (
    orders_df
    .merge(order_items_df, on='order_id', how='left')
    # validate='m:m' documents that both sides may repeat order_id; pandas
    # performs no actual check for this setting.
    .merge(order_payments_df, on='order_id', how='outer', validate='m:m')
    .merge(reviews_df, on='order_id', how='outer')
    .merge(customers_df, on='customer_id', how='outer')
    .merge(sellers_df, on='seller_id', how='outer')
    .merge(products_translated_df, on='product_id', how='outer')
)

### Create Model-based Collaborative filtering system

##### Recommend items to customers based on their purchase history and on how similar their ratings are to those of other customers. Items bought by the same customers, with the highest ratings, will be recommended to other customers with a similar purchase history, weighted by their item rankings. Using a collaborative filtering technique helps predict products that a customer might buy, based on the patterns seen in customer-specific preferences.

##### Utility Matrix & Singular Value Decomposition (SVD)
###### A utility matrix consists of all possible customer-item preferences, using item similarity weighted by the customers' ratings of those items. The data associated with each customer represents how much the customer appreciates the item, with respect to others who purchased the same item.
###### The SVD transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Unlike PCA (principal component analysis), this estimator does not center the data before computing the singular value decomposition. This means that it can work efficiently with sparse matrices.

In [13]:
# Build a customer x product utility matrix from a subset of the merged data
# and reduce it with truncated SVD.

# Tunable settings for this cell.
TRAIN_SUBSET_ROWS = 10000   # number of leading rows of merged_df to use
SVD_COMPONENTS = 10         # number of latent dimensions kept by the SVD
SVD_RANDOM_STATE = 42       # seed for the randomized SVD solver (reproducible runs)

# Subset of the merged dataset used for training.
df_train_subset = merged_df.head(TRAIN_SUBSET_ROWS)

# Utility matrix: one row per customer, one column per product, holding the
# review score. pivot_table's default aggregation (mean) is applied when a
# customer/product pair occurs in several rows; pairs with no review are
# filled with 0.
df_train_util_matrix = df_train_subset.pivot_table(
    values='review_score',
    index='customer_id',
    columns='product_id',
    fill_value=0,
)

# Transpose so that each row is a product and each column a customer.
X = df_train_util_matrix.T

# X1 is a second name for the same object as X (no copy is made).
X1 = X

# SVD transformation. A fixed random_state is passed because the default
# solver is randomized; without it the result differs between runs.
SVD = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SVD_RANDOM_STATE)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape