In [1]:
# Data preprocessing, math and plotting
import os
import numpy as np 
import pandas as pd 
from scipy import stats 
import matplotlib.pyplot as plt
import seaborn as sns 
# ML 
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.metrics import r2_score, confusion_matrix

In [2]:
# Load the raw Olist CSV exports from the local `data/` directory.
# Forward slashes are used for every path so the cell runs on Windows, Linux
# and macOS alike (a backslash separator only resolves on Windows).
customers_df = pd.read_csv(r'data/olist_customers_dataset.csv')
geolocation_df = pd.read_csv(r'data/olist_geolocation_dataset.csv') # drop
orders_df = pd.read_csv(r'data/olist_orders_dataset.csv')
order_items_df = pd.read_csv(r'data/olist_order_items_dataset.csv')
order_payments_df = pd.read_csv(r'data/olist_order_payments_dataset.csv')
reviews_df = pd.read_csv(r'data/olist_order_reviews_dataset.csv')
products_df = pd.read_csv(r'data/olist_products_dataset.csv')
sellers_df = pd.read_csv(r'data/olist_sellers_dataset.csv')
category_names_translated_df = pd.read_csv(r'data/product_category_name_translation.csv')

# Get product category translation.
# The translation table is the left side of a left join, so only products whose
# category appears in the translation file are carried over.
products_translated_df = category_names_translated_df.merge(products_df, on='product_category_name', how='left')
# Drop the first column of the joined frame (the leading column of the
# translation table). Reassigning instead of `inplace=True` keeps the statement
# a plain expression with no hidden mutation.
products_translated_df = products_translated_df.drop(columns=products_translated_df.columns[0])

In [3]:
# Assemble one wide table by joining every dataset onto the orders table.
# Order items are attached with a left join; every later join is an outer join,
# so rows without a match on either side are kept (with NaN in the missing columns).
merged_df = (
    orders_df
    .merge(order_items_df, how='left', on='order_id')
    # validate='m:m' is a no-op check: many-to-many is always accepted.
    .merge(order_payments_df, how='outer', on='order_id', validate='m:m')
    .merge(reviews_df, how='outer', on='order_id')
    .merge(customers_df, how='outer', on='customer_id')
    .merge(sellers_df, how='outer', on='seller_id')
    .merge(products_translated_df, how='outer', on='product_id')
)

### Create Model-based Collaborative filtering system for Product Recommendation

##### Recommend items to customers based on their purchase history and on how similarly other customers rated the same items. Highly rated items bought by customers with a similar purchase history are recommended, weighted by those customers' ratings. Collaborative filtering thus helps predict products that a customer might buy, based on the preference patterns of similar customers.

##### Utility Matrix & Singular Value Decomposition (SVD)
###### A utility matrix consists of all possible customer-item preferences, using item similarity weighted by the customers' ratings of those items. The data associated with each customer represents how much the customer appreciates the item relative to others who purchased the same item.
###### The SVD transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Unlike PCA (principal component analysis), this estimator does not center the data before computing the singular value decomposition. This means that it can work efficiently with sparse matrices.

In [6]:
# Item-item recommender built from truncated SVD of a customer x product
# rating matrix. Produces `Recommend`: the product IDs whose latent-factor
# correlation with one reference product exceeds 0.90.

# subset of dataset 'df_train' (first 10,000 rows of the merged table)
df_train_subset = merged_df.head(10000)

# Utility matrix: one row per customer, one column per product, cells hold the
# review score (pivot_table averages duplicates by default); customer/product
# pairs with no review are filled with 0.
df_train_util_matrix = df_train_subset.pivot_table(values='review_score', index='customer_id', columns='product_id', fill_value=0)

# transpose the matrix so that each row is a product
X = df_train_util_matrix.T

# unique products in subset of data
X1 = X

# SVD transformation: project every product onto 10 latent components.
# random_state is pinned because the default solver is randomized; without it
# the recommendations change from one run to the next.
SVD = TruncatedSVD(n_components=10, random_state=42)
decomposed_matrix = SVD.fit_transform(X)
print(decomposed_matrix.shape)

# Correlation Matrix (product x product, computed on the latent representation)
correlation_matrix = np.corrcoef(decomposed_matrix)

# Reference product: the second product ID in the matrix index (arbitrary pick)
product = X.index[1]
print(product)

# Row position of the reference product inside the correlation matrix
product_names = list(X.index)
product_id = product_names.index(product)

# Correlation of every product with the reference product
correlation_product_ID = correlation_matrix[product_id]

# Recommend every product whose correlation with the reference product is over
# 0.90, in index order, leaving out the reference product itself.
# Filtering (rather than list.remove) avoids a ValueError when the reference
# product is absent from the candidates, e.g. if its self-correlation is NaN
# because its latent vector has zero variance.
Recommend = [
    candidate
    for candidate in X.index[correlation_product_ID > 0.90]
    if candidate != product
]

(1198, 10)
009c09f439988bc06a93d6b8186dce73


#### Below are the top products the recommendation system would display to the above customer, based on the purchase history of other customers on the website (correlation score over 0.90)
#### (only if we knew what each product ID represented....)

In [7]:
# Display the list of recommended product IDs built in the previous cell
Recommend

['06f442515f5159d999a684dfdb881917',
 '0a2fff0d95ef3bbb7dffc618f9542ba9',
 '1c42a107473d6b20db8ac7772b870e33',
 '1d2498694abb5754d2a497801dc4900e',
 '2edda1e590d6c19e5da8e72acfd2a492',
 '3a2df6d2493defbb3b0f25c4f359423f',
 '3bf87d2001b8176e6a3ad804b80774b4',
 '4629acd4e2c278e7787b2c48b006246e',
 '4f915391faab8ae23c234735d5080c31',
 '59fe488ea6ac9439bc86663f4a564c23',
 '5d847bc4fc80feb2b197cde103963ca5',
 '67a0da468c31cff23c32dbfb6dcb6b70',
 '6a23c2eb9acc78f8ec403be9c6210dd4',
 '6bca83dbf6e081bccb2b23188577ad36',
 '6df23f718baba6b30c0f3a6a516ae0bf',
 '703be52823d96209cb055709ac70e5a6',
 '7094ed51ce16d5ba737ef483df555c6f',
 '752aa5bbc175624474d9dc15558755e1',
 '7c898e0b8ea203dd94ba846627fc34d1',
 '8312d7de87652a517f739c42bb4fd3f0',
 '847ae12131fb2ab1eadf2a377dc1e620',
 '8ce6b041b5fa5dfaa937f53e145ce299',
 '92a73c1b226e585ad9d74a1b8d4e0faf',
 '95aaccc936e7e81419cce63c21062954',
 '9b6bfa37bf400cdadb8f867ce97855ae',
 '9ceac7c9fb2a8689bdba0978375b86ed',
 '9d9734db712d5ab6c3ff4c33700eb34c',
 