In [22]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.utils import resample
from sklearn.datasets import fetch_california_housing
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from joblib import dump
from io import BytesIO
import zipfile
import requests
import os
from sklearn.preprocessing import MinMaxScaler
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
os.makedirs(os.path.join(os.path.expanduser('~'), '.kaggle'), exist_ok=True)

In [4]:
os.chmod(os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json'), 600)

In [21]:
!pip show kaggle


Name: kaggle
Version: 1.5.16
Summary: Kaggle API
Home-page: https://github.com/Kaggle/kaggle-api
Author: Kaggle
Author-email: support@kaggle.com
License: Apache 2.0
Location: C:\Python311\Lib\site-packages
Requires: bleach, certifi, python-dateutil, python-slugify, requests, six, tqdm, urllib3
Required-by: 


In [11]:
api = KaggleApi()
api.authenticate()

api.dataset_download_files('asaniczka/amazon-canada-products-2023-2-1m-products', path='./', unzip=True)

In [15]:
#load dataset:
Amazon_dataset=pd.read_csv("amz_ca_total_products_data_processed.csv")

### Dataset preprocessing

In [None]:
## check for null values
Amazon_dataset.isnull().sum()
Amazon_dataset[Amazon_dataset['title'].isnull()]

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,categoryName,isBestSeller,boughtInLastMonth
635743,B07Y3VCPRN,,https://m.media-amazon.com/images/I/71sd5LpMop...,https://www.amazon.ca/dp/B07Y3VCPRN,4.7,948,52.62,0.0,Boys,False,0
1186044,B077Z4RBXV,,https://m.media-amazon.com/images/I/71Uh5bhjhT...,https://www.amazon.ca/dp/B077Z4RBXV,4.8,1403,43.47,0.0,Baby,False,0
1194856,B082DJGQNK,,https://m.media-amazon.com/images/I/81SB1PxL3F...,https://www.amazon.ca/dp/B082DJGQNK,4.8,2388,52.62,0.0,Baby,False,0
1385629,B07MJT954D,,https://m.media-amazon.com/images/I/718lM2BcJo...,https://www.amazon.ca/dp/B07MJT954D,4.6,5698,57.89,0.0,Baby,False,0
1606445,B07FMKMXP7,,https://m.media-amazon.com/images/I/718lM2BcJo...,https://www.amazon.ca/dp/B07FMKMXP7,4.6,5695,29.88,0.0,Girls,False,0
2106084,B07175PTDV,,https://m.media-amazon.com/images/I/31YrpY33G+...,https://www.amazon.ca/dp/B07175PTDV,0.0,0,23.0,0.0,Computer Security Cables,False,0


In [19]:
## Encoding isBestSeller. There is 266 category name, so I won't encode them
Amazon_dataset['isBestSeller']=Amazon_dataset['isBestSeller'].astype(int)
numb_categoryName=Amazon_dataset['categoryName'].nunique()


### Content-based recommendation model

In [23]:
## Scale numerical features

scaler=MinMaxScaler()
Amazon_dataset['reviews_normalized']=scaler.fit_transform(Amazon_dataset[['reviews']])
Amazon_dataset['stars_normalized']=scaler.fit_transform(Amazon_dataset[['stars']])


In [31]:
print(Amazon_dataset['reviews_normalized'])

0          0.003310
1          0.000063
2          0.000145
3          0.002228
4          0.000053
             ...   
2165921    0.000000
2165922    0.000000
2165923    0.000000
2165924    0.000000
2165925    0.000000
Name: reviews_normalized, Length: 2165926, dtype: float64


In [39]:


## Convert numerical data to string
Amazon_dataset['reviews_str']=Amazon_dataset['reviews_normalized'].apply(lambda x: 'reviews_' + str(int(x*10000)) )
Amazon_dataset['stars_str']=Amazon_dataset['stars_normalized'].apply(lambda x: 'stars_' + str(int(x*10)) )



In [40]:
#print(Amazon_dataset['stars_str'])

0          stars_88
1          stars_76
2          stars_80
3          stars_90
4          stars_84
             ...   
2165921     stars_0
2165922     stars_0
2165923     stars_0
2165924     stars_0
2165925     stars_0
Name: stars_str, Length: 2165926, dtype: object


In [47]:
## Combined features

#Amazon_dataset['feature_combined']=Amazon_dataset['title'] + ' ' + Amazon_dataset['categoryName'] + ' ' + Amazon_dataset['reviews_str'] + ' ' + Amazon_dataset['stars_str']
Amazon_dataset['feature_combined']=Amazon_dataset['feature_combined'].fillna(' ')
print(Amazon_dataset['feature_combined'].head())


0    Green Leaf WW3D Wonder Extension Cord Winder, ...
1    8pcs Toilet Seat Bumpers Universal Toilet Repl...
2    YaeCCC 19 Pcs Hole Saw Kit 3/4''(19mm)- 6''(15...
3    LLPT Butyl Putty Tape White 1 Inch x 33Ft Tigh...
4    Lightbeam 16" Long Stem Deep Fry Thermometer w...
Name: feature_combined, dtype: object


In [48]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(Amazon_dataset['feature_combined'])

In [50]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)