In [14]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
# Surprise Package
import surprise

In [56]:
# Read data: load the pickled train and test frames (train first, then test).
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
train_df, test_df = map(pd.read_pickle, ("../../train_df.pkl", "../../test_df.pkl"))

### Merge datasets

In [57]:
# Stack the train and test frames row-wise; sort=False leaves the column order
# unsorted when the two frames' columns are not aligned.
data = pd.concat(objs=[train_df, test_df], axis=0, sort=False)

In [58]:
# Explore dataset: (number of rows, number of columns) of the combined frame
data.shape

(290639, 20)

### Drop rows without recommendations

In [59]:
# Missing recommendations: number of rows whose "also_buy" value is NaN.
# Series.sum() counts the True flags in vectorized code instead of looping over
# every row with the builtin sum(); int() keeps the displayed value a plain int.
int(data["also_buy"].isna().sum())

13311

In [60]:
# Percentage of rows whose "also_buy" value is NaN.
# The count uses the vectorized Series.sum() rather than the builtin sum();
# int() keeps the division in plain Python numbers so the output is a float.
int(data["also_buy"].isna().sum()) / len(data) * 100

4.579908408713215

In [61]:
# Drop rows with no recommendation (NaN in "also_buy").
# Rebind the name instead of passing inplace=True: the result is the same set
# of rows, but the statement stays chainable and follows current pandas guidance.
data = data.dropna(axis=0, subset=["also_buy"])

In [62]:
# Check dropna: count of remaining rows with NaN in "also_buy" (expected 0).
# Vectorized Series.sum() replaces the row-by-row builtin sum(); int() keeps
# the displayed value a plain int.
int(data["also_buy"].isna().sum())

0

In [63]:
# Row count of the data after the drop
data.shape[0]

277328

### Obtaining unique list of videogames

In [80]:
# Unique product IDs present in the data.
# Despite the name, games_list is a set.
games_list = {*data["productID"].unique()}

In [81]:
# Explore list of unique videogames: number of distinct productID values
len(games_list)

9322

### Restricting recommendations to videogames only

In [72]:
# Parse each "also_buy" string into the Python literal it spells out.
# NOTE(review): assumes every value is a string holding a list literal — confirm.
data["amz_recommendation"] = data["also_buy"].apply(ast.literal_eval)

In [84]:
# Keep only the recommended products whose ID is also in games_list;
# each row ends up holding a set (possibly empty).
data["game_recommendation"] = data["amz_recommendation"].apply(
    lambda recommended: set(recommended) & games_list
)

In [87]:
# Size of the filtered (games-only) recommendation set on each row
data["n_recommended_games"] = data["game_recommendation"].map(len)

In [88]:
# Size of the full parsed recommendation list on each row
data["n_recommended_products"] = data["amz_recommendation"].map(len)

In [96]:
# Number of rows where the games-only count differs from the full product
# count, i.e. rows where the filter removed at least one recommendation.
# Vectorized Series.sum() replaces the row-by-row builtin sum(); int() keeps
# the displayed value a plain int.
int((data["n_recommended_games"] != data["n_recommended_products"]).sum())

273635

### Checking for sufficient recommendations

In [98]:
# Number of rows with at least 10 recommended games.
# Vectorized Series.sum() replaces the row-by-row builtin sum(); int() keeps
# the displayed value a plain int.
int((data["n_recommended_games"] >= 10).sum())

235594

In [101]:
# Percentage of rows with at least 10 recommended games.
# The count uses the vectorized Series.sum() rather than the builtin sum();
# int() keeps the division in plain Python numbers so the output is a float.
int((data["n_recommended_games"] >= 10).sum()) / len(data) * 100

84.95139329602492

In [99]:
# Number of rows with at least 15 recommended games.
# Vectorized Series.sum() replaces the row-by-row builtin sum(); int() keeps
# the displayed value a plain int.
int((data["n_recommended_games"] >= 15).sum())

223381

In [102]:
# Percentage of rows with at least 15 recommended games.
# The count uses the vectorized Series.sum() rather than the builtin sum();
# int() keeps the division in plain Python numbers so the output is a float.
int((data["n_recommended_games"] >= 15).sum()) / len(data) * 100

80.54758264582011