## Importing libraries

In [50]:
# Import libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.utils import shuffle
import pickle

## Loading all Gaming category (clenaed, merged file with reviews + metadata)

In [3]:
# Import cleaned dataset
raw_data = pd.read_csv("../../merged_data_edited.csv", index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Column names
raw_data.columns

Index(['Unnamed: 0.1', 'reviewID', 'overall', 'verified', 'reviewTime',
       'reviewerID', 'productID', 'reviewText', 'summary', 'vote', 'style',
       'category', 'title', 'brand', 'rank', 'main_cat', 'description',
       'also_buy', 'also_view', 'feature', 'numberOfReviews'],
      dtype='object')

In [5]:
# Drop first column
data = raw_data.drop(labels = ["Unnamed: 0.1"], axis = 1)

In [26]:
# Explore resulting df
print(data.shape)
data.head(2)

(473824, 20)


Unnamed: 0,reviewID,overall,verified,reviewTime,reviewerID,productID,reviewText,summary,vote,style,category,title,brand,rank,main_cat,description,also_buy,also_view,feature,numberOfReviews
0,0,5,True,2015-10-17,A1HP7NVNPFMA4N,700026657,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
1,1,4,False,2015-07-27,A1JGAP0185YJI6,700026657,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13


In [23]:
# Count unique product IDs
len(data["productID"].unique())

17408

## Loading Game Features to extract only Games from Gaming Category

In [9]:
# Loading file with game features
game_features_df = joblib.load("../../game_feature_predictions.joblib")

In [12]:
# Explore df
print(game_features_df.shape)
game_features_df.head(2)

(28223, 15)


Unnamed: 0,asin,features_joined,logreg_preds_proba_v1,games_logreg_preds_v1,logreg_preds_proba_v2,games_logreg_preds_v2,cnn_preds_proba_v1,games_cnn_preds_v1,cnn_preds_proba_v2,games_cnn_preds_v2,full_preds,new_preds,game,full_model_prediction,new_model_predictions
0,439335310,Grades 2-12 Spelling Program Teaches Spelling ...,0.088128,26,0.257119,25,0.814444,15,0.158219,7,15,25,Phonics Alive! 3: The Speller,rpg_action,sports
1,439339006,Sim City 3000 CD-ROM,0.12725,10,0.401816,25,0.178567,9,0.358299,25,25,25,Sim City 3000,sports,sports


In [16]:
# Unique product IDs
print(len(game_features_df["asin"].unique()))
game_features_df["asin"].unique()

28223


array(['0439335310', '0439339006', '0439335299', ..., 'B01HIZGKOE',
       'B01HIZF7XE', 'B01HJ13III'], dtype=object)

In [22]:
# Create list with unique product IDs, relevant to games only
relevant_products = list(game_features_df["asin"].unique())
len(relevant_products)

28223

## Keeping only Games

In [25]:
# Count how many rows we would keep from the overall file
sum(data["productID"].isin(relevant_products))

294407

In [27]:
# Proportion we would keep
sum(data["productID"].isin(relevant_products))/len(data)

0.6213425238063078

In [28]:
# Keep only rows corresponding to relevant products
relevant_data = data[data["productID"].isin(relevant_products)]

In [30]:
# Check conversion
sum(relevant_data["productID"].isin(relevant_products)) == len(relevant_data)

True

In [32]:
# Explore new dataframe
print(relevant_data.shape)
relevant_data.head(2)

(294407, 20)


Unnamed: 0,reviewID,overall,verified,reviewTime,reviewerID,productID,reviewText,summary,vote,style,category,title,brand,rank,main_cat,description,also_buy,also_view,feature,numberOfReviews
0,0,5,True,2015-10-17,A1HP7NVNPFMA4N,700026657,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13
1,1,4,False,2015-07-27,A1JGAP0185YJI6,700026657,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",0,,"['Video Games', 'PC', 'Games']",Anno 2070,Ubisoft,">#30,230 in Video Games (See Top 100 in Video ...",Video Games,['ANNO 2070BRAND NEW - IN STOCKDVD Rom Softwar...,,"['B013F0IP1C', 'B00JDP1AWU', 'B00XR3YC2E', 'B0...",['A new era: while adhering to the fundamental...,13


## Splitting into Train and Test

In [35]:
# Select ratio and define split cut
train_ratio = 0.7
shuffled_data = shuffle(relevant_data, random_state = 232323)
split_cut = np.int(np.round(len(shuffled_data) * train_ratio))

In [36]:
# Split the data
train_df = shuffled_data.iloc[0:split_cut]
test_df = shuffled_data.iloc[split_cut::]

In [37]:
# Check resulting shapes
print(train_df.shape)
print(test_df.shape)

(206085, 20)
(88322, 20)


In [38]:
# For Test set, keep only rows with products and users present in Train set
test_df = test_df[(test_df['reviewerID'].isin(train_df['reviewerID'])) 
                  & (test_df['productID'].isin(train_df['productID']))]

In [39]:
# Check resulting shapes
print(train_df.shape)
print(test_df.shape)

(206085, 20)
(84554, 20)


## Verifying collaborative filtering will work (All test users and products are present in Train)

In [40]:
# Unique users in full dataset
print(len(relevant_data["reviewerID"].unique()))
# Unique users in full dataset
print(len(relevant_data["productID"].unique()))

52023
10482


In [41]:
# Unique users in Train dataset
print(len(train_df["reviewerID"].unique()))
# Unique users in Train dataset
print(len(train_df["productID"].unique()))

49666
10474


In [42]:
# Unique users in Test dataset
print(len(test_df["reviewerID"].unique()))
# Unique users in Test dataset
print(len(test_df["productID"].unique()))

35963
9970


In [47]:
# All reviewers from Test contained in Train?
sum(test_df["reviewerID"].isin(train_df["reviewerID"].unique())) == len(test_df)

True

In [46]:
# All products from Test contained in Train?
sum(test_df["productID"].isin(train_df["productID"].unique())) == len(test_df)

True

In [48]:
# Exploring shapes of resulting dataframes
print(train_df.shape)
print(test_df.shape)

(206085, 20)
(84554, 20)


## Saving Train and Test files

In [51]:
# Save Train with pickle
train_df.to_pickle("../../train_df.pkl")

In [52]:
# Save Test with pickle
test_df.to_pickle("../../test_df.pkl")