## Imports

In [5]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

## Data Summary

In [6]:
df = pd.read_csv('../Data/combined_data_cleaned.csv')

In [7]:
df.head(2)

Unnamed: 0,category,description,title,also_buy,brand,rank,main_cat,asin,details,overall,verified,reviewerID,reviewText,summary
0,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",['Lipton Yellow Label Tea use only the finest ...,Lipton Yellow Label Tea (loose tea) - 450g,"['B00886E4K0', 'B00CREXSHY', 'B001QTRGAQ', 'B0...",Lipton,"30,937 in Grocery & Gourmet Food (",Grocery,4639725043,,5.0,True,A1J205ZK25TZ6W,I make the best brewed iced tea with this yell...,Best for brewed iced tea.
1,"['Grocery & Gourmet Food', 'Beverages', 'Coffe...",['Lipton Yellow Label Tea use only the finest ...,Lipton Yellow Label Tea (loose tea) - 450g,"['B00886E4K0', 'B00CREXSHY', 'B001QTRGAQ', 'B0...",Lipton,"30,937 in Grocery & Gourmet Food (",Grocery,4639725043,,3.0,True,ACOICLIJQYECU,I have recently started drinking hot tea again...,Not Bad for iced Tea


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086548 entries, 0 to 1086547
Data columns (total 14 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   category     1086548 non-null  object 
 1   description  993137 non-null   object 
 2   title        1086548 non-null  object 
 3   also_buy     929733 non-null   object 
 4   brand        1078563 non-null  object 
 5   rank         1042485 non-null  object 
 6   main_cat     1085923 non-null  object 
 7   asin         1086548 non-null  object 
 8   details      1086497 non-null  object 
 9   overall      1086548 non-null  float64
 10  verified     1086548 non-null  bool   
 11  reviewerID   1086548 non-null  object 
 12  reviewText   1086175 non-null  object 
 13  summary      1086335 non-null  object 
dtypes: bool(1), float64(1), object(12)
memory usage: 108.8+ MB


### Explore Data

In [13]:
# Only one of the features is numeric
df.overall.describe()

count    1.086548e+06
mean     4.439878e+00
std      1.065040e+00
min      1.000000e+00
25%      4.000000e+00
50%      5.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: overall, dtype: float64

In [11]:
# list_columns = ['category', 'description', 'also_buy', 'rank']
# df_no_lists = df.drop(list_columns, axis=1)
# df_no_lists.info()
# 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086548 entries, 0 to 1086547
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   title       1086548 non-null  object 
 1   brand       1078563 non-null  object 
 2   main_cat    1085923 non-null  object 
 3   asin        1086548 non-null  object 
 4   details     1086497 non-null  object 
 5   overall     1086548 non-null  float64
 6   verified    1086548 non-null  bool   
 7   reviewerID  1086548 non-null  object 
 8   reviewText  1086175 non-null  object 
 9   summary     1086335 non-null  object 
dtypes: bool(1), float64(1), object(8)
memory usage: 75.6+ MB


In [12]:
# df_no_lists.describe()

Unnamed: 0,overall
count,1086548.0
mean,4.439878
std,1.06504
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


#### Analyze Categories

In [14]:
df_objects = df.select_dtypes(include='object')
dfo_unique = pd.DataFrame(df_objects.nunique()).reset_index()
dfo_unique.columns = ['Features', 'Number of Categories']
dfo_unique.sort_values(by='Number of Categories', ascending=False)
dfo_unique

Unnamed: 0,Features,Number of Categories
0,category,1107
1,description,36532
2,title,40969
3,also_buy,29115
4,brand,8861
5,rank,36865
6,main_cat,19
7,asin,41280
8,details,40178
9,reviewerID,127496


Except for the main_cat, there are too many unique categories for several traditional types of exploratory data analysis to be helpful.
For example, one hot encoding using pd.get_dummies(df) runs into memory errors even when the biggest features are dropped from the analysis.
Therefore, in later notebooks will use options created specifically for recommendation systems. 