# Combining and Wrangling Both Data Sets

## Imports

In [28]:
import json
import gzip
import pandas as pd
import os.path
from dotenv import load_dotenv
import numpy as np

# Display options.
# numpy: 4 decimal places, summarise arrays with more than 5 elements,
# and suppress scientific notation for small values.
np.set_printoptions(precision=4, threshold=5, suppress=True)
# pandas: use the fully-qualified option names. The bare 'precision' key is
# ambiguous in newer pandas releases (it also matches
# 'styler.format.precision') and is rejected with an OptionError.
pd.set_option('display.precision', 3, 'display.notebook_repr_html', True)

# Seed numpy's global random generator so results are reproducible.
np.random.seed(2)

## Environment Variables

In [29]:
load_dotenv()


def _required_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set, so the failure names the
            missing variable instead of surfacing later as an opaque
            ``TypeError`` from ``None + str``.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable '{name}' is not set; "
            "add it to your .env file (see environments_example.txt)."
        )
    return value


# The input paths are built by plain string concatenation, so the path
# separator must already be part of one of the two values.
grocery_data_in = _required_env('IN_DATA_FILEPATH') + _required_env('RAW_META_FILE')
reviews_data_in = _required_env('IN_DATA_FILEPATH') + _required_env('RAW_REVIEW_FILE')
# The documentation path is still hard-coded in the markdown below because
# no working way to reference a variable from markdown has been found.
doc_filepath = os.getenv('DOCUMENTATION_FILEPATH')

## Introduction
Data wrangling is the process of collecting and organizing data and determining how well-defined it is.  
See [Grocery Recommender - Capstone Two](../Grocery_Recommender_-_Capstone_Two.pdf) for details about this project. 

## Load Data 
This public dataset is available online <sup>[1]</sup> and is not included in source/version control for space reasons.  
After downloading the dataset, place it in the file location configured in your .env file; see environments_example.txt for an example.

### Product Dataset

In [30]:
# Read the gzipped product file line by line and parse each line as JSON.
grocery_data = []
with gzip.open(grocery_data_in) as meta_file:
    grocery_data.extend(json.loads(raw_line.strip()) for raw_line in meta_file)

In [31]:
# Sanity check that the load worked: report how many product records were parsed.
product_count = len(grocery_data)
print(product_count)

287051


In [32]:
# Turn the list of parsed records into a DataFrame (one row per record)
# and display the resulting row count.
grocery_df = pd.DataFrame(grocery_data)
grocery_df.shape[0]

287051

In [33]:
# Preview the first two product rows to check how the columns were parsed.
grocery_df.head(2)

Unnamed: 0,category,tech1,description,fit,title,also_buy,image,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,details
0,"[Grocery & Gourmet Food, Dairy, Cheese & Eggs,...",,"[BEEMSTER GOUDA CHEESE AGED 18/24 MONTHS, Stat...",,Beemster Gouda - Aged 18/24 Months - App. 1.5 Lbs,[],[],,Ariola Imports,[],"165,181 in Grocery & Gourmet Food (","[B0000D9MYM, B0000D9MYL, B00ADHIGBA, B00H9OX59...",Grocery,,,$41.91,681727810,
1,"[Grocery & Gourmet Food, Cooking & Baking, Sug...",,"[Shipped from UK, please allow 10 to 21 busine...",,Trim Healthy Mama Xylitol,"[B01898YHXK, B01BCM6LAC, B00Q4OL47O, B00Q4OL5Q...",[https://images-na.ssl-images-amazon.com/image...,,,[],"315,867 in Grocery & Gourmet Food (",[],Grocery,,,,853347867,


### Reviews Dataset

In [34]:
# Read the gzipped review file line by line and parse each line as JSON.
reviews_data = []
with gzip.open(reviews_data_in) as review_file:
    reviews_data.extend(json.loads(raw_line.strip()) for raw_line in review_file)

In [35]:
# The number of parsed records is the total number of reviews.
review_count = len(reviews_data)
print(review_count)

1143860


In [36]:
# Turn the list of parsed records into a DataFrame (one row per record)
# and display the resulting row count.
reviews_df = pd.DataFrame(reviews_data)
reviews_df.shape[0]

1143860

In [37]:
# Preview the first two review rows to check how the columns were parsed.
reviews_df.head(2)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"11 19, 2014",A1QVBUH9E1V6I8,4639725183,Jamshed Mathur,No adverse comment.,Five Stars,1416355200,,,
1,5.0,True,"10 13, 2016",A3GEOILWLK86XM,4639725183,itsjustme,Gift for college student.,Great product.,1476316800,,,


## Merge DataFrames

In [38]:
# Inner join of product metadata with reviews on the shared 'asin' key.
# Columns present in both frames (e.g. 'image') get the _x / _y suffixes.
# NOTE(review): the result has more rows than reviews_df, which suggests some
# asin values repeat in the product frame — the later duplicate clean-up
# appears to account for this; confirm.
combined_df = pd.merge(left=grocery_df, right=reviews_df, how='inner', on='asin')
combined_df.shape

(1167889, 29)

In [39]:
# Strip every space character out of the column labels.
combined_df.columns = [label.replace(' ', '') for label in combined_df.columns]

In [40]:
# Preview the first two rows of the merged frame.
combined_df.head(2)

Unnamed: 0,category,tech1,description,fit,title,also_buy,image_x,tech2,brand,feature,...,verified,reviewTime,reviewerID,reviewerName,reviewText,summary,unixReviewTime,vote,style,image_y
0,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",,[Lipton Yellow Label Tea use only the finest t...,,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",[https://images-na.ssl-images-amazon.com/image...,,Lipton,[],...,True,"04 29, 2012",A1J205ZK25TZ6W,kez panel project,I make the best brewed iced tea with this yell...,Best for brewed iced tea.,1335657600,8,,
1,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",,[Lipton Yellow Label Tea use only the finest t...,,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",[https://images-na.ssl-images-amazon.com/image...,,Lipton,[],...,True,"04 11, 2008",ACOICLIJQYECU,N D,I have recently started drinking hot tea again...,Not Bad for iced Tea,1207872000,9,,


## Clean Data
See other wrangling notebooks for additional details.

In [41]:
# Column dtypes and non-null counts before any cleaning.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1167889 entries, 0 to 1167888
Data columns (total 29 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   category        1167889 non-null  object 
 1   tech1           1167889 non-null  object 
 2   description     1167889 non-null  object 
 3   fit             1167889 non-null  object 
 4   title           1167889 non-null  object 
 5   also_buy        1167889 non-null  object 
 6   image_x         1167889 non-null  object 
 7   tech2           1167889 non-null  object 
 8   brand           1167889 non-null  object 
 9   feature         1167889 non-null  object 
 10  rank            1167889 non-null  object 
 11  also_view       1167889 non-null  object 
 12  main_cat        1167889 non-null  object 
 13  similar_item    1167889 non-null  object 
 14  date            1167889 non-null  object 
 15  price           1167889 non-null  object 
 16  asin            1167889 non-null  ob

In [42]:
# Normalise the different "missing" markers (empty string, the text 'NaN',
# and zero) to a real NaN so pandas counts them as null.
# NOTE(review): the 0 -> NaN rule is applied to every column of the frame —
# confirm no column holds a meaningful zero.
missing_markers = dict.fromkeys(['', 'NaN', 0], np.nan)
combined_df.replace(missing_markers, inplace=True)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1167889 entries, 0 to 1167888
Data columns (total 29 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   category        1167889 non-null  object 
 1   tech1           6035 non-null     object 
 2   description     1167889 non-null  object 
 3   fit             0 non-null        float64
 4   title           1167889 non-null  object 
 5   also_buy        1167889 non-null  object 
 6   image_x         1167889 non-null  object 
 7   tech2           0 non-null        float64
 8   brand           1159512 non-null  object 
 9   feature         1167889 non-null  object 
 10  rank            1167889 non-null  object 
 11  also_view       1167889 non-null  object 
 12  main_cat        1167264 non-null  object 
 13  similar_item    1654 non-null     object 
 14  date            17115 non-null    object 
 15  price           827357 non-null   object 
 16  asin            1167889 non-null  ob

In [43]:
# Some price values are not usable as a single number: text containing '{'
# (html without numbers) and ranges written as 'low - high'.
# See Wrangling_Grocery_and_Gourmet_Food_Meta for additional info.
def _blank_unusable_price(raw_price):
    """Return NaN for prices containing '{' or ' - '; otherwise return the value unchanged."""
    price_text = str(raw_price)
    if '{' in price_text or ' - ' in price_text:
        return np.nan
    return raw_price


combined_df['price'] = combined_df['price'].apply(_blank_unusable_price)

In [44]:
# Replace empty lists in the list-valued columns with NaN so they count as missing.
list_columns = ['category', 'description', 'also_buy', 'feature', 'also_view']


def _empty_to_nan(value):
    """Return NaN for an empty sized value; return anything else unchanged.

    Values without a length (for example a float NaN that is already marking
    a missing entry) are passed through instead of raising TypeError.
    """
    try:
        return np.nan if len(value) == 0 else value
    except TypeError:
        return value


# Iterate the column names directly; slicing the frame first would copy
# the selected columns only to read their labels.
for col in list_columns:
    combined_df[col] = combined_df[col].apply(_empty_to_nan)

In [45]:
# Keep only the columns whose non-null count is at least 51% of the rows.
# `thresh` is a count of non-null values, so round the fractional product up
# to an integer; this keeps exactly the same columns as comparing against
# the unrounded value.
drop_thresh = int(np.ceil(combined_df.shape[0] * 0.51))
# `how` is omitted on purpose: it cannot be combined with `thresh`
# (newer pandas raises a TypeError when both are passed).
combined_df = combined_df.dropna(thresh=drop_thresh, axis='columns').copy()

In [46]:
# Report the number of missing values in each remaining column.
nan_counts = combined_df.isnull().sum()
for col, nan_total in nan_counts.items():
    print(f'Total NaN in {col} is {nan_total}')

Total NaN in category is 0
Total NaN in description is 96824
Total NaN in title is 0
Total NaN in also_buy is 164495
Total NaN in image_x is 0
Total NaN in brand is 8377
Total NaN in rank is 0
Total NaN in also_view is 518601
Total NaN in main_cat is 625
Total NaN in price is 362643
Total NaN in asin is 0
Total NaN in details is 51
Total NaN in overall is 0
Total NaN in verified is 0
Total NaN in reviewTime is 0
Total NaN in reviewerID is 0
Total NaN in reviewerName is 141
Total NaN in reviewText is 395
Total NaN in summary is 220
Total NaN in unixReviewTime is 0
Total NaN in style is 561294


In [47]:
# Columns this project does not use:
#   image_x                     - image links; images are not analysed.
#   reviewTime, unixReviewTime  - no time series analysis is done.
#   reviewerName                - removed for privacy considerations.
unused_columns = ['image_x', 'reviewTime', 'unixReviewTime', 'reviewerName']
combined_df = combined_df.drop(columns=unused_columns)
combined_df.head(2)

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,style
0,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,$12.46,4639725043,,5.0,True,A1J205ZK25TZ6W,I make the best brewed iced tea with this yell...,Best for brewed iced tea.,
1,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,$12.46,4639725043,,3.0,True,ACOICLIJQYECU,I have recently started drinking hot tea again...,Not Bad for iced Tea,


In [48]:
# Column dtypes and non-null counts after the unused columns were dropped.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1167889 entries, 0 to 1167888
Data columns (total 17 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   category     1167889 non-null  object 
 1   description  1071065 non-null  object 
 2   title        1167889 non-null  object 
 3   also_buy     1003394 non-null  object 
 4   brand        1159512 non-null  object 
 5   rank         1167889 non-null  object 
 6   also_view    649288 non-null   object 
 7   main_cat     1167264 non-null  object 
 8   price        805246 non-null   object 
 9   asin         1167889 non-null  object 
 10  details      1167838 non-null  object 
 11  overall      1167889 non-null  float64
 12  verified     1167889 non-null  bool   
 13  reviewerID   1167889 non-null  object 
 14  reviewText   1167494 non-null  object 
 15  summary      1167669 non-null  object 
 16  style        606595 non-null   object 
dtypes: bool(1), float64(1), object(15)
memory usag

In [49]:
# Compare the total number of asin entries (one per row) with the number
# of distinct asin values.
asin_values = combined_df['asin']
print("Count of products in database", asin_values.size)
print("Unique count of products in database", len(asin_values.unique()))

Count of products in database 1167889
Unique count of products in database 41280


In [50]:
# Select every row for one asin: the product fields repeat on each row
# while the review fields vary.
is_example_asin = combined_df['asin'] == '4639725043'
example_duplicate = combined_df[is_example_asin]
example_duplicate.head(3)

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,style
0,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,$12.46,4639725043,,5.0,True,A1J205ZK25TZ6W,I make the best brewed iced tea with this yell...,Best for brewed iced tea.,
1,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,$12.46,4639725043,,3.0,True,ACOICLIJQYECU,I have recently started drinking hot tea again...,Not Bad for iced Tea,
2,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,$12.46,4639725043,,5.0,True,A29RCQA5G0B1BA,I like pretty much all of Lipton's tea... I ju...,A Great Cuppa...!,


In [51]:
# Look for rows that repeat both the product and the review information.
# The list-valued columns are left out of the comparison to avoid errors.
compare_columns = ['title', 'asin', 'reviewerID', 'reviewText', 'summary']
duplicate_mask = combined_df.duplicated(subset=compare_columns)
similar_duplicates = combined_df.loc[duplicate_mask]
similar_duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81341 entries, 161 to 1166228
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   category     81341 non-null  object 
 1   description  77928 non-null  object 
 2   title        81341 non-null  object 
 3   also_buy     73661 non-null  object 
 4   brand        80949 non-null  object 
 5   rank         81341 non-null  object 
 6   also_view    70377 non-null  object 
 7   main_cat     81341 non-null  object 
 8   price        52442 non-null  object 
 9   asin         81341 non-null  object 
 10  details      81341 non-null  object 
 11  overall      81341 non-null  float64
 12  verified     81341 non-null  bool   
 13  reviewerID   81341 non-null  object 
 14  reviewText   81319 non-null  object 
 15  summary      81326 non-null  object 
 16  style        45079 non-null  object 
dtypes: bool(1), float64(1), object(15)
memory usage: 10.6+ MB


In [52]:
# Keep the first row of each duplicated group (as defined by compare_columns)
# and renumber the remaining rows from 0.
combined_df = combined_df.drop_duplicates(
    subset=compare_columns,
    keep='first',
    ignore_index=True,
)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086548 entries, 0 to 1086547
Data columns (total 17 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   category     1086548 non-null  object 
 1   description  993137 non-null   object 
 2   title        1086548 non-null  object 
 3   also_buy     929733 non-null   object 
 4   brand        1078563 non-null  object 
 5   rank         1086548 non-null  object 
 6   also_view    578911 non-null   object 
 7   main_cat     1085923 non-null  object 
 8   price        752804 non-null   object 
 9   asin         1086548 non-null  object 
 10  details      1086497 non-null  object 
 11  overall      1086548 non-null  float64
 12  verified     1086548 non-null  bool   
 13  reviewerID   1086548 non-null  object 
 14  reviewText   1086175 non-null  object 
 15  summary      1086343 non-null  object 
 16  style        561516 non-null   object 
dtypes: bool(1), float64(1), object(15)
memory usag

In [53]:
# Re-run the duplicate check; an empty preview means no repeated rows remain.
remaining_mask = combined_df.duplicated(subset=compare_columns)
similar_duplicates = combined_df.loc[remaining_mask]
similar_duplicates.head()

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,style


In [55]:
# Delete the ',' and '$' characters from each price's text form, then
# convert the column to float.
price_noise = str.maketrans('', '', ',$')
combined_df['price'] = (
    combined_df['price']
    .map(lambda raw_price: str(raw_price).translate(price_noise))
    .astype('float')
)
combined_df.head(3)

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,main_cat,price,asin,details,overall,verified,reviewerID,reviewText,summary,style
0,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,12.46,4639725043,,5.0,True,A1J205ZK25TZ6W,I make the best brewed iced tea with this yell...,Best for brewed iced tea.,
1,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,12.46,4639725043,,3.0,True,ACOICLIJQYECU,I have recently started drinking hot tea again...,Not Bad for iced Tea,
2,"[Grocery & Gourmet Food, Beverages, Coffee, Te...",[Lipton Yellow Label Tea use only the finest t...,Lipton Yellow Label Tea (loose tea) - 450g,"[B00886E4K0, B00CREXSHY, B001QTRGAQ, B002EYZM4...",Lipton,"30,937 in Grocery & Gourmet Food (","[B00CREXSHY, B001QTRGAQ, B000JSQK70, B002EYZM4...",Grocery,12.46,4639725043,,5.0,True,A29RCQA5G0B1BA,I like pretty much all of Lipton's tea... I ju...,A Great Cuppa...!,
