# Combining and Wrangling Both Data Sets

## Imports

In [11]:
import json
import gzip
import pandas as pd
import os.path
from dotenv import load_dotenv

## Environment Variables

In [12]:
load_dotenv()


def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        KeyError: if the variable is not set. The message names the missing
            variable, instead of the opaque ``TypeError`` that concatenating
            ``None`` with a string would produce.
    """
    value = os.getenv(name)
    if value is None:
        raise KeyError(f"Environment variable {name!r} is not set; add it to your .env file.")
    return value


# Input paths are built by plain string concatenation, so if IN_DATA_FILEPATH
# is a directory it needs to end with a path separator.
in_data_filepath = _require_env('IN_DATA_FILEPATH')
grocery_data_in = in_data_filepath + _require_env('RAW_META_FILE')
reviews_data_in = in_data_filepath + _require_env('RAW_REVIEW_FILE')
# This path is still hard-coded in the markdown below because no working way
# to use a variable there has been found yet.
doc_filepath = os.getenv('DOCUMENTATION_FILEPATH')

## Introduction
Data wrangling is the process of collecting and organizing the data and determining how well-defined it is.  
See [Grocery Recommender - Capstone Two](../Grocery_Recommender_-_Capstone_Two.pdf) for details about this project. 

## Load Data 
This public dataset is available online <sup>[1]</sup> and is not included in source/version control for space reasons.  
After downloading the dataset, place it at the file location configured in your `.env` file; see `environments_example.txt`.

### Product Dataset

In [13]:
# Each line of the gzipped file is parsed as one JSON document (one product record).
with gzip.open(grocery_data_in) as meta_file:
    grocery_data = [json.loads(raw_line.strip()) for raw_line in meta_file]

In [14]:
# Sanity check: the number of parsed records is the total number of products.
product_count = len(grocery_data)
print(product_count)

287051


In [15]:
# Build a pandas DataFrame from the list of product records (one row per record).
grocery_df = pd.DataFrame(grocery_data)
# Row count of the resulting frame.
len(grocery_df)

287051

In [16]:
# Preview the first two product rows to inspect the available columns.
grocery_df.head(n=2)

Unnamed: 0,category,tech1,description,fit,title,also_buy,image,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,details
0,"[Grocery & Gourmet Food, Dairy, Cheese & Eggs,...",,"[BEEMSTER GOUDA CHEESE AGED 18/24 MONTHS, Stat...",,Beemster Gouda - Aged 18/24 Months - App. 1.5 Lbs,[],[],,Ariola Imports,[],"165,181 in Grocery & Gourmet Food (","[B0000D9MYM, B0000D9MYL, B00ADHIGBA, B00H9OX59...",Grocery,,,$41.91,681727810,
1,"[Grocery & Gourmet Food, Cooking & Baking, Sug...",,"[Shipped from UK, please allow 10 to 21 busine...",,Trim Healthy Mama Xylitol,"[B01898YHXK, B01BCM6LAC, B00Q4OL47O, B00Q4OL5Q...",[https://images-na.ssl-images-amazon.com/image...,,,[],"315,867 in Grocery & Gourmet Food (",[],Grocery,,,,853347867,


### Reviews Dataset

In [17]:
# Each line of the gzipped file is parsed as one JSON document (one review record).
with gzip.open(reviews_data_in) as reviews_file:
    reviews_data = [json.loads(raw_line.strip()) for raw_line in reviews_file]

In [18]:
# Sanity check: the number of parsed records is the total number of reviews.
review_count = len(reviews_data)
print(review_count)

1143860


In [19]:
# Build a pandas DataFrame from the list of review records (one row per record).
reviews_df = pd.DataFrame(reviews_data)
# Row count of the resulting frame.
len(reviews_df)

1143860

In [20]:
# Preview the first two review rows to inspect the available columns.
reviews_df.head(n=2)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"11 19, 2014",A1QVBUH9E1V6I8,4639725183,Jamshed Mathur,No adverse comment.,Five Stars,1416355200,,,
1,5.0,True,"10 13, 2016",A3GEOILWLK86XM,4639725183,itsjustme,Gift for college student.,Great product.,1476316800,,,
