# Aquiring and cleaning data

In [1]:
# Necessary modules
import wget
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

## 1. Downloading Amazon review dataset

In [4]:
# link to evaluation data: http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Sports_and_Outdoors_5.json.gz
url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Movies_and_TV_5.json.gz"
wget.download(url, 'C:/Users/kajud/Downloads/Movies_and_TV_5.json.gz')

100% [......................................................................] 791322468 / 791322468

'C:/Users/kajud/Downloads/Movies_and_TV_5.json.gz'

## 2. Unzip and load json data

In [3]:
### load the data
data = []
with gzip.open('data/Movies_and_TV_5.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of entries
print(len(data))
# first row of the list
print(data[0])

3410019
{'overall': 5.0, 'verified': True, 'reviewTime': '11 9, 2012', 'reviewerID': 'A2M1CU2IRZG0K9', 'asin': '0005089549', 'style': {'Format:': ' VHS Tape'}, 'reviewerName': 'Terri', 'reviewText': "So sorry I didn't purchase this years ago when it first came out!!  This is very good and entertaining!  We absolutely loved it and anticipate seeing it repeatedly.  We actually wore out the cassette years back, so we also purchased this same product on cd.  Best purchase we made out of all!  Would purchase on dvd if we could find one.", 'summary': 'Amazing!', 'unixReviewTime': 1352419200}


In [4]:
# Save json file for later use
with open('Movies_and_TV_5.json', 'w') as outfile:
    json.dump(data, outfile)

In [2]:
# Opening JSON file
f = open('Movies_and_TV_5.json',)
 data = json.load(f)

In [3]:
# total length of list, this number equals total number of entries
print(len(data))
# first row of the list
print(data[0])

3410019
{'overall': 5.0, 'verified': True, 'reviewTime': '11 9, 2012', 'reviewerID': 'A2M1CU2IRZG0K9', 'asin': '0005089549', 'style': {'Format:': ' VHS Tape'}, 'reviewerName': 'Terri', 'reviewText': "So sorry I didn't purchase this years ago when it first came out!!  This is very good and entertaining!  We absolutely loved it and anticipate seeing it repeatedly.  We actually wore out the cassette years back, so we also purchased this same product on cd.  Best purchase we made out of all!  Would purchase on dvd if we could find one.", 'summary': 'Amazing!', 'unixReviewTime': 1352419200}


## 3. Creating dataframe only with necessary columns

In [20]:
df = pd.DataFrame.from_dict(data)
df_purged = df[["reviewText", "overall"]]

In [24]:
# Grouping ratings into three classes (negative-0, neutral-1, positive-2)
df_purged.loc[df_purged['overall'] <= 2., 'label'] = 0 
df_purged.loc[df_purged['overall'] == 3., 'label'] = 1
df_purged.loc[df_purged['overall'] >= 4., 'label'] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

In [29]:
# Checking how balanced the dataset it
df_purged['label'].value_counts()

2.0    2694711
0.0     365608
1.0     349700
Name: label, dtype: int64

In [30]:
# Saving cleaned dataset
df_purged.to_csv("data/Movies_and_TV_5_purged.csv")

In [31]:
len(df_purged)

3410019