# Preprocessing data

## Imports

In [1]:
# imports 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set()

## Paths

In [2]:
# Locations of the full cleaned dataset and of the reduced copy produced below.
# Forward slashes are used because they resolve on Windows as well as POSIX,
# and avoid the invalid escape sequences ('\d', '\c', '\e') that backslashes
# inside a normal (non-raw) string literal produce.
path = '../data/clean/electronics.csv'
small_path = '../data/clean/electronics_small.csv'

## Constants

In [3]:
# Columns kept in the reduced dataset, in output order.
columns = ['overall', 'vote', 'verified', 'reviewTime', 'reviewText', 'summary']

# Replacement values for missing entries, keyed by column name.
fill_values = {
    'overall': 0,
    'vote': 0,
    'verified': False,
}

# Dtypes used while reading the CSV: every known column is loaded as a generic
# Python object so no value is coerced at parse time.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias for the builtin).
old_dtypes = {
    'overall': object,
    'vote': object,
    'verified': object,
    'reviewTime': object,
    'reviewText': object,
    'summary': object,
    'reviewerID': object,
    'asin': object,
    'style': object,
    'reviewerName': object,
    'unixReviewTime': object,
    'image': object,
}

# First-pass target dtypes for the kept columns.
# `bool` / `object` replace the removed `np.bool` / `np.object` aliases, and the
# datetime target carries an explicit unit because newer pandas versions reject
# a unit-less `np.datetime64` in `astype`.
col_dtypes = {
    'overall': np.float64,
    'vote': np.float64,
    'verified': bool,
    'reviewTime': 'datetime64[ns]',
    'reviewText': object,
    'summary': object,
}

# Second-pass dtypes: the numeric columns are narrowed from float to integer.
value_dtypes = {
    'overall': np.int16,
    'vote': np.int64,
}

## Cleaning Data Format

In [4]:
# Open the cleaned CSV lazily. Because `chunksize` is given, `df` is not a
# DataFrame but an iterator yielding DataFrames of at most 1,000,000 rows each;
# it can be consumed only once.
df = pd.read_csv(
    path,
    dtype=old_dtypes,     # load the listed columns as plain Python objects
    thousands=',',
    parse_dates=[3],      # column at position 3 of the file - presumably reviewTime; TODO confirm
    chunksize=1_000_000,
)

In [5]:
# Clean the dataset chunk by chunk and write the reduced copy to `small_path`.
# `i` counts processed chunks; it is initialised up front so the summary line
# still works when the reader yields nothing.
i = 0

for i, chunk in enumerate(df, start=1):
    # Keep only well-formed rows: `overall` must be a string shorter than five
    # characters (rows with a missing `overall` fail the comparison and are
    # dropped) and neither `overall` nor `vote` may hold a stray boolean
    # literal.
    well_formed = (
        (chunk['overall'].str.len() < 5)
        & (chunk['overall'] != 'True')
        & ~chunk['vote'].isin(['True', 'False'])
    )
    # .copy() gives an independent frame, so the column assignments below do not
    # write into a view of `chunk` (avoids SettingWithCopyWarning).
    cleaned = chunk.loc[well_formed, columns].copy()

    # Strip thousands separators so the vote strings can be cast to numbers.
    cleaned['vote'] = cleaned['vote'].str.replace(',', '', regex=False)
    cleaned = cleaned.fillna(fill_values)

    # `verified` was read as object, so it holds the strings 'True'/'False'.
    # Casting those straight to bool would turn 'False' into True (any non-empty
    # string is truthy), so the text is parsed explicitly instead.
    cleaned['verified'] = cleaned['verified'].astype(str).str.lower().eq('true')

    new_df = cleaned.astype(col_dtypes).astype(value_dtypes)

    # The first chunk (re)creates the file and writes the header; later chunks
    # append. Re-running the notebook therefore replaces the output instead of
    # appending a second copy to it.
    is_first = (i == 1)
    new_df.to_csv(small_path, index=False, mode='w' if is_first else 'a', header=is_first)
    print(i, end=' ')
print(f'\nProcessed {i} chunks')

1 2 3 4 5 6 7 
Processed 7 chunks
