# Data Cleaning and Transformation

## Imports

In [72]:
import pandas as pd

## Collecting the data

In [96]:
# Read the raw reviews file; the 'Id' column is used as the DataFrame index.
data = pd.read_csv("dataset/Reviews.csv", index_col="Id")

## Transforming Data

Removing rows with missing or invalid values

In [97]:
# Build the cleaned frame from the raw one in a single pass; `data` itself is
# left untouched because every step below returns a new DataFrame.
t_data = (
    data
    # Drop rows that are missing any of the fields required downstream.
    .dropna(subset=['ProductId', 'UserId', 'Time', 'Text'])
    # Keep only scores inside the inclusive range [0, 5]; rows whose score is
    # missing fail the comparison and are dropped as well.
    # NOTE(review): the lower bound admits a score of 0 — confirm that 0 is a
    # legitimate rating in this dataset.
    .loc[lambda df: df['Score'].between(0, 5)]
    # A review cannot have more "helpful" votes than total votes.
    .loc[lambda df: df['HelpfulnessNumerator'] <= df['HelpfulnessDenominator']]
)

Adding a new column with the number of *negative* helpfulness votes (total votes minus positive votes)

In [98]:
# Derive the number of "not helpful" votes and give the helpfulness columns
# clearer names.
#
# The guard makes this cell safe to re-run: after the first run the source
# columns have been renamed and 'NegativeHelpfulness' already exists, so an
# unguarded insert() would raise ValueError on the duplicate column.
if 'NegativeHelpfulness' not in t_data.columns:
    # Compute from the cleaned frame (not the raw `data`) so the values are
    # derived from exactly the rows that survived cleaning.
    negativeHelpfulness = t_data['HelpfulnessDenominator'] - t_data['HelpfulnessNumerator']

    # Place the new column immediately after the positive-vote column.
    pos = t_data.columns.get_loc('HelpfulnessNumerator')
    t_data.insert(pos + 1, 'NegativeHelpfulness', negativeHelpfulness)

    # Rename both vote columns in one call.
    t_data = t_data.rename(columns={
        'HelpfulnessNumerator': 'PositiveHelpfulness',
        'HelpfulnessDenominator': 'TotalHelpfulness',
    })

In [99]:
# Show the first three rows to check the new and renamed helpfulness columns.
t_data.head(n=3)

Unnamed: 0_level_0,ProductId,UserId,ProfileName,PositiveHelpfulness,NegativeHelpfulness,TotalHelpfulness,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,0,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,0,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


Converting the "Time" column from Unix epoch seconds into a timezone-aware UTC timestamp (*TIMESTAMP WITH TIME ZONE*)

In [100]:
# Interpret the numeric 'Time' values as seconds since the Unix epoch and
# replace the column with timezone-aware (UTC) timestamps.
t_data = t_data.assign(
    Time=lambda df: pd.to_datetime(df['Time'], unit='s', utc=True)
)

## Data Preview

In [101]:
# Preview the first three rows of the fully transformed frame.
t_data.head(n=3)

Unnamed: 0_level_0,ProductId,UserId,ProfileName,PositiveHelpfulness,NegativeHelpfulness,TotalHelpfulness,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,0,1,5,2011-04-27 00:00:00+00:00,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1,2012-09-07 00:00:00+00:00,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,0,1,4,2008-08-18 00:00:00+00:00,"""Delight"" says it all",This is a confection that has been around a fe...


## Load to CSV

In [102]:
# Write the transformed frame to disk; the 'Id' index is written as the first column.
t_data.to_csv('dataset/Reviews_t.csv', index=True)