# Import libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
import re
import unidecode
import html

# Load the training CSV files and combine them into one dataframe

In [2]:
# load all 8 training CSV files; their paths differ only in the file number
train_frames = [
    pd.read_csv('reviews/train-{}.csv'.format(file_no))
    for file_no in range(1, 9)
]

# keep the per-file names defined, in case other cells refer to them
df1, df2, df3, df4, df5, df6, df7, df8 = train_frames

# combine the dataframes into one big dataframe
# NOTE: pd.concat is called without ignore_index, so every file keeps its own
# row labels and the same label can occur once per file in the combined frame
df = pd.concat(train_frames)

# Show the tail of the data

In [3]:
# peek at the last rows of the combined frame (DataFrame.tail defaults to 5 rows)
df.tail()

Unnamed: 0.1,Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,marketplace_id,product_category_id,label
1147,11940,B0011V9XT6,475533906,Beyoncé: The Beyonce Experience - Live [Blu-ra...,N,N,Blu-ray definitely all regions,I don't know why Amazon and one reviewer here ...,2009-12-29,1,3,False
1148,11944,B000026FJ6,96406179,The Songs of Distant Earth,N,Y,Wohlklingende Entspannung,"Ich mag diese Musik sehr, es trifft genau mein...",2013-12-18,0,6,True
1149,11951,B00K6S2HS4,409414789,Tolles Album,N,Y,Native (Gold Edition),"Tolle Band,<br />tolles Album,<br />tolle Lied...",2015-02-23,0,6,False
1150,11953,B00B1COIRC,716679008,Populaire,N,N,Coup de coeur pour ce film adorable et original,R&eacute;jouissante com&eacute;die grâce &agra...,2015-04-10,0,3,True
1151,11998,B00ECTN372,89201247,Percy Jackson 2 : La mer des monstres,N,Y,Dans la lignée du 1er,"Si vous avez aimé le 1er, celui-ci vous ravira...",2014-02-28,2,3,False


# Decode the HTML character entities
Entities such as `&eacute;` are converted back to the characters they stand for (here `é`).

In [4]:
# turn HTML character references back into plain characters, column by column
# (every cell is passed through str() first, so non-string cells are decoded
# from their string form)
for text_column in ('review_headline', 'review_body'):
    decoded = df[text_column].apply(lambda cell: html.unescape(str(cell)))
    df[text_column] = decoded

# Get rid of the line breaks
Each HTML line-break tag `<br />` is replaced with a single space.

In [5]:
# swap every HTML line-break tag for a single space in both text columns
# (cells are converted with str() before the replacement)
for text_column in ('review_headline', 'review_body'):
    without_breaks = df[text_column].apply(
        lambda cell: str(cell).replace('<br />', ' ')
    )
    df[text_column] = without_breaks

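# Get rid of the accent spam
Some texts are stuffed with needless accents, like: `Ist dié Géilsté Sérié dér Wélt`. A text in which more than 11% of the characters are non-ASCII is transliterated to plain ASCII.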

In [6]:
def remove_accent_spam(text, threshold=0.11):
    """Transliterate ``text`` to ASCII when too many of its characters are non-ASCII.

    Parameters
    ----------
    text : str
        The text to check.
    threshold : float, default 0.11
        Largest share of non-ASCII characters for which ``text`` is returned
        untouched.

    Returns
    -------
    str
        ``text`` itself when it is empty or when its share of non-ASCII
        characters is at most ``threshold``; otherwise the result of
        ``unidecode.unidecode(text)``.
    """
    # the total number of characters
    total = len(text)

    # an empty text has no ratio to compute; returning it as-is avoids a
    # division by zero below
    if total == 0:
        return text

    # the number of non-ASCII characters: code points above 127, which are
    # exactly the characters whose UTF-8 encoding needs more than one byte
    non_ascii = sum(1 for char in text if ord(char) > 127)

    # just return the original text if the ratio does not exceed the threshold
    if non_ascii / total <= threshold:
        return text

    # return the transliterated text
    return unidecode.unidecode(text)

In [7]:
# run the accent-spam filter (default threshold) over both text columns
for text_column in ('review_body', 'review_headline'):
    df[text_column] = df[text_column].apply(remove_accent_spam)

In [8]:
# show the last rows again to check the text columns after the cleaning steps above
df.tail()

Unnamed: 0.1,Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,marketplace_id,product_category_id,label
1147,11940,B0011V9XT6,475533906,Beyoncé: The Beyonce Experience - Live [Blu-ra...,N,N,Blu-ray definitely all regions,I don't know why Amazon and one reviewer here ...,2009-12-29,1,3,False
1148,11944,B000026FJ6,96406179,The Songs of Distant Earth,N,Y,Wohlklingende Entspannung,"Ich mag diese Musik sehr, es trifft genau mein...",2013-12-18,0,6,True
1149,11951,B00K6S2HS4,409414789,Tolles Album,N,Y,Native (Gold Edition),"Tolle Band, tolles Album, tolle Lieder, einfac...",2015-02-23,0,6,False
1150,11953,B00B1COIRC,716679008,Populaire,N,N,Coup de coeur pour ce film adorable et original,Réjouissante comédie grâce à laquelle on ne vo...,2015-04-10,0,3,True
1151,11998,B00ECTN372,89201247,Percy Jackson 2 : La mer des monstres,N,Y,Dans la lignée du 1er,"Si vous avez aimé le 1er, celui-ci vous ravira...",2014-02-28,2,3,False


# Fix the mixed up product title and review headlines
In some rows the product title and the review headline were swapped.

- **Note:** This does not fix every row: when a product has only one row, there is nothing to compare it with, so we cannot tell whether its title and headline are correct.
- **Note:** If a product has two reviews with the same headline and one of them has its title and headline swapped, this method also swaps the title and headline of the correct review, making that row wrong.

In [9]:
def fix_title_headline(df, row):
    """Return ``row`` with title and headline swapped when they look switched.

    The headline of ``row`` counts as switched when it equals a
    ``product_title`` of any row in ``df`` that has the same ``product_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``product_id`` and ``product_title`` columns; it is only
        read. It is filtered on every call, so one call costs a full pass
        over ``df``.
    row : pandas.Series
        One row with ``product_id``, ``product_title`` and
        ``review_headline`` entries.

    Returns
    -------
    pandas.Series
        ``row`` itself when nothing has to change, otherwise a copy of
        ``row`` with ``product_title`` and ``review_headline`` exchanged.
    """
    # all rows of this product, and the distinct titles they carry
    same_product = df[df['product_id'] == row['product_id']]
    known_titles = same_product['product_title'].unique()

    # get the review headline
    review_headline = row['review_headline']

    # the headline is not one of the product's titles: nothing to fix
    if review_headline not in known_titles:
        return row

    # swap on a copy: DataFrame.apply does not support functions that mutate
    # the object passed to them
    fixed = row.copy()
    fixed['review_headline'] = row['product_title']
    fixed['product_title'] = review_headline
    return fixed

In [16]:
# Swap title and headline back in the rows where they were mixed up.
# Known limitations:
#   * not every row gets fixed: a product with a single row gives us nothing
#     to compare against, so we cannot tell whether it is correct
#   * when a product has 2 reviews with the same headline and one of them is
#     mixed up, the correct review gets switched as well and ends up wrong
def _fix_one_row(record):
    return fix_title_headline(df, record)

df = df.apply(_fix_one_row, axis=1)

In [17]:
# Unnamed: 0 == 7394 was wrong, but has been fixed now
# NOTE: .loc[742] selects every row labelled 742; the training files were
# concatenated with their own row labels kept, so this shows one row per file
# and the row of interest is the one with Unnamed: 0 == 7394
df.loc[742]

Unnamed: 0.1,Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,marketplace_id,product_category_id,label
742,7701,B00B4VXQEG,819895224,What About Now (Limited Deluxe Edition im Digi...,N,Y,Bon Jovi cd,"Very satisfied with the purchase of this cd, i...",2013-04-09,3,6,False
742,7317,B0090CVJZ4,577429472,AmazonBasics Lot de 24 fiches bananes plaquées...,Y,N,Parfaite mais....,Ce lot de fiches bananes est de très bonne fin...,2013-11-04,2,13,True
742,7433,B0001NDG70,796529195,the tarantino connection,N,N,True Romance - Coffret Collector 3 DVD [Éditio...,D'accord Tony scott est un réalisateur ringard...,2004-05-15,2,3,True
742,7394,B00B8SSWJE,491997969,Delta Machine (Deluxe Edition),N,Y,súpér,Als Fan ein absolutes Muss. Nach dem letzten K...,2013-04-23,0,6,True
742,7241,2266186671,974337128,L'homme qui voulait être heureux,N,N,Vraiment consternant!!!,Un chapelet de poncifs par chapitre . Un style...,2010-07-14,2,0,True
742,7166,B007ATHNJA,782003376,"Monteverdi, Claudio - L'incoronazione di Poppe...",N,Y,Eine meiner Lieblings - CD´`s,Diese Cd ist ausgesprochener Ohrwurm für Freun...,2013-05-13,0,3,False
742,7554,3125362318,671217915,The Hound of the Baskervilles: Englische Lektü...,N,N,Super spannend,Does the dark night of Dartmoor hide the legen...,2006-03-28,0,24,True
742,7605,B008JGSM6G,345007076,Flow Free,N,Y,addictive,Very addictive game and keeps your mind occupi...,2013-01-19,1,1,False
