In [16]:
import pandas as pd
from sklearn.utils import shuffle
import string
import re

In [2]:
# Load the raw review tables, one CSV per language.
# The cells below read the `stars`, `review_body`, `review_title`,
# `language` and `product_category` columns from these frames.
french = pd.read_csv('../data/source/french.csv')
english = pd.read_csv('../data/source/english.csv')

In [3]:
def n_sampling(df:pd.DataFrame, nb, random_state=1):
    """
    Build a star-balanced random sample of ``df``.

    For every distinct value of ``df['stars']`` (missing values are ignored),
    ``nb`` rows are drawn without replacement. The draws are concatenated once
    and then shuffled so the rows are not grouped by star value.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; must contain a ``stars`` column.
    nb : int
        Number of rows to draw for each star value.
    random_state : int or None, default 1
        Seed used for the per-star draws and for the final shuffle, so that
        repeated calls return the same sample. Pass ``None`` to get a
        different sample on every call.

    Returns
    -------
    pd.DataFrame
        ``nb`` rows per star value, keeping the original index labels, in
        shuffled order. An empty frame with the same columns is returned when
        ``df`` has no rows with a star value.

    Raises
    ------
    ValueError
        If some star value has fewer than ``nb`` rows.
    """
    # One draw per star value; collected in a list so concat runs only once.
    samples = [
        rows.sample(n=nb, random_state=random_state)
        for _, rows in df.groupby('stars', sort=False)
    ]
    if not samples:
        # pd.concat refuses an empty list, so short-circuit explicitly.
        return df.iloc[0:0].copy()

    dataset = pd.concat(samples)
    # sample(frac=1) is a full-row permutation, i.e. a seeded shuffle.
    return dataset.sample(frac=1, random_state=random_state)

In [4]:
# Draw 500 reviews per distinct star value from each language.
# df1 holds the French sample, df2 the English sample.
df1 = n_sampling(french, 500)
df2 = n_sampling(english, 500)

In [5]:
def french_scores(col):
    """Translate a star rating into an opinion label for French reviews.

    1 or 2 -> 'non satisfied', 3 -> 'mixed', any other value -> 'satisfied'.
    """
    if col in (1, 2):
        return 'non satisfied'
    if col == 3:
        return 'mixed'
    # Everything else (4, 5, but also any unexpected value) falls through here.
    return 'satisfied'


def english_scores(col):
    """Translate a star rating into an opinion label for English reviews.

    1 -> 'non satisfied', 2 -> 'mixed', any other value -> 'satisfied'.
    """
    # NOTE(review): this differs from french_scores, where 1-2 stars are
    # 'non satisfied' and 3 stars are 'mixed'; here 3 stars count as
    # 'satisfied'. Confirm the asymmetry is intended before changing it.
    if col == 1:
        return 'non satisfied'
    return 'mixed' if col == 2 else 'satisfied'

In [6]:
# Add an `opinion` label to every sampled review, derived from its star
# rating with the language-specific mapping (French sample first).
df1['opinion'] = df1['stars'].apply(french_scores)

# English sample: uses its own star -> opinion mapping.
df2['opinion'] = df2['stars'].apply(english_scores)

In [7]:
# Class balance of the French sample after labelling.
df1['opinion'].value_counts()

opinion
non satisfied    1000
satisfied        1000
mixed             500
Name: count, dtype: int64

In [8]:
# Class balance of the English sample after labelling.
df2['opinion'].value_counts()

opinion
satisfied        1500
mixed             500
non satisfied     500
Name: count, dtype: int64

In [9]:
# Bring every (language, opinion) group to the same size before merging.
# Groups larger than 500 rows are down-sampled to 500; the others are taken
# whole (they already hold 500 rows given the 500-per-star sampling above).
# Fixed seeds make the draw and the final row order reproducible across
# kernel restarts.
grpfr1 = df1[df1['opinion'] == 'satisfied'].sample(500, random_state=1)
grpfr2 = df1[df1['opinion'] == 'non satisfied'].sample(500, random_state=1)
grpfr3 = df1[df1['opinion'] == 'mixed']

grpen1 = df2[df2['opinion'] == 'satisfied'].sample(500, random_state=1)
grpen2 = df2[df2['opinion'] == 'non satisfied']
grpen3 = df2[df2['opinion'] == 'mixed']

# ignore_index=True discards the source index labels, which would otherwise
# collide between the French and English frames.
newdata = pd.concat([grpfr1, grpfr2, grpfr3, grpen1, grpen2, grpen3], ignore_index=True)
newdata = shuffle(newdata, random_state=1)

In [10]:
# Check that the merged set is balanced across the three opinion classes.
newdata['opinion'].value_counts()

opinion
mixed            1000
non satisfied    1000
satisfied        1000
Name: count, dtype: int64

In [11]:
# Check that the merged set is balanced across the two languages.
newdata['language'].value_counts()

language
en    1500
fr    1500
Name: count, dtype: int64

In [12]:
# Work on string views of the two text columns. Missing values become ''
# so that length and concatenation never fail on a non-string cell.
body_text = newdata['review_body'].fillna('').astype(str)
title_text = newdata['review_title'].fillna('').astype(str)

# Character counts of the review body and of the review title.
newdata['desc_length'] = body_text.str.len()
newdata['title_length'] = title_text.str.len()

# Model input text: the title followed by the body, separated by one space.
newdata['comment'] = title_text + ' ' + body_text

In [13]:
# Shorten the column names used in the exported file.
newdata = newdata.rename(
    columns={
        'stars': 'notes',
        'review_body': 'body',
        'review_title': 'title',
        'language': 'lang',
        'product_category': 'product',
        'desc_length': 'bodylen',
        'title_length': 'titlelen',
    }
)

In [14]:
# Replace the shuffled index labels with a clean 0..n-1 range.
# Rebinding the result (instead of inplace=True) keeps the step chainable
# and avoids mutating the frame behind the reader's back.
newdata = newdata.reset_index(drop=True)

In [17]:
# Deletion table built once instead of on every call: it maps each character
# of string.punctuation (ASCII punctuation only) to None.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Characters are removed, not replaced by a space, so ``"l'article"``
    becomes ``"larticle"``. Non-ASCII punctuation such as guillemets or
    typographic apostrophes is left untouched.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip ASCII punctuation from the merged title + body text.
newdata['comment'] = newdata['comment'].apply(remove_punctuation)

In [18]:
# Lower-case the merged comment text.
newdata['comment'] = newdata['comment'].str.lower()

# Compiled once at definition time. Each range is a Unicode block (or a single
# code point) that holds emoji or emoji modifiers. The ranges are kept narrow
# on purpose: one wide range spanning U+24C2..U+1F251 would also delete CJK,
# Hangul, ligatures and other ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed supplements (incl. flags)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars)
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0000200D"             # zero width joiner used inside emoji sequences
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(strings):
    """Return the text with emoji characters deleted.

    Runs of emoji (and the joiner / variation-selector code points that glue
    emoji sequences together) are removed without inserting a replacement, so
    surrounding whitespace is kept as is. Letters of any script are preserved.

    Parameters
    ----------
    strings : str
        A single text to clean (despite the plural parameter name).

    Returns
    -------
    str
        The text without emoji.
    """
    return _EMOJI_PATTERN.sub(r'', strings)

# Strip emoji from the merged comment text.
newdata['comment'] = newdata['comment'].apply(remove_emoji)

In [19]:
# Integer class id for each opinion label.
# Named `opinion_to_score` rather than `map` so the built-in `map` is not
# shadowed for the rest of the session.
opinion_to_score = {'non satisfied': 0, 'mixed': 1, 'satisfied': 2}

# Indexing the dict (rather than passing it to .map directly) raises KeyError
# on an unexpected label instead of silently producing NaN.
newdata['score'] = newdata['opinion'].map(lambda label: opinion_to_score[label])

In [20]:
# Preview the first rows of the final table.
newdata.head()

Unnamed: 0,review_id,product_id,reviewer_id,notes,body,title,lang,product,opinion,bodylen,titlelen,comment,score
0,en_0947109,product_en_0690907,reviewer_en_0426954,2,This came and broke apart in the box. Thankful...,This came and broke apart in the box. Thankful...,en,home,mixed,139,52,this came and broke apart in the box thankfull...,1
1,en_0815926,product_en_0890680,reviewer_en_0412863,1,Smells pretty good but I won't be using it bec...,Fell apart in shipping,en,grocery,non satisfied,168,22,fell apart in shipping smells pretty good but ...,0
2,en_0961690,product_en_0622829,reviewer_en_0138790,5,Great server cue and delivery,Delivery quick and easy,en,beauty,satisfied,29,23,delivery quick and easy great server cue and d...,2
3,en_0457731,product_en_0574275,reviewer_en_0687718,1,Part of the front of the mug was all blurred o...,Bad print job I guess,en,kitchen,non satisfied,133,21,bad print job i guess part of the front of the...,0
4,en_0102544,product_en_0773298,reviewer_en_0060721,4,Two of my favorite authors back together again...,Two of my favorite authors back together again...,en,digital_ebook_purchase,satisfied,2122,62,two of my favorite authors back together again...,2


In [22]:
# Persist the cleaned, labelled sample.
# The DataFrame index is written as an unnamed first column because
# `index=False` is not passed.
# NOTE(review): after reset_index the index is only 0..n-1; consider
# index=False if no downstream reader relies on that column.
newdata.to_csv('../data/cleaned/test.csv')

### Features: `body` and `title` are merged into the `comment` column