In [2]:
""" Needed packages """
import os
import sys
import textwrap
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Make the sibling ../scripts directory importable (presumably where the
# `extractor` module imported below lives). The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
scripts_dir = os.path.abspath("../scripts")
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from extractor import HuggingFaceDataLoad

In [3]:
# Dataset identifier and the language configurations to load.
DATASET_NAME = 'amazon_reviews_multi'
LANG = ['fr', 'en']

# One loader per language, unpacked in the same order as LANG.
loader_fr, loader_en = (
    HuggingFaceDataLoad(DATASET_NAME, language_code) for language_code in LANG
)

### Explore French version

In [4]:
# Ask the French loader for the dataset description; per its log output the
# text is saved to a dataset_desc_fr.txt file, which the next cell reads back.
# NOTE(review): the log reports ./data/source/... while the next cell opens
# ../data/source/... — confirm which base directory the loader writes to.
loader_fr.inspectdatadescription()

Loading dataset description

Dataset description loaded in ./data/source/dataset_desc_fr.txt


In [5]:
# Load the saved French description and print it wrapped at 80 columns.
with open('../data/source/dataset_desc_fr.txt', 'r') as desc_file:
    fr_desc = desc_file.read()
print(*textwrap.wrap(fr_desc, width=80), sep='\n')

We provide an Amazon product reviews dataset for multilingual text
classification. The dataset contains reviews in English, Japanese, German,
French, Chinese and Spanish, collected between November 1, 2015 and November 1,
2019. Each record in the dataset contains the review text, the review title, the
star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-
grained product category (e.g. ‘books’, ‘appliances’, etc.) The corpus is
balanced across stars, so each star rating constitutes 20% of the reviews in
each language.  For each language, there are 200,000, 5,000 and 5,000 reviews in
the training, development and test sets respectively. The maximum number of
reviews per reviewer is 20 and the maximum number of reviews per product is 20.
All reviews are truncated after 2,000 characters, and all reviews are at least
20 characters long.  Note that the language of a review does not necessarily
match the language of its marketplace (e.g. reviews from amazon.de are p

In [6]:
# Fetch the feature schema (column names and their dtypes) for the French
# configuration and display it as the cell output.
fr_features = loader_fr.inspectdatafeatures()
fr_features

{'review_id': Value(dtype='string', id=None),
 'product_id': Value(dtype='string', id=None),
 'reviewer_id': Value(dtype='string', id=None),
 'stars': Value(dtype='int32', id=None),
 'review_body': Value(dtype='string', id=None),
 'review_title': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'product_category': Value(dtype='string', id=None)}

In [7]:
# Load the 'train' split of the French configuration.
data_fr = loader_fr.getdata('train')

Found cached dataset amazon_reviews_multi (/home/jds98/.cache/huggingface/datasets/amazon_reviews_multi/fr/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


In [8]:
# Check which object type getdata() returned before converting it to pandas.
type(data_fr)

datasets.arrow_dataset.Dataset

In [9]:
# Convert the French train split to a pandas DataFrame, then show its
# dimensions and the first rows as a sanity check.
df1 = data_fr.to_pandas()
print(df1.shape)
df1.head()

(200000, 8)


Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,fr_0424335,product_fr_0297678,reviewer_fr_0961886,1,A déconseiller - Article n'a fonctionné qu'une...,Brumisateur à pompe,fr,beauty
1,fr_0452615,product_fr_0613288,reviewer_fr_0857499,1,Si vous voulez être déçu achetez le produit ! ...,Insatisfaisant,fr,baby_product
2,fr_0407673,product_fr_0571250,reviewer_fr_0383240,1,"Écran de mauvaise qualité, car il s'use en peu...",Ne recommande pas,fr,pc
3,fr_0579191,product_fr_0030168,reviewer_fr_0729693,1,Cet engin ne sert à rien les sons sont pourris...,A éviter!,fr,musical_instruments
4,fr_0931533,product_fr_0468261,reviewer_fr_0734066,1,Très beau produit mais la grue n'a pas fonctio...,Déçue,fr,toy


In [10]:
# Persist the full French train split (before any sampling) as CSV, leaving
# out the DataFrame index.
df1.to_csv('../data/source/french.csv', index=False)

### Explore English version

In [11]:
# Ask the English loader for the dataset description; per its log output the
# text is saved to a dataset_desc_en.txt file, which the next cell reads back.
# NOTE(review): the log reports ./data/source/... while the next cell opens
# ../data/source/... — confirm which base directory the loader writes to.
loader_en.inspectdatadescription()

Loading dataset description

Dataset description loaded in ./data/source/dataset_desc_en.txt


In [12]:
# Load the saved English description and print it wrapped at 80 columns.
with open('../data/source/dataset_desc_en.txt', 'r') as desc_file:
    en_desc = desc_file.read()
print(*textwrap.wrap(en_desc, width=80), sep='\n')

We provide an Amazon product reviews dataset for multilingual text
classification. The dataset contains reviews in English, Japanese, German,
French, Chinese and Spanish, collected between November 1, 2015 and November 1,
2019. Each record in the dataset contains the review text, the review title, the
star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-
grained product category (e.g. ‘books’, ‘appliances’, etc.) The corpus is
balanced across stars, so each star rating constitutes 20% of the reviews in
each language.  For each language, there are 200,000, 5,000 and 5,000 reviews in
the training, development and test sets respectively. The maximum number of
reviews per reviewer is 20 and the maximum number of reviews per product is 20.
All reviews are truncated after 2,000 characters, and all reviews are at least
20 characters long.  Note that the language of a review does not necessarily
match the language of its marketplace (e.g. reviews from amazon.de are p

In [13]:
# Fetch the feature schema (column names and their dtypes) for the English
# configuration and display it as the cell output.
en_features = loader_en.inspectdatafeatures()
en_features

{'review_id': Value(dtype='string', id=None),
 'product_id': Value(dtype='string', id=None),
 'reviewer_id': Value(dtype='string', id=None),
 'stars': Value(dtype='int32', id=None),
 'review_body': Value(dtype='string', id=None),
 'review_title': Value(dtype='string', id=None),
 'language': Value(dtype='string', id=None),
 'product_category': Value(dtype='string', id=None)}

In [14]:
# Load the 'train' split of the English configuration.
data_en = loader_en.getdata('train')

Found cached dataset amazon_reviews_multi (/home/jds98/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


In [15]:
# Convert the English train split to a pandas DataFrame, then show its
# dimensions and the first rows as a sanity check.
df2 = data_en.to_pandas()
print(df2.shape)
df2.head()

(200000, 8)


Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,en_0964290,product_en_0740675,reviewer_en_0342986,1,Arrived broken. Manufacturer defect. Two of th...,I'll spend twice the amount of time boxing up ...,en,furniture
1,en_0690095,product_en_0440378,reviewer_en_0133349,1,the cabinet dot were all detached from backing...,Not use able,en,home_improvement
2,en_0311558,product_en_0399702,reviewer_en_0152034,1,I received my first order of this product and ...,The product is junk.,en,home
3,en_0044972,product_en_0444063,reviewer_en_0656967,1,This product is a piece of shit. Do not buy. D...,Fucking waste of money,en,wireless
4,en_0784379,product_en_0139353,reviewer_en_0757638,1,went through 3 in one day doesn't fit correct ...,bubble,en,pc


In [16]:
# Persist the full English train split (before any sampling) as CSV, leaving
# out the DataFrame index.
df2.to_csv('../data/source/english.csv', index=False)

### Sampling

In [17]:
def n_sampling(df: pd.DataFrame, nb: int, random_state=1) -> pd.DataFrame:
    """
    Draw a class-balanced random sample: ``nb`` rows for every distinct
    value of the ``stars`` column, returned in shuffled order.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain a ``stars`` column.
    nb : int
        Number of rows to draw, without replacement, for each star rating.
    random_state : int or None, default 1
        Seed used for both the per-rating draw and the final shuffle, so
        repeated calls on the same input return the same rows in the same
        order. Pass ``None`` for a non-deterministic sample.

    Returns
    -------
    pd.DataFrame
        ``nb`` rows per rating, keeping the original index labels. When
        ``df`` has no rated rows, an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a rating has fewer than ``nb`` rows (raised by
        ``DataFrame.sample`` because sampling is without replacement).
    """
    ratings = df.stars.value_counts().index
    if len(ratings) == 0:
        # Nothing to sample from; pd.concat would reject an empty list.
        return df.iloc[0:0].copy()

    # Collect the per-rating samples first and concatenate once, instead of
    # growing a DataFrame inside the loop.
    per_rating = [
        df[df.stars == rating].sample(n=nb, random_state=random_state)
        for rating in ratings
    ]

    # frac=1 returns every row in a new random order, so the ratings end up
    # interleaved rather than grouped.
    return pd.concat(per_rating).sample(frac=1, random_state=random_state)

In [18]:
# Rows to keep per star rating and per language; with five ratings and two
# languages this yields the 20,000-row merged dataset.
SAMPLES_PER_STAR = 2000
# Fixed seed for the final shuffle so the merged row order does not change
# between runs when the two samples are the same.
MERGE_SEED = 1

# NOTE: df1/df2 are rebound to their sampled versions, so re-running this cell
# samples from the already-sampled frames rather than from the full splits.
df1 = n_sampling(df1, SAMPLES_PER_STAR)
df2 = n_sampling(df2, SAMPLES_PER_STAR)

# Mix French and English rows so the file is not ordered by language.
df = shuffle(pd.concat([df1, df2]), random_state=MERGE_SEED)

df.to_csv('../data/source/merged_dataset.csv', index=False)

### Data Understanding and cleaning

In [19]:
def wrangle(filepath):
    """
    Read the CSV at ``filepath`` and return it as a DataFrame without the
    ``review_id``, ``product_id`` and ``reviewer_id`` identifier columns.
    """
    id_columns = ['review_id', 'product_id', 'reviewer_id']
    raw = pd.read_csv(filepath)
    return raw.drop(columns=id_columns)

In [20]:
# Reload the merged sample from disk and strip the ID columns via wrangle().
df = wrangle('../data/source/merged_dataset.csv')

In [21]:
# Confirm the row and column counts after the ID columns were dropped.
df.shape

(20000, 5)

In [22]:
# Preview the first rows of the merged French/English sample.
df.head()

Unnamed: 0,stars,review_body,review_title,language,product_category
0,4,"Works good, had to cut the bottom ring because...",Its nice,en,home_improvement
1,2,As amazing as these charge very quickly. I hav...,As amazing as these charge very quickly,en,wireless
2,5,Parfait pour remplir une piñata et les sachets...,Très bon rapport qualité prix,fr,toy
3,4,Article conforme à la description parfait pour...,Top,fr,apparel
4,4,Used for decor in my wedding & really loved th...,Used for decor in my wedding & really loved th...,en,lawn_and_garden


In [23]:
# List the columns that remain after wrangle().
df.columns

Index(['stars', 'review_body', 'review_title', 'language', 'product_category'], dtype='object')

In [24]:
# Count missing values per column.
df.isnull().sum()

stars               0
review_body         0
review_title        0
language            0
product_category    0
dtype: int64

In [25]:
# Summarize dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   stars             20000 non-null  int64 
 1   review_body       20000 non-null  object
 2   review_title      20000 non-null  object
 3   language          20000 non-null  object
 4   product_category  20000 non-null  object
dtypes: int64(1), object(4)
memory usage: 781.4+ KB


In [27]:
# Count rows that are exact duplicates of an earlier row across all columns.
df.duplicated().sum()

1

In [28]:
# Write the dataset to the cleaned folder, leaving out the DataFrame index.
# NOTE(review): the duplicate row counted in the previous cell is still
# present in this export — confirm whether drop_duplicates() was intended.
df.to_csv('../data/cleaned/dataset.csv', index=False)