In [9]:
import pandas as pd
import numpy as np
import fasttext
from sklearn.model_selection import train_test_split

### The follow script constructs a training dataset to fine-tune fast text for product review classifier

In [3]:
DATA = pd.read_csv(r'fashion_train/18.csv')


In [17]:
# filtering
DATA_select = DATA[['reviewText','overall']]
DATA_select

Unnamed: 0,reviewText,overall
0,Just what I needed for a party,1.0
1,Son loves it. Decent quality.,1.0
2,My son wanted to dress like August Pullman fro...,1.0
3,awesome.,1.0
4,My daughter loves this item. My high score is ...,1.0
...,...,...
50372,What I wanted.,1.0
50373,"Husband loves it, as he always dreamed of goin...",1.0
50374,"Ugh. This shirt looked so good online, it look...",-1.0
50375,Very comfortable to wear,1.0


In [19]:
DATA_select.isnull().sum()

reviewText    0
overall       0
dtype: int64

In [27]:
print(f"Positive Review: {DATA_select[DATA_select['overall']==1].count()}\n")
print(f"Neutral Review: {DATA_select[DATA_select['overall']==0].count()}\n")
print(f"Negative Review: {DATA_select[DATA_select['overall']==-1].count()}\n")

Positive Review: reviewText    36411
overall       36411
dtype: int64

Neutral Review: reviewText    4868
overall       4868
dtype: int64

Negative Review: reviewText    9098
overall       9098
dtype: int64



In [29]:
DATA2 = pd.read_csv(r'fashion_train/12.csv')
DATA2

Unnamed: 0.1,Unnamed: 0,overall,reviewTime,reviewText
0,13,-1.0,2012,"I ordered a ladies cigarette case for my wife,..."
1,14,1.0,2010,Received the case in less than a week...was ve...
2,127,-1.0,2012,Don't buy this shirt unless you plan to spend ...
3,128,1.0,2012,"Nice shirt, Well made. good sizing. Nice color..."
4,139,1.0,2011,If you want a look that takes it back to the O...
...,...,...,...,...
16795,792655,1.0,2011,I purchased this hat in anticipation of the ho...
16796,795476,1.0,2010,"These toddler boots were very cute, comfortabl..."
16797,795477,1.0,2010,My daughter loves them. They are for a baby gi...
16798,796077,-1.0,2009,Looks like it came out of a vending machine. C...


In [31]:
DATA2_select = DATA2[['reviewText','overall']]
DATA2_select

Unnamed: 0,reviewText,overall
0,"I ordered a ladies cigarette case for my wife,...",-1.0
1,Received the case in less than a week...was ve...,1.0
2,Don't buy this shirt unless you plan to spend ...,-1.0
3,"Nice shirt, Well made. good sizing. Nice color...",1.0
4,If you want a look that takes it back to the O...,1.0
...,...,...
16795,I purchased this hat in anticipation of the ho...,1.0
16796,"These toddler boots were very cute, comfortabl...",1.0
16797,My daughter loves them. They are for a baby gi...,1.0
16798,Looks like it came out of a vending machine. C...,-1.0


In [33]:
DATA2_select.isnull().sum()

reviewText    0
overall       0
dtype: int64

In [47]:
DATA3 = pd.read_csv(r'fashion_train/13_17.csv')
DATA3_select = DATA3[['reviewText','overall']]

In [49]:
DATA3_select.isnull().sum()

reviewText    1
overall       0
dtype: int64

In [51]:
DATA3_select.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DATA3_select.dropna(inplace=True)


In [53]:
DATA3_select.isnull().sum()

reviewText    0
overall       0
dtype: int64

In [55]:
DATA_ALL = pd.concat([DATA_select, DATA2_select, DATA3_select])

#### Construct the Training an Testing set
- We want a training set =  180000 and testing set = 60000 comments

In [75]:
total = 240000

positive_reviews = DATA_ALL[DATA_ALL['overall'] == 1]
neutral_reviews = DATA_ALL[DATA_ALL['overall'] == 0]
negative_reviews = DATA_ALL[DATA_ALL['overall'] == -1]



train_per_class = round(total*0.8/3) 
test_per_class = round(total*0.2/3)
print(f"training instances per class: {train_per_class}, and testing instances per class{test_per_class}")


# Training set
positive_train = positive_reviews.sample(n=train_per_class, random_state=42)
neutral_train = neutral_reviews.sample(n=train_per_class, random_state=42)
negative_train = negative_reviews.sample(n=train_per_class, random_state=42)


training_set = pd.concat([positive_train, neutral_train, negative_train])


# Testing set
positive_remaining = positive_reviews.drop(positive_train.index)
neutral_remaining = neutral_reviews.drop(neutral_train.index)
negative_remaining = negative_reviews.drop(negative_train.index)

positive_test = positive_remaining.sample(n=test_per_class, random_state=42)
neutral_test = neutral_remaining.sample(n=test_per_class, random_state=42)
negative_test = negative_remaining.sample(n=test_per_class, random_state=42)

testing_set = pd.concat([positive_test, neutral_test, negative_test])

# Shuffle!!
training_set = training_set.sample(frac=1, random_state=42).reset_index(drop=True)
testing_set = testing_set.sample(frac=1, random_state=42).reset_index(drop=True)


training instances per class: 64000, and testing instances per class16000


In [77]:
testing_set

Unnamed: 0,reviewText,overall
0,Terrible. The wig doesn't even fit my daughter...,-1.0
1,This is not a youth or big boys size!,-1.0
2,flames come off. we have had 2 pairs & it has ...,-1.0
3,Great bag!! Larger than I thought but I will j...,1.0
4,The glasses fit pretty tight so they don't fal...,1.0
...,...,...
47995,Precious little purse. It is as shown.,1.0
47996,"Not even the right shirt, and crappy quality t...",-1.0
47997,If there was a no star rating I would use that...,-1.0
47998,Love it! Material its soft and it looks so sexy!,1.0


In [99]:
# clean the texts in both sets

def preprocess_text(text, lemmatizer, stop_words):
    """
    This function does the following:
    1. Lowercasing
    2. Removing punctuation and numbers
    3. Tokenization
    4. Removing stopwords
    5. Lemmatization
    6. Removing extra whitespace
    """
    

    text = text.lower()
    
    text = re.sub(r'[\d]', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    
    tokens = nltk.word_tokenize(text)
    
    tokens = [word for word in tokens if word not in stop_words]
    
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    cleaned = ' '.join(tokens)
    
    return cleaned



In [79]:
import nltk
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [93]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


# Remove the words that are key to sentiment analysis (mostly the negative contractions)
negatives = {'not', 'no', 'never', 'none', 'nor', 'neither', 'cannot', "don't", "didn't", "won't", "can't", "aren't", "couldn't", "doesn't", "hasn't", "shoudn't", "haven't", "shan't" }
negatives2 = {
    "against", "ain't", "aren't", "couldn't", "didn't", "doesn't", "don't", 
    "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "needn't", 
    "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't", 
    "how", "just", "more", "most", "only", "other", "same", "such", 
    "too", "very", "what", "when", "where", "why"
}
negatives3 =  {
    "ain", "aren", "couldn", "didn", "doesn", "don", "hadn", "hasn", "haven", 
    "isn", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", 
    "won", "wouldn"
}

stop_words = stop_words - negatives - negatives2 - negatives3

In [95]:
def clean_review(row):
 
    review = row['reviewText']
    clean_text = preprocess_text(review, lemmatizer, stop_words)
    row['Text'] = clean_text
    return row[['overall', 'Text']]

In [101]:
clean_train = training_set.apply(clean_review, axis=1)
clean_test = testing_set.apply(clean_review, axis=1)

In [103]:
clean_test

Unnamed: 0,overall,Text
0,-1.0,terrible wig doesnt even fit daughter american...
1,-1.0,not youth big boy size
2,-1.0,flame come pair happened one pair within day b...
3,1.0,great bag larger thought just fill
4,1.0,glass fit pretty tight dont fall lean very lig...
...,...,...
47995,1.0,precious little purse shown
47996,-1.0,not even right shirt crappy quality boot
47997,-1.0,no star rating would use shirt seemed good was...
47998,1.0,love material soft look sexy


In [109]:
clean_train.to_csv(r"model_train_data/clean_train.csv", index=False)
clean_test.to_csv(r"model_train_data/clean_test.csv", index=False)

### Fine-tuning the Fasttext classifier

In [117]:
import fasttext
import os

In [121]:
def DF_ftz(dataframe, label_tag, text_tag, train):
    """
    Format the texts for FastText model.
    """
    if train:
        label = "Train"
    else:
        label = "Test"
    formatted_file = r"model_train_data/" + label + "_review_data.txt"
    with open(formatted_file, "w", encoding="utf-8") as f:
        for _, row in dataframe.iterrows():
            label = f"__label__{row[label_tag]}"
            text = row[text_tag]
            f.write(f"{label} {text}\n")
    return formatted_file
    

In [601]:
def train(corpus):
    """
    Train and fine-tune FastText model
    """
    # Train model on training data
    model = fasttext.train_supervised(input=corpus,
                                      epoch=50,
                                      lr=0.003,
                                      wordNgrams=2,
                                      bucket=200000,
                                      dim=50,
                                      loss="softmax")

    model.save_model("clothes_review_sentiment.ftz")
    return model


def eval(model, corpus):
    """
    Evaluate the trained FastText model.
    """
    result = model.test(corpus)
    
    print(f"Test Samples: {result[0]}")
    print(f"Precision: {result[1]}")
    print(f"Recall: {result[2]}")


In [603]:
#train_corpus = DF_ftz(clean_train, "overall", "Text", True)
zero_shot_model = train(train_corpus)

In [604]:
#test_corpus = DF_ftz(clean_test, "overall", "Text", False)
eval(zero_shot_model, test_corpus)

Test Samples: 48000
Precision: 0.7359375
Recall: 0.7359375
