In [1]:
import contractions
import emoji
import string 
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sn
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud
from google_trans_new import google_translator
from lingua import Language, LanguageDetectorBuilder


# Download the 'vader_lexicon' NLTK resource (presumably required by the
# analyzer constructed below).
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Module-level VADER sentiment analyzer instance.
# NOTE(review): `sid` is not referenced by any other cell visible in this notebook.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JoeTe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JoeTe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JoeTe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\JoeTe\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Language detector used by detect_non_english(); built with every language the
# builder offers. NOTE(review): the recorded non-English output further down
# shows a review reading just "Ok" labelled Language.ZULU, so very short texts
# can be misclassified — consider restricting the candidate languages.
detector = LanguageDetectorBuilder.from_all_languages().build()

# Using generic python library is not very accurate and google translate API is too slow
def detect_non_english(text):
    """Detect the language of a review, falling back from description to title.

    Args:
        text: A two-item row holding the review description at position 0 and
            the review title at position 1 (e.g. a row produced by
            ``DataFrame.apply(..., axis=1)``, or a plain list/tuple).

    Returns:
        Whatever ``detector.detect_language_of`` returns for the description
        when that is not ``None``; otherwise its result for the title; and the
        string ``"Unknown"`` when both detections return ``None``.
    """
    # Access by position explicitly: integer keys on a label-indexed pandas
    # Series are a deprecated positional fallback, whereas ``.iloc`` is always
    # positional. Plain sequences (no ``.iloc``) are indexed directly.
    fields = text.iloc if hasattr(text, "iloc") else text

    # The detector yields a single language (or None) per call.
    description_language = detector.detect_language_of(fields[0])
    if description_language is not None:
        return description_language

    # Description was inconclusive: cross-check with the title.
    title_language = detector.detect_language_of(fields[1])
    if title_language is not None:
        return title_language

    return "Unknown"
    

In [3]:
# Remove emojis 
# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002500-\U00002BEF"
    "\U00002702-\U000027B0"
    # The original class contained the range U+24C2..U+1F251, which spans most
    # of the Basic Multilingual Plane above U+24C2 and therefore deleted
    # ordinary non-emoji text (CJK, Hangul, full-width forms, ...). Only the
    # two endpoints are kept; the BMP emoji that range used to cover by
    # accident are listed explicitly at the end of the class.
    "\U000024C2"
    "\U0001F251"
    "\U0001f926-\U0001f937"
    # Every code point outside the BMP is still stripped, as before.
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"
    "\u3030"
    "\u303d\u3297\u3299"
    "]+",
    re.UNICODE,
)


def remove_emojis(data):
    """Return ``data`` with emoji and pictographic symbols removed.

    Args:
        data: The text to clean.

    Returns:
        A copy of ``data`` in which every run of characters matched by
        ``_EMOJI_PATTERN`` has been deleted. Surrounding whitespace is left
        untouched, so removing an emoji between two spaces leaves both spaces.
    """
    return _EMOJI_PATTERN.sub('', data)

# Words NLTK lists as English stopwords that this pipeline deliberately keeps
# (presumably because they matter for sentiment, e.g. negation).
_KEPT_STOPWORDS = frozenset({'not', 'is', 'but'})

# Lazily-filled cache so the stopword corpus is read once, not once per review.
_stopword_cache = None


def _english_stopwords():
    """Return the memoized English stopword set minus ``_KEPT_STOPWORDS``."""
    global _stopword_cache
    if _stopword_cache is None:
        _stopword_cache = frozenset(stopwords.words('english')) - _KEPT_STOPWORDS
    return _stopword_cache


def remove_stopwords(reviews):
    """Drop English stopwords from ``reviews``, keeping 'not', 'is' and 'but'.

    Args:
        reviews: Text to filter. Matching is exact and case-sensitive against
            the stopword list, and tokens are obtained with ``str.split()``.

    Returns:
        The remaining tokens joined with single spaces.
    """
    blocked = _english_stopwords()
    return ' '.join(word for word in reviews.split() if word not in blocked)

def remove_extra_whitespace(reviews):
    """Collapse every run of whitespace in ``reviews`` to one space and trim the ends."""
    tokens = reviews.split()
    return " ".join(tokens)

def get_wordnet_pos(text):
    """Return one WordNet POS constant per token in ``text``.

    ``text`` is passed to ``nltk.pos_tag``; the first letter of each resulting
    tag selects adjective (J), noun (N), verb (V) or adverb (R). Any other
    first letter falls back to ``wordnet.NOUN``. The output list is in the
    same order as the tagged tokens.
    """
    tagged_tokens = nltk.pos_tag(text)
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return [
        wordnet_by_initial.get(pos_label[0], wordnet.NOUN)
        for _token, pos_label in tagged_tokens
    ]

def lemmaSentence(reviews):
    """Lemmatize every token of ``reviews`` using its part of speech.

    The text is tokenized with ``word_tokenize``, each token is paired with
    the WordNet POS from ``get_wordnet_pos``, and the lemmas are returned as a
    single space-separated string (empty input yields an empty string).

    NOTE(review): the setup cell visible in this notebook downloads 'punkt',
    the POS tagger, stopwords and the VADER lexicon, but no WordNet corpus —
    confirm it is installed before relying on this function.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(reviews)
    pos_tags = get_wordnet_pos(tokens)
    return ' '.join(
        lemmatizer.lemmatize(token, pos) for token, pos in zip(tokens, pos_tags)
    )

def lower_case(review):
    """Return ``review`` converted to lower case."""
    lowered = review.lower()
    return lowered

# Expand contractions, e.g. I'm -> I am, shouldn't -> should not
def change_contractions(review):
    """Expand contractions in ``review`` one whitespace-separated token at a time.

    Each token is passed through ``contractions.fix`` and the results are
    re-joined with single spaces.
    """
    return ' '.join(contractions.fix(token) for token in review.split())

# Remove Punctuations
def remove_punctuations(review):
    """Delete every character listed in ``string.punctuation`` from ``review``.

    Only the ASCII punctuation in ``string.punctuation`` is affected;
    characters outside that set (for example typographic quotes) are kept.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return review.translate(drop_table)

# Remove numbers
def remove_numbers(review):
    """Delete the ASCII digits 0-9 (``string.digits``) from ``review``."""
    # Mapping a code point to None tells str.translate to drop that character.
    digit_table = {ord(digit): None for digit in string.digits}
    return review.translate(digit_table)

In [4]:

def clean_text(data):
    """Run the full text-cleaning pipeline over ``data['reviewDescription']``.

    The steps are applied in this order: lower-casing, contraction expansion,
    emoji removal, punctuation removal, digit removal, stopword removal,
    whitespace normalization and lemmatization. The column is overwritten on
    the frame that was passed in, and that same frame is returned.
    """
    steps = (
        lower_case,
        change_contractions,
        remove_emojis,
        remove_punctuations,
        remove_numbers,
        remove_stopwords,
        remove_extra_whitespace,
        lemmaSentence,
    )
    for step in steps:
        data['reviewDescription'] = data['reviewDescription'].apply(step)

    return data

In [5]:
# Main scraped review dataset.
# NOTE(review): a later cell writes the merged frame back to this same path, so
# on a re-run this file may already contain the appended negative rows.
raw_dataset = pd.read_csv("raw_dataset/raw_books_dataset.csv")

# Additional negative reviews.
raw_neg_dataset = pd.read_csv("raw_dataset/final_books_dataset_duplicates_removed_negative.csv")

# We concat more negative ratings to have a balance of positive and negative ratings.
# NOTE(review): drop_duplicates() compares every column, including 'Unnamed: 0',
# which the recorded output shows as numeric for the first rows and NaN for the
# appended ones — so a review present in both files is not removed here.

data_df = pd.concat([raw_dataset, raw_neg_dataset]).drop_duplicates()
data_df

Unnamed: 0.1,Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
0,0.0,451524934,1.0,Boomer Pornography,39 people found this helpful,Orwell certainly takes an interesting approach...,True,humor_entertainment
1,1.0,451524934,1.0,THIS IS NOT A HARD COVER!!!,One person found this helpful,I was very excited to buy this book because my...,True,humor_entertainment
2,2.0,451524934,1.0,L E S S GOVERNMENT,One person found this helpful,I am trying to digest this horrible book. Its...,True,humor_entertainment
3,3.0,451524934,1.0,"Spoiler free. In summation, awful.",4 people found this helpful,The audiobook was well narrated. That’s about ...,True,humor_entertainment
4,4.0,451526341,1.0,"""Free"" book sold for $3.99",,I purchased several e-books for use in plannin...,True,humor_entertainment
...,...,...,...,...,...,...,...,...
6080,,1250316774,1.0,Just awful.,,"Stupid plot, insipid characters. Nothing about...",True,children
6081,,1250316774,1.0,So much blablabla,,Don’t like the type of book that don’t focus j...,True,children
6082,,1250316774,1.0,"boring, predictable, 2 dimensional characters",,I should have read more reviews before purchas...,False,children
6083,,1250316774,1.0,Cringe,1.0,Characters taken right from a terrible rejecte...,True,children


In [6]:
# Tidy the concatenated frame.
# 'Unnamed: 0' looks like a saved-index artifact: the display above shows it as
# numeric for rows from the main CSV and NaN for the appended negative rows, so
# two copies of the same review never compare equal while it is present. Drop it
# first, then de-duplicate on the real review columns, then renumber the rows.
# errors='ignore' and reset_index(drop=True) keep this cell safe to re-run (the
# previous reset_index() + in-place drop raised on a second execution).
data_df = (
    data_df
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .drop_duplicates()
    .reset_index(drop=True)
)
data_df

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category
0,451524934,1.0,Boomer Pornography,39 people found this helpful,Orwell certainly takes an interesting approach...,True,humor_entertainment
1,451524934,1.0,THIS IS NOT A HARD COVER!!!,One person found this helpful,I was very excited to buy this book because my...,True,humor_entertainment
2,451524934,1.0,L E S S GOVERNMENT,One person found this helpful,I am trying to digest this horrible book. Its...,True,humor_entertainment
3,451524934,1.0,"Spoiler free. In summation, awful.",4 people found this helpful,The audiobook was well narrated. That’s about ...,True,humor_entertainment
4,451526341,1.0,"""Free"" book sold for $3.99",,I purchased several e-books for use in plannin...,True,humor_entertainment
...,...,...,...,...,...,...,...
30409,1250316774,1.0,Just awful.,,"Stupid plot, insipid characters. Nothing about...",True,children
30410,1250316774,1.0,So much blablabla,,Don’t like the type of book that don’t focus j...,True,children
30411,1250316774,1.0,"boring, predictable, 2 dimensional characters",,I should have read more reviews before purchas...,False,children
30412,1250316774,1.0,Cringe,1.0,Characters taken right from a terrible rejecte...,True,children


In [7]:
# NOTE(review): this writes the merged frame (with its index as an extra column,
# since to_csv is called with defaults) back to the same path that the load cell
# reads as its raw input. Every re-run of the notebook therefore starts from an
# already-merged file — consider writing to a separate output path.
data_df.to_csv('raw_dataset/raw_books_dataset.csv')

In [8]:
# Tag every review with a detected language; the actual filtering of
# non-English rows happens in the following cells.
# Both text columns are cast to str first so the detector always receives strings.
# NOTE(review): astype(str) presumably turns missing values into the literal
# text "nan" — confirm that is acceptable input for language detection.
data_df["reviewDescription"] = data_df["reviewDescription"].astype(str)
data_df['reviewTitle'] = data_df['reviewTitle'].astype(str)
# Row-wise apply: each row is passed as (reviewDescription, reviewTitle).
data_df["languages"] = data_df[["reviewDescription", "reviewTitle"]].apply(detect_non_english, axis=1)

In [9]:
# Rows whose detected language is anything other than English; this includes
# rows tagged "Unknown".
non_english_df = data_df.loc[lambda frame: frame["languages"] != Language.ENGLISH]
non_english_df

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category,languages
175,1984806734,1.0,Mal empastado,,Está mal empastado vienen las primeras hojas m...,True,romance,Language.SPANISH
182,1984806734,1.0,Bitte bei Erhalt genau anschauen!!!,2 people found this helpful,Das Buch wurde verschenkt. Jetzt wollte diejen...,True,romance,Language.GERMAN
186,1984806734,1.0,Decepcionada,2 people found this helpful,Así me he sentido tras leerlo. No lo recomenda...,True,romance,Language.SPANISH
189,1984806734,1.0,Delusione. Noioso. Il marketing per questo lib...,,Mi sento presa in giro dalla copertina (così s...,True,romance,Language.ITALIAN
190,1984806734,1.0,Gebraucht - Akzeptabel,,Unter Akzeptabel verstehe ich etwas anderes. D...,True,romance,Language.GERMAN
...,...,...,...,...,...,...,...,...
30323,1250316774,1.0,bla-bla-bla,,Ennuyeux au possible: beaucoup de mots pour re...,True,children,Language.FRENCH
30330,1250316774,1.0,Das Buch fängt gut an.das wars dann aber auch ...,1.0,- Die Charaktere sind flach und stereotyp. Der...,True,children,Language.GERMAN
30363,1250316774,1.0,Ok,1.0,Ok,True,children,Language.ZULU
30372,1250316774,1.0,Paperback é muito ruim,1.0,Além de ser um preço absurdo o paperback é hor...,True,children,Language.PORTUGUESE


In [10]:
# Keep only the English rows: filter once, take an independent copy, and
# persist that same frame to disk.
data_english_df = data_df[data_df["languages"] == Language.ENGLISH].copy()
data_english_df.to_csv('raw_dataset/raw_dataset_english.csv')

In [11]:
from transformers import pipeline
import pytorch_pretrained_bert as ppb
# NOTE(review): this checks for a BERT entry in pytorch_pretrained_bert's archive
# map, but the pipeline below loads a BART model through transformers — the
# assertion looks unrelated to the classifier actually used; confirm it is needed.
assert 'bert-large-cased' in ppb.modeling.PRETRAINED_MODEL_ARCHIVE_MAP
# Zero-shot classifier used in a later cell to auto-label the training reviews.
classifier_pipeline = pipeline ("zero-shot-classification", model = "facebook/bart-large-mnli")
# NOTE(review): IntProgress is not referenced in any cell visible here.
from ipywidgets import IntProgress

In [12]:
from sklearn.model_selection import train_test_split

# Split train and test dataset. Test dataset will be self_annotated
# 10% of the English reviews are held out; random_state is fixed at 42.
train_df, test_df = train_test_split(data_english_df, test_size=0.1, random_state=42)

# Persist the held-out rows (unlabelled at this point) to disk.
test_df.to_csv('train_test_dataset/raw_test.csv')

In [13]:
from tqdm import tqdm
# Candidate classes handed to the zero-shot classifier.
label = ['positive', 'neutral', 'negative']

# Annotation using zero shot classification on training dataset
# Each review is classified in its own pipeline call, and the raw result is
# stored per row. NOTE(review): the recorded progress bar shows about 8.5 hours
# for 26,987 reviews, and nothing in this cell caches the result, so a kernel
# restart means paying that cost again.
tqdm.pandas()
train_df['score_description'] = train_df['reviewDescription'].progress_apply(lambda review: classifier_pipeline(review, label))

100%|██████████| 26987/26987 [8:33:40<00:00,  1.14s/it]   


In [15]:
# Turn each zero-shot result into a {label: score} mapping. The result's
# 'labels' and 'scores' lists are index-aligned, so zipping them pairs every
# label with its score for any number of candidate labels (the previous version
# was hard-coded to exactly three positions).
train_df['dict_score'] = train_df['score_description'].apply(
    lambda score_dict: dict(zip(score_dict['labels'], score_dict['scores']))
)
train_df

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category,languages,score_description,dict_score
356,399587683,1.0,The writing is terrible,,"I'm only a few chapters into this book, but I'...",False,romance,Language.ENGLISH,{'sequence': 'I'm only a few chapters into thi...,"{'negative': 0.8259726762771606, 'neutral': 0...."
23039,1984806734,5.0,Favorite book of 2020 so far,,Much more than a typical romance book. Has a d...,True,romance,Language.ENGLISH,{'sequence': 'Much more than a typical romance...,"{'positive': 0.9888588190078735, 'neutral': 0...."
26580,133821666X,2.0,It reads like fan fiction written by a teenager,1,This book is disappointing. I'm a huge Harry P...,True,children,Language.ENGLISH,{'sequence': 'This book is disappointing. I'm ...,"{'negative': 0.9155025482177734, 'neutral': 0...."
18152,1484707230,5.0,"Really good book, teaches you some greek mytho...",One person found this helpful,This review is by my 9-year-old son Anay.One r...,True,children,Language.ENGLISH,{'sequence': 'This review is by my 9-year-old ...,"{'positive': 0.9080491065979004, 'neutral': 0...."
27230,399255370,1.0,hard pass,1.0,This is not a fun book to read to kids. Its go...,False,children,Language.ENGLISH,{'sequence': 'This is not a fun book to read t...,"{'negative': 0.9974750876426697, 'neutral': 0...."
...,...,...,...,...,...,...,...,...,...,...
30221,425284700,1.0,Questionable content,,*Some spoilers* I get that this book was based...,False,children,Language.ENGLISH,{'sequence': '*Some spoilers* I get that this ...,"{'negative': 0.9227846264839172, 'neutral': 0...."
5475,425284700,1.0,I really wanted to read this book. I downloade...,,I really wanted to read this book. I downloade...,False,children,Language.ENGLISH,{'sequence': 'I really wanted to read this boo...,"{'negative': 0.7753547430038452, 'neutral': 0...."
871,B019MMUA8S,1.0,A book about prioritizating your life,44,Basically tells us to really think about what ...,True,humor_entertainment,Language.ENGLISH,{'sequence': 'Basically tells us to really thi...,"{'negative': 0.9467334747314453, 'neutral': 0...."
16012,670062510,5.0,nice new book,,son reading book in 7th grade and wanted a cop...,True,children,Language.ENGLISH,{'sequence': 'son reading book in 7th grade an...,"{'positive': 0.850021481513977, 'neutral': 0.1..."


In [16]:
def find_best_polarity(data):
    """Map a ``{label: score}`` dict to an integer polarity.

    Args:
        data: Mapping with the keys 'positive', 'neutral' and 'negative'.

    Returns:
        1 if 'positive' holds the highest score, 0 for 'neutral', -1 for
        'negative'. On ties the labels are checked in that order, so the first
        one that equals the maximum wins.
    """
    # Order matters: it is the tie-breaking priority.
    polarity_codes = (("positive", 1), ("neutral", 0), ("negative", -1))

    top_score = max(data[name] for name, _code in polarity_codes)

    for name, code in polarity_codes:
        if data[name] == top_score:
            return code

    
# Label every review with the polarity of its highest-scoring class.
train_df['polarity'] = train_df['dict_score'].apply(find_best_polarity)

# Remove unwanted columns
# NOTE(review): because the drop happens in place, re-running this cell fails —
# 'dict_score' no longer exists the second time.
train_df.drop(columns = ["score_description", "dict_score"], inplace = True)
train_df

Unnamed: 0,productAsin,ratingScore,reviewTitle,reviewReaction,reviewDescription,isVerified,category,languages,polarity
356,399587683,1.0,The writing is terrible,,"I'm only a few chapters into this book, but I'...",False,romance,Language.ENGLISH,-1
23039,1984806734,5.0,Favorite book of 2020 so far,,Much more than a typical romance book. Has a d...,True,romance,Language.ENGLISH,1
26580,133821666X,2.0,It reads like fan fiction written by a teenager,1,This book is disappointing. I'm a huge Harry P...,True,children,Language.ENGLISH,-1
18152,1484707230,5.0,"Really good book, teaches you some greek mytho...",One person found this helpful,This review is by my 9-year-old son Anay.One r...,True,children,Language.ENGLISH,1
27230,399255370,1.0,hard pass,1.0,This is not a fun book to read to kids. Its go...,False,children,Language.ENGLISH,-1
...,...,...,...,...,...,...,...,...,...
30221,425284700,1.0,Questionable content,,*Some spoilers* I get that this book was based...,False,children,Language.ENGLISH,-1
5475,425284700,1.0,I really wanted to read this book. I downloade...,,I really wanted to read this book. I downloade...,False,children,Language.ENGLISH,-1
871,B019MMUA8S,1.0,A book about prioritizating your life,44,Basically tells us to really think about what ...,True,humor_entertainment,Language.ENGLISH,-1
16012,670062510,5.0,nice new book,,son reading book in 7th grade and wanted a cop...,True,children,Language.ENGLISH,1


In [17]:
# Check if pos & neg are balanced
# NOTE(review): per the recorded counts, positive and negative are close
# (13579 vs 13168) but neutral has only 240 rows.
train_df["polarity"].value_counts()

 1    13579
-1    13168
 0      240
Name: polarity, dtype: int64

In [18]:
# Save the machine-annotated training split to disk.
train_df.to_csv("train_test_dataset/train_annotated.csv")