In [1]:
import re
import string
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.base import BaseEstimator, TransformerMixin

import spacy
import spacy.attrs
nlp = spacy.load("en_core_web_sm")


Load datasets:

In [12]:
dataset_train = pd.read_csv("../data/book_review_labelled_data.csv")
dataset_test = pd.read_csv("../data/book_review_test_data_unlabelled.csv")


## 1. Filtering


In [13]:
dataset_train = dataset_train.dropna()
dataset_train.head()

Unnamed: 0,reviewerID,reviewerName,reviewText,overall,summary,reviewTime,rates_count,helpful_count,rating
0,A3UPFTGAWZ3G2R,David J. Loftus,"Jenkins, a history professor and Member of Par...",4,"Quite readable, nicely done","12 6, 2001",40,37,4
1,A1XTKTLNSCRLDS,Ellen Rappaport,Detective Inspector Erlendur Sveinsson is at h...,5,Mesmerizing in depth,"02 23, 2014",0,0,5
2,A1A77B6DQQH436,"crescamp ""esc""",I didn't read this. I purchased it for a gift...,3,10-minute life lessons for kids,"02 12, 2013",3,0,3
3,AEAF4MRYHJZI,"Angelia Menchan ""acvermen.blogspot.com""",Fierce Angels by Sheri Park reads like a disse...,4,So FIERCE,"03 24, 2010",9,9,4
4,A3B7KU72LGWFER,"Grifel ""Tea Time""",Clearly this author had two goals in mind: 1) ...,1,Drivel!,"06 21, 2003",19,13,1


As noticed before, we can exclude reviews that were not rated as helpful:

## 2. Text Processing


In [14]:
dataset_filtered = dataset_train[dataset_train.rates_count != 0]



### Text preprocessing


In [15]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, tokenizer, regex_list, lower=True, remove_punct=True, stopwords=[]):
        self.tokenizer = tokenizer
        self.stopwords = stopwords
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        X = list(map(self._clean_sentence, X))
        return X
    
    def _clean_sentence(self, sentence):
        
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)
        
        if self.stopwords:
            words = list(filter(lambda x: x not in self.stopwords, words))
            
        # Remove punctuation
        if self.remove_punct:
            words = list(filter(lambda x: x not in string.punctuation, words))

        # Join list elements into string
        sentence = " ".join(words)
        
        return sentence
    
    def fit(self, *_):
        return self


In [16]:
tokenizer = WordPunctTokenizer()
regex_list = [("<[^>]*>")]

def process_dataset_text(df, tokenizer, regex_list, stopwords):
    cleaner = TextCleanerTransformer(tokenizer, regex_list, stopwords)
    
    df_new = df.copy()
    df_new['summaryPreprocessed'] = cleaner.transform(df_new.summary.values)
    df_new['reviewTextPreprocessed'] = cleaner.transform(df_new.reviewText.values)
    return df_new


In [17]:
dataset_filtered = process_dataset_text(
    dataset_filtered, 
    tokenizer, 
    regex_list=[], 
    stopwords=stopwords.words('english')
)


In [18]:
test_dataset_filtered = process_dataset_text(
    dataset_test, 
    tokenizer, 
    regex_list=[], 
    stopwords=stopwords.words('english')
)


### Adding more text features


In [19]:
### Adding some extra features
count_words = lambda doc: len(WordPunctTokenizer().tokenize(doc))
count_punctuation = lambda doc: len([word for word in WordPunctTokenizer().tokenize(doc) if word in string.punctuation])

def add_text_features(df):
    df_new = df.copy()
    df_new['len_review'] = df_new['reviewText'].map(lambda x: len(x))
    df_new['len_summary'] = df_new['summary'].map(lambda x: len(x))
    df_new['word_count_summary'] = df_new['summary'].map(lambda x: count_words(x))
    df_new['word_count_review'] = df_new['reviewText'].map(lambda x: count_words(x))
    df_new['punctuation_count_summary'] = df_new['summary'].map(lambda x: count_punctuation(x))
    df_new['punctuation_count_review'] = df_new['reviewText'].map(lambda x: count_punctuation(x))
    return df_new



In [20]:
dataset_filtered = add_text_features(
    dataset_filtered, 
)


In [21]:
test_dataset_filtered = add_text_features(
    test_dataset_filtered, 
)


### Spacy

In [31]:
def get_adj_count(doc):
    nlp_doc = nlp(doc)
    counts_dict = nlp_doc.count_by(spacy.attrs.IDS['POS'])
    return counts_dict[84] if 84 in counts_dict else 0  # 84 corresponds to 'ADJ'

def add_spacy_features(df):
    df_new = df.copy()
    df_new['review_ADJ_count'] = df_new['reviewText'].map(lambda x: get_adj_count(x))
    #df_new['summary_ADJ_count'] = df_new['summary'].map(lambda x: get_adj_count(x))
    return df_new



In [32]:
dataset_filtered = add_spacy_features(
    dataset_filtered, 
)


In [33]:
test_dataset_filtered = add_spacy_features(
    test_dataset_filtered, 
)


In [34]:
dataset_filtered.head()

Unnamed: 0,reviewerID,reviewerName,reviewText,overall,summary,reviewTime,rates_count,helpful_count,rating,summaryPreprocessed,reviewTextPreprocessed,len_review,len_summary,word_count_summary,word_count_review,punctuation_count_summary,punctuation_count_review,review_ADJ_count
0,A3UPFTGAWZ3G2R,David J. Loftus,"Jenkins, a history professor and Member of Par...",4,"Quite readable, nicely done","12 6, 2001",40,37,4,quite readable nicely done,jenkins a history professor and member of parl...,1790,27,5,363,1,69,28
2,A1A77B6DQQH436,"crescamp ""esc""",I didn't read this. I purchased it for a gift...,3,10-minute life lessons for kids,"02 12, 2013",3,0,3,10 minute life lessons for kids,i didn t read this i purchased it for a gift f...,117,31,7,28,1,2,2
3,AEAF4MRYHJZI,"Angelia Menchan ""acvermen.blogspot.com""",Fierce Angels by Sheri Park reads like a disse...,4,So FIERCE,"03 24, 2010",9,9,4,so fierce,fierce angels by sheri park reads like a disse...,1557,9,2,319,0,37,27
4,A3B7KU72LGWFER,"Grifel ""Tea Time""",Clearly this author had two goals in mind: 1) ...,1,Drivel!,"06 21, 2003",19,13,1,drivel,clearly this author had two goals in mind 1 to...,584,7,2,127,1,17,8
5,A3JD07VHDLT5FF,"isala ""Isabel and Lars""",This is a collection of stories and memories b...,5,Compelling stories by ordinary people,"03 19, 2005",7,5,5,compelling stories by ordinary people,this is a collection of stories and memories b...,1531,37,5,300,0,33,19


In [35]:
test_dataset_filtered.head()

Unnamed: 0,reviewerID,reviewerName,reviewText,summary,reviewTime,rating,summaryPreprocessed,reviewTextPreprocessed,len_review,len_summary,word_count_summary,word_count_review,punctuation_count_summary,punctuation_count_review,review_ADJ_count
0,A2HESNQJZ9OB7H,Jen,So boring and stupid had a hard time finishing...,Unbelievable.,"02 16, 2014",1,unbelievable,so boring and stupid had a hard time finishing...,634,13,2,141,1,17,15
1,A1ABXPSFA9PC8N,Ben Parker,Ill be the first to admit i'm not the best coo...,Easy and Clear Cooking,"11 7, 2012",5,easy and clear cooking,ill be the first to admit i m not the best coo...,435,22,4,91,0,8,10
2,AYVW3O6W8S5S4,Johnny in Texas,Doesn't tell you how to do anything... just s...,not bad,"02 25, 2014",3,not bad,doesn t tell you how to do anything ... just s...,203,7,2,44,0,3,3
3,A17GOTFSUAWN17,Tyson,Whenever I met my last two girlfriends in pers...,"Short book, but good primer on how to text girls","03 15, 2014",4,short book but good primer on how to text girls,whenever i met my last two girlfriends in pers...,1334,48,11,283,1,32,15
4,A2VO8K861AV83R,"Avatheps ""Avatheps""",I read the reviews and decided to take a chanc...,Very disappointing. I could not finish it.,"12 30, 2013",2,very disappointing i could not finish it,i read the reviews and decided to take a chanc...,1024,42,9,221,2,23,16


Saving datasets:

In [38]:
dataset_filtered.to_csv("train_set_preprocessed.csv", index=False)


In [39]:
test_dataset_filtered.to_csv("test_set_preprocessed.csv", index=False)
