# Load datasets

In [1]:
import pandas as pd

In [2]:
# Read the training split into a DataFrame and preview its first 10 rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index that
# was saved along with the CSV; passing index_col=0 would likely drop it — confirm.
train_set = pd.read_csv('amazon_train.csv')
train_set.head(10)

Unnamed: 0,Score,Summary,Text
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...
5,5,There's a reason for the price,"There's a reason this CD is so expensive, even..."
6,1,Buyer beware,"This is a self-published book, and if you want..."
7,4,"Errors, but great story",I was a dissapointed to see errors on the back...
8,1,The Worst!,A complete waste of time. Typographical errors...
9,1,Oh please,I guess you have to be a romance novel lover f...


In [3]:
train_set.Score.value_counts()

3    600000
5    600000
4    600000
1    600000
2    600000
Name: Score, dtype: int64

In [4]:
len(train_set)

3000000

In [5]:
train_set.tail(10)

Unnamed: 0,Score,Summary,Text
2999990,5,Tyler Hilton is the best!,Tyler Hilton's EP may only have a couple songs...
2999991,1,What A Slap In The Face To Masami Ueda,Do NOT buy this cd. Ever. This was probably ju...
2999992,2,Too simplistic,While Mr. Harrison makes some extremely valid ...
2999993,5,My favorite for 30 years,I had the amazing good fortune to be acquainte...
2999994,3,great features but beware of difficulty cleaning,This high chair has some great features - stor...
2999995,1,Don't do it!!,The high chair looks great when it first comes...
2999996,2,"Looks nice, low functionality",I have used this highchair for 2 kids now and ...
2999997,2,"compact, but hard to clean","We have a small house, and really wanted two o..."
2999998,3,Hard to clean!,I agree with everyone else who says this chair...
2999999,1,what is it saying?,not sure what this book is supposed to be. It ...


In [6]:
# Read the test split into a DataFrame and preview its first 10 rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index that
# was saved along with the CSV; passing index_col=0 would likely drop it — confirm.
test_set = pd.read_csv('amazon_test.csv')
test_set.head(10)

Unnamed: 0,Score,Summary,Text
0,1,mens ultrasheer,"This model may be ok for sedentary types, but ..."
1,4,Surprisingly delightful,This is a fast read filled with unexpected hum...
2,2,"Works, but not as advertised",I bought one of these chargers..the instructio...
3,2,Oh dear,I was excited to find a book ostensibly about ...
4,2,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."
5,2,Incorrect Disc,"I love the style of this, but after a couple y..."
6,2,DVD menu select problems,I cannot scroll through a DVD menu that is set...
7,3,My 2 y/o grandson loves it!!,This movie with all of its animals really keep...
8,5,A Cookbook Every Baker Should Own,I found a copy of this cookbook at a local use...
9,3,good basic,"The book is a basic ""how to"" book for using so..."


In [7]:
test_set.Score.value_counts()

1    130000
4    130000
2    130000
3    130000
5    130000
Name: Score, dtype: int64

In [8]:
len(test_set)

650000

In [9]:
test_set.tail(10)

Unnamed: 0,Score,Summary,Text
649990,3,works fine but no good surprises,I was encouraged to buy a webcam by my stepdau...
649991,4,:),i loved the piercings but i only got 18 out 20...
649992,5,The CD with the only known recording of Aleist...,"""Aleister Crowley - The Great Beast Speaks"" is..."
649993,3,"Terrific Musicians, poor writers and composers",As wonderful as it is to hear these fine music...
649994,5,New Songs Right on Target,I bought this cd for my husband and he loves i...
649995,5,Pretty Cool!,We got it for our mom's birthday. She LOVES it...
649996,5,great cd,"this cd is very good. i especially love ""cats ..."
649997,2,An interesting look into Boston's comedy clubs,This was a good documentary on the history of ...
649998,5,Du vol...pour les cowboys!,Avez-vous déjà vu un CD double et un DVD avec ...
649999,4,"A Companion Read To GUNS, GERMS, AND STEEL",If you like books that offer explanations for ...


# Preprocess texts

In [10]:
from nltk.tokenize import word_tokenize
import re
import spacy
import nltk

In [11]:
# Load the small English spaCy pipeline with the parser and NER components disabled;
# preprocess_text only reads token.lemma_ and token.is_punct from it.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# One-time NLTK resource downloads (presumably needed by word_tokenize and by
# nltk.corpus.stopwords). Uncomment the two lines below on Google Colab or any
# other fresh environment where these resources are not installed yet.
#nltk.download('punkt') #remove hashtag on Google Colab
#nltk.download('stopwords') #remove hashtag on Google Colab

In [12]:
# Matches e-mail addresses and ".com" domains in the *raw* text. They are removed
# before tokenisation/punctuation stripping so that ordinary words which merely
# contain "com" ("welcome", "recommend", "become", ...) are left intact.
_EMAIL_OR_DOMAIN_RE = re.compile(r"\S+@\S+|\S+\.com\b\S*")

# Everything that is not an ASCII letter is stripped from each token.
_NON_LETTER_RE = re.compile(r"[^A-Za-z]")

# Lazily-built, lower-cased stop-word set shared by all preprocess_text calls.
# Re-running this cell resets it, which forces a reload of stopwords_en.txt.
_STOPWORDS_CACHE = None


def _load_stopwords():
    """Return the combined stop-word set (stopwords_en.txt + NLTK English list).

    The set is built on first use and cached, so the file and the NLTK corpus
    are read once instead of once per document.
    """
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        with open('stopwords_en.txt', 'r') as f:
            # strip() also removes '\r' and stray spaces, not only '\n'.
            file_words = [line.strip() for line in f]
        nltk_words = nltk.corpus.stopwords.words('english')
        _STOPWORDS_CACHE = {w.lower() for w in file_words + nltk_words if w}
    return _STOPWORDS_CACHE


def preprocess_text(input_str):
    """Normalise one review into a space-separated string of lemmas.

    Steps:
      1. Remove e-mail addresses and ".com" domains from the raw text.
      2. Tokenise with NLTK and strip every non-letter character from each token,
         discarding tokens that become empty.
      3. Lemmatise the remaining tokens with the module-level spaCy pipeline
         ``nlp``, skipping punctuation tokens.
      4. Lower-case the lemmas and drop stop words (NLTK English list plus the
         entries of ``stopwords_en.txt``) and any lemma containing a space.

    Parameters
    ----------
    input_str : str
        Raw review text. ``None`` or NaN (a missing CSV cell) is treated as
        empty text.

    Returns
    -------
    str
        The cleaned lemmas joined by single spaces; ``''`` if nothing remains.
    """
    # NaN is the only value that is not equal to itself; str inputs never are.
    if input_str is None or input_str != input_str:
        return ''

    text = _EMAIL_OR_DOMAIN_RE.sub(' ', input_str)

    stripped = (_NON_LETTER_RE.sub('', token) for token in word_tokenize(text))
    tokens = [token for token in stripped if token]
    if not tokens:
        return ''

    stopwords = _load_stopwords()
    lemmas = (
        token.lemma_.lower()
        for token in nlp(' '.join(tokens))
        if not token.is_punct
    )
    # Lemmas containing a space (e.g. whitespace tokens) are discarded so the
    # result can be split on single spaces again.
    return ' '.join(
        lemma for lemma in lemmas
        if lemma not in stopwords and ' ' not in lemma
    )

In [13]:
# Number of reviews preprocessed from each split. These are kept small so the
# notebook runs quickly; raise them (or set them to None to use every row) on a
# more powerful computer.
N_TRAIN_SAMPLES = 70
N_TEST_SAMPLES = 30

# Slice the column *before* iterating so only the sampled rows are touched,
# instead of first converting the whole multi-million-row column to a list.
train_texts = [
    preprocess_text(text) for text in train_set['Text'].iloc[:N_TRAIN_SAMPLES]
]
test_texts = [
    preprocess_text(text) for text in test_set['Text'].iloc[:N_TEST_SAMPLES]
]

In [14]:
train_texts

['give dad gag gift direct nunsense reall kick',
 'hope lot people hear cd strong positive vibe great vocal fresh tune crosscultural happiness blue gut pop sound catchy mature',
 'read lot review good game soundtrack figure write review disagree bit opinino yasunori mitsuda ultimate masterpiece music timeless listen year beauty simply refuse fadethe price tag pretty staggering buy cd money feel worth penny',
 'music yasunori misuda question close great nobuo uematsuchrono cross ost wonderful creation fill rich orchestra synthesized sound ambiance music major factor time uplifting vigorous favourite track include scar leave time girl steal stars world',
 'great soundtrack history play game enjoyable work hard soundtrack spend money worth penny ost amazing track dance delight scars leave time buy',
 'reason cd expensive version importsome good music listen track minute day',
 'selfpublished book read paragraph star review write ms haddon family friend imagine read thing spend evening boo

In [15]:
test_texts

['model sedentary type active alot job consistently find stocking roll ankle good solution standard compression stock stock excellent support stay give pair tear struggle pull time good riddancebad investment',
 'fast read fill unexpected humour profound insight art politic policy sly wry wise',
 'buy charger instruction light stay battery charge true instruction dont light turn true hour charging light stay return bad unitthe thing charge light useless stay backup charger manage drain aa charger',
 'excited find book ostensibly muslim feminism volume live expectationsone essay thing describe veil potentially liberate begin explain whyanother muslim woman cape town claim muslim woman separate equal gee whiz disappointmenti expect hope muslim feminist condemnation gender apartheid single booki surprised essay extoll virtue female genital mutilation alyssa lappen',
 'big jvc fan model suspiscious unit return section store buy happy unit send click receiver transition scene smooth pause f

# Vectorise texts

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Build unigram TF-IDF features. The vectoriser is fitted on the training texts
# only; the test texts are then transformed with that already-fitted vectoriser,
# so both matrices share the same columns.
# stop_words='english' additionally applies scikit-learn's built-in English list.
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,1))
x_train_dtm = vect.fit_transform(train_texts)
x_test_dtm = vect.transform(test_texts)

In [18]:
x_train_dtm

<70x1050 sparse matrix of type '<class 'numpy.float64'>'
	with 1657 stored elements in Compressed Sparse Row format>

In [19]:
x_test_dtm

<30x1050 sparse matrix of type '<class 'numpy.float64'>'
	with 370 stored elements in Compressed Sparse Row format>

# Build and evaluate model (Naïve Bayes)

Build model

In [20]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [21]:
# train model using x_train_dtm
%time nb.fit(x_train_dtm, train_set.head(70).Score)

CPU times: user 5.13 ms, sys: 3.2 ms, total: 8.33 ms
Wall time: 12.6 ms


MultinomialNB()

In [22]:
y_pred_class_nb = nb.predict(x_test_dtm)

Evaluate model

In [23]:
from sklearn import metrics
# Accuracy on the sampled test reviews. The true labels are sliced to the number
# of predictions so both sequences always have the same length.
metrics.accuracy_score(test_set.Score.iloc[:len(y_pred_class_nb)], y_pred_class_nb)

0.23333333333333334