In [7]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

tqdm.pandas()

  from pandas import Panel


# Load

In [8]:
df = pd.read_csv('amazon_reviews_sample.csv')
df = df[['score', 'review']]
df.head()

Unnamed: 0,score,review
0,1,Stuning even for the non-gamer: This sound tr...
1,1,The best soundtrack ever to anything.: I'm re...
2,1,Amazing!: This soundtrack is my favorite musi...
3,1,Excellent Soundtrack: I truly like this sound...
4,1,"Remember, Pull Your Jaw Off The Floor After H..."


In [9]:
df['char_count'] = df['review'].progress_apply(lambda row: len(row))
df['word_count'] = df['review'].progress_apply(lambda row: len(row.split()))

100%|██████████| 10000/10000 [00:00<00:00, 624765.99it/s]
100%|██████████| 10000/10000 [00:00<00:00, 169467.51it/s]


In [10]:
df.describe()

Unnamed: 0,score,char_count,word_count
count,10000.0,10000.0,10000.0
mean,0.4903,441.7026,79.5532
std,0.499931,239.243635,43.023095
min,0.0,104.0,14.0
25%,0.0,241.0,43.0
50%,0.0,394.0,71.0
75%,1.0,608.0,109.0
max,1.0,1018.0,212.0


# Preprocessing

In [11]:
df.corr()

Unnamed: 0,score,char_count,word_count
score,1.0,-0.044603,-0.051935
char_count,-0.044603,1.0,0.989204
word_count,-0.051935,0.989204,1.0


In [21]:
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')
english_stopwords = stopwords.words('english')
english_stopwords = set(english_stopwords)

nlp = spacy.load('en_core_web_sm')

def collapse_same_letters(row):
    row = re.sub(r'([a-z])\1{2,}', '\g<1>', row)
    return row

def remove_stop_words(row):
    words = row.split(' ')
    row = ' '.join([word for word in words if word not in english_stopwords])
    return row

def preprocessing(row):
    row = row.lower()
    row = row.replace('\n', ' ')
    row = row.replace('\t', ' ')
    
    row = collapse_same_letters(row)    
    row = remove_stop_words(row)
        
    row = re.sub(r'[^a-z ]', ' ', row)
    row = re.sub(r'[a-z]{35,}', ' ', row)
    row = re.sub(r' {2,}', ' ', row)
    row = row.strip()
    
    return row

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dmitry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
df['preprocessed'] = df['review'].progress_apply(preprocessing)
df['preprocessed'] = df['preprocessed'].progress_apply(lambda row: ' '.join([w.lemma_ for w in nlp(row)]))

df['pp_wc'] = df['preprocessed'].progress_apply(lambda x: len([word for word in x.split(' ') if word is not '']))

df = df[(df['word_count'] > 0)]
df.head()

100%|██████████| 10000/10000 [00:00<00:00, 15624.45it/s]
100%|██████████| 10000/10000 [01:36<00:00, 103.68it/s]
100%|██████████| 10000/10000 [00:00<00:00, 178551.68it/s]


Unnamed: 0,score,review,char_count,word_count,preprocessed,pp_wc
0,1,Stuning even for the non-gamer: This sound tr...,429,80,stun even non gamer sound track beautiful pain...,44
1,1,The best soundtrack ever to anything.: I'm re...,512,97,good soundtrack ever anything I m read lot rev...,54
2,1,Amazing!: This soundtrack is my favorite musi...,763,129,amazing soundtrack favorite music time hand do...,79
3,1,Excellent Soundtrack: I truly like this sound...,746,118,excellent soundtrack truly like soundtrack enj...,77
4,1,"Remember, Pull Your Jaw Off The Floor After H...",484,87,remember pull jaw floor hear it play game know...,51


100%|██████████| 10000/10000 [01:36<00:00, 103.26it/s]


In [20]:
for i, row in df[:2].iterrows():
    print('review: ', row['review'])
    print('preprocessed review: ', row['preprocessed'])
    print()
    print()

review:   Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^

preprocessed review:  stun even non gamer sound track beautiful paint senery mind well would recomend even people hate vid game music play game chrono cross game ever play good music back away crude keyboarding take fresher step grate guitar soulful orchestra would impress anyone care listen


review:   The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless

# Balance analysis

In [23]:
df.describe()

Unnamed: 0,score,char_count,word_count,pp_wc
count,10000.0,10000.0,10000.0,10000.0
mean,0.4903,441.7026,79.5532,43.4337
std,0.499931,239.243635,43.023095,23.721701
min,0.0,104.0,14.0,4.0
25%,0.0,241.0,43.0,24.0
50%,0.0,394.0,71.0,39.0
75%,1.0,608.0,109.0,59.0
max,1.0,1018.0,212.0,144.0


In [38]:
step = 10
df['wc_group'] = df['pp_wc'].apply(lambda x: int(x / step))
positive_df = df[df['score'] == 1]
negative_df = df[df['score'] == 0]
df.corr()

Unnamed: 0,score,char_count,word_count,pp_wc,wc_group
score,1.0,-0.044603,-0.051935,-0.033837,-0.035627
char_count,-0.044603,1.0,0.989204,0.981901,0.974718
word_count,-0.051935,0.989204,1.0,0.970073,0.963161
pp_wc,-0.033837,0.981901,0.970073,1.0,0.992786
wc_group,-0.035627,0.974718,0.963161,0.992786,1.0


In [39]:
from functools import reduce

positive_stats = positive_df['wc_group'].value_counts(sort=False).to_frame()
negative_stats = negative_df['wc_group'].value_counts(sort=False).to_frame()

stats = [positive_stats, negative_stats]
stats_df = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), stats)
stats_df.columns = ['positive', 'negative']
stats_df = stats_df.sort_index()

stats_df

Unnamed: 0,positive,negative
0,14,13
1,923,751
2,930,903
3,746,861
4,612,689
5,494,572
6,391,446
7,301,358
8,245,277
9,177,169


In [42]:
dataframes = [positive_df, negative_df]
max_word_count = 150

result_dfs = []
for dataframe in dataframes:
    restricted_df = dataframe[dataframe['pp_wc'] < max_word_count]
    for name, group in restricted_df.groupby('wc_group'):
        if name not in stats_df.index:
            continue
        
        n_samples = stats_df.loc[int(name)].min()
        sampled_df = group.sample(n=n_samples)
        
        result_dfs.append(sampled_df)
        
balanced_df = reduce(lambda left, right: left.append(right), result_dfs)
balanced_df.head()

Unnamed: 0,score,review,char_count,word_count,preprocessed,pp_wc,wc_group
7933,1,A few more chances: If life could be that way...,104,21,chance life could way chance make thing right,8,0
7657,1,I am pleased: Receiving this product was righ...,112,20,please receive product right time opinion book...,9,0
9989,1,classic: i got this for my dad. it is super c...,104,22,classic get dad super creepy worth watch watch...,9,0
4210,1,WOW!!: This product definitely does what is s...,139,27,wow product definitely say do problem thick po...,9,0
771,1,Wonderful classic: I enjoy this book very muc...,120,24,wonderful classic enjoy book much timely class...,9,0


In [43]:
balanced_df.describe()

Unnamed: 0,score,char_count,word_count,pp_wc,wc_group
count,9364.0,9364.0,9364.0,9364.0,9364.0
mean,0.5,441.658372,79.557561,43.416168,3.892354
std,0.500027,238.789478,42.942659,23.644911,2.371362
min,0.0,104.0,14.0,4.0,0.0
25%,0.0,242.0,44.0,24.0,2.0
50%,0.5,393.0,71.0,38.0,3.0
75%,1.0,607.0,109.0,59.0,5.0
max,1.0,1018.0,207.0,118.0,11.0


In [44]:
balanced_df.corr()

Unnamed: 0,score,char_count,word_count,pp_wc,wc_group
score,1.0,-0.008182,-0.016004,0.001414,-8.32008e-18
char_count,-0.008182368,1.0,0.98914,0.982274,0.9750682
word_count,-0.01600374,0.98914,1.0,0.97025,0.9633152
pp_wc,0.001413736,0.982274,0.97025,1.0,0.9927257
wc_group,-8.32008e-18,0.975068,0.963315,0.992726,1.0


# Эмбеддинги

In [46]:
import numpy as np
import mmap
embeddings_path = 'fasttext.wiki-news-300d-1M.vec'

def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

embeddings_dict = {}
with open(embeddings_path, 'r', encoding='utf-8') as file:
    for line in tqdm_notebook(file, total=get_num_lines(embeddings_path)):
        values = line.split()
        word = values[0].lower()
        if word in embeddings_dict:
            continue
        
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vector

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=999995.0), HTML(value='')))




## Split dataframes

In [53]:
def split_dataframe(dataframe):
    test = dataframe.sample(n=500)
    train = dataframe.loc[~dataframe.index.isin(test.index)].sample(frac=1)
    
    return (train, test)

def get_train_test_sets(dataframe):
    positive_sentiment_df = dataframe[dataframe['score'] == 1]
    negative_sentiment_df = dataframe[dataframe['score'] == 0]
    
    positive_split = split_dataframe(positive_sentiment_df)
    negative_split = split_dataframe(negative_sentiment_df)

    train = positive_split[0].append(negative_split[0])
    test = positive_split[1].append(negative_split[1])
    return (train, test)

## Naive approach - bag of word representation

In [47]:
def average_vectorizations(row):
    vectors = []
    for word in row.split():
        if word in embeddings_dict:
            vectors.append(embeddings_dict[word])
    
    result_vector = np.mean(vectors, axis=0)
    return result_vector

In [51]:
naive_df = balanced_df.copy()
naive_df['vector'] = naive_df['preprocessed'].progress_apply(average_vectorizations)

100%|██████████| 9364/9364 [00:00<00:00, 12937.16it/s]


In [57]:
train, test = get_train_test_sets(naive_df)

def get_arrayed_data(df_set):
    setX = np.stack(df_set['vector'].values, axis=0)
    setY = np.stack(df_set['score'].values, axis=0)        
    return (setX, setY)

trainX, trainY = get_arrayed_data(train)
testX, testY = get_arrayed_data(test)

### KNN

In [60]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(trainX, trainY)

KNeighborsClassifier()

In [62]:
train_pred = knn.predict(trainX)
f1_train = f1_score(trainY, train_pred, average='weighted')

test_pred = knn.predict(testX)
f1_test = f1_score(testY, test_pred, average='weighted')

print(f'Train F1: {f1_train}, Test F1: {f1_test}')

Train F1: 0.8279361884048898, Test F1: 0.7449426120877197


## Sklearn models comparison

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA"
    ]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

In [65]:
for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(trainX, trainY)
    
    train_pred = clf.predict(trainX)
    f1_train = f1_score(trainY, train_pred, average='weighted')

    test_pred = clf.predict(testX)
    f1_test = f1_score(testY, test_pred, average='weighted')

    print(f'Train F1: {f1_train}, Test F1: {f1_test}')
    print()
    print()

Nearest Neighbors
Train F1: 0.8602337602437479, Test F1: 0.7339829749103942


Linear SVM
Train F1: 0.7410664640754129, Test F1: 0.7531314519598843


RBF SVM
Train F1: 0.8748205150306803, Test F1: 0.853985398539854


Gaussian Process
Train F1: 0.9179816580487216, Test F1: 0.8509986589879308


Decision Tree
Train F1: 0.7271028832854559, Test F1: 0.6782380314602765


Random Forest
Train F1: 0.7130369863439823, Test F1: 0.6625866686691197


Neural Net
Train F1: 0.821854746426091, Test F1: 0.8269084345618832


AdaBoost
Train F1: 0.7953109533071439, Test F1: 0.7608505315822389


Naive Bayes
Train F1: 0.7533539311889077, Test F1: 0.7725393922693454


QDA
Train F1: 0.9354359616444531, Test F1: 0.8029130846703396


