In [1]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook, tqdm_pandas

tqdm.pandas()

# Load

In [2]:
df = pd.read_csv('amazon_reviews_sample.csv')
df = df[['score', 'review']]
df.head()

Unnamed: 0,score,review
0,1,Stuning even for the non-gamer: This sound tr...
1,1,The best soundtrack ever to anything.: I'm re...
2,1,Amazing!: This soundtrack is my favorite musi...
3,1,Excellent Soundtrack: I truly like this sound...
4,1,"Remember, Pull Your Jaw Off The Floor After H..."


In [3]:
df['char_count'] = df['review'].progress_apply(lambda row: len(row))
df['word_count'] = df['review'].progress_apply(lambda row: len(row.split()))

100%|██████████| 10000/10000 [00:00<00:00, 667128.56it/s]
100%|██████████| 10000/10000 [00:00<00:00, 161228.85it/s]


In [4]:
df.describe()

Unnamed: 0,score,char_count,word_count
count,10000.0,10000.0,10000.0
mean,0.4903,441.7026,79.5532
std,0.499931,239.243635,43.023095
min,0.0,104.0,14.0
25%,0.0,241.0,43.0
50%,0.0,394.0,71.0
75%,1.0,608.0,109.0
max,1.0,1018.0,212.0


# Preprocessing

In [5]:
df.corr()

Unnamed: 0,score,char_count,word_count
score,1.0,-0.044603,-0.051935
char_count,-0.044603,1.0,0.989204
word_count,-0.051935,0.989204,1.0


In [6]:
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords

nltk.download('stopwords')
english_stopwords = stopwords.words('english')
english_stopwords = set(english_stopwords)

nlp = spacy.load('en_core_web_sm')

def collapse_same_letters(row):
    row = re.sub(r'([a-z])\1{2,}', '\g<1>', row)
    return row

def remove_stop_words(row):
    words = row.split(' ')
    row = ' '.join([word for word in words if word not in english_stopwords])
    return row

def preprocessing(row):
    row = row.lower()
    row = row.replace('\n', ' ')
    row = row.replace('\t', ' ')
    
    row = collapse_same_letters(row)    
    row = remove_stop_words(row)
        
    row = re.sub(r'[^a-z ]', ' ', row)
    row = re.sub(r'[a-z]{35,}', ' ', row)
    row = re.sub(r' {2,}', ' ', row)
    row = row.strip()
    
    return row

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dmitry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
df['preprocessed'] = df['review'].progress_apply(preprocessing)
df['preprocessed'] = df['preprocessed'].progress_apply(lambda row: ' '.join([w.lemma_ for w in nlp(row)]))

df['pp_wc'] = df['preprocessed'].progress_apply(lambda x: len([word for word in x.split(' ') if word is not '']))

df = df[(df['word_count'] > 0)]
df.head()

100%|██████████| 10000/10000 [00:00<00:00, 15591.30it/s]
100%|██████████| 10000/10000 [01:33<00:00, 107.10it/s]
100%|██████████| 10000/10000 [00:00<00:00, 185195.34it/s]


Unnamed: 0,score,review,char_count,word_count,preprocessed,pp_wc
0,1,Stuning even for the non-gamer: This sound tr...,429,80,stun even non gamer sound track beautiful pain...,44
1,1,The best soundtrack ever to anything.: I'm re...,512,97,good soundtrack ever anything I m read lot rev...,54
2,1,Amazing!: This soundtrack is my favorite musi...,763,129,amazing soundtrack favorite music time hand do...,79
3,1,Excellent Soundtrack: I truly like this sound...,746,118,excellent soundtrack truly like soundtrack enj...,77
4,1,"Remember, Pull Your Jaw Off The Floor After H...",484,87,remember pull jaw floor hear it play game know...,51


In [8]:
for i, row in df[:2].iterrows():
    print('review: ', row['review'])
    print('preprocessed review: ', row['preprocessed'])
    print()
    print()

review:   Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^

preprocessed review:  stun even non gamer sound track beautiful paint senery mind well would recomend even people hate vid game music play game chrono cross game ever play good music back away crude keyboarding take fresher step grate guitar soulful orchestra would impress anyone care listen


review:   The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless

# Balance analysis

In [9]:
df.describe()

Unnamed: 0,score,char_count,word_count,pp_wc
count,10000.0,10000.0,10000.0,10000.0
mean,0.4903,441.7026,79.5532,43.4337
std,0.499931,239.243635,43.023095,23.721701
min,0.0,104.0,14.0,4.0
25%,0.0,241.0,43.0,24.0
50%,0.0,394.0,71.0,39.0
75%,1.0,608.0,109.0,59.0
max,1.0,1018.0,212.0,144.0


In [10]:
step = 10
df['wc_group'] = df['pp_wc'].apply(lambda x: int(x / step))
positive_df = df[df['score'] == 1]
negative_df = df[df['score'] == 0]
df.corr()

Unnamed: 0,score,char_count,word_count,pp_wc,wc_group
score,1.0,-0.044603,-0.051935,-0.033837,-0.035627
char_count,-0.044603,1.0,0.989204,0.981901,0.974718
word_count,-0.051935,0.989204,1.0,0.970073,0.963161
pp_wc,-0.033837,0.981901,0.970073,1.0,0.992786
wc_group,-0.035627,0.974718,0.963161,0.992786,1.0


In [11]:
from functools import reduce

positive_stats = positive_df['wc_group'].value_counts(sort=False).to_frame()
negative_stats = negative_df['wc_group'].value_counts(sort=False).to_frame()

stats = [positive_stats, negative_stats]
stats_df = reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), stats)
stats_df.columns = ['positive', 'negative']
stats_df = stats_df.sort_index()

stats_df

Unnamed: 0,positive,negative
0,14,13
1,923,751
2,930,903
3,746,861
4,612,689
5,494,572
6,391,446
7,301,358
8,245,277
9,177,169


In [12]:
dataframes = [positive_df, negative_df]
max_word_count = 150

result_dfs = []
for dataframe in dataframes:
    restricted_df = dataframe[dataframe['pp_wc'] < max_word_count]
    for name, group in restricted_df.groupby('wc_group'):
        if name not in stats_df.index:
            continue
        
        n_samples = stats_df.loc[int(name)].min()
        sampled_df = group.sample(n=n_samples)
        
        result_dfs.append(sampled_df)
        
balanced_df = reduce(lambda left, right: left.append(right), result_dfs)
balanced_df.head()

Unnamed: 0,score,review,char_count,word_count,preprocessed,pp_wc,wc_group
7657,1,I am pleased: Receiving this product was righ...,112,20,please receive product right time opinion book...,9,0
9194,1,Good service: I was pleased with the conditio...,109,20,good service please condition book speed recei...,9,0
7580,1,Not used yet: These are spares for the wristr...,122,23,use yet spare wristrocket buy daughter test se...,9,0
7932,1,excellent ....: Original - fun - breathless ....,119,23,excellent original fun breathless film script ...,9,0
7775,1,Awesomeness: As sad as it is that the series ...,115,23,awesomeness sad series end least awesome movie...,9,0


In [13]:
balanced_df.describe()

Unnamed: 0,score,char_count,word_count,pp_wc,wc_group
count,9364.0,9364.0,9364.0,9364.0,9364.0
mean,0.5,441.420654,79.492311,43.418838,3.892354
std,0.500027,238.62182,42.908931,23.636165,2.371362
min,0.0,104.0,14.0,4.0,0.0
25%,0.0,242.0,44.0,24.0,2.0
50%,0.5,392.0,71.0,38.0,3.0
75%,1.0,607.0,109.0,59.0,5.0
max,1.0,1018.0,207.0,118.0,11.0


In [14]:
balanced_df.corr()

Unnamed: 0,score,char_count,word_count,pp_wc,wc_group
score,1.0,-0.007597,-0.014764,0.001473,4.599764e-15
char_count,-0.007597337,1.0,0.98913,0.982494,0.9752285
word_count,-0.01476438,0.98913,1.0,0.970408,0.9633725
pp_wc,0.001472999,0.982494,0.970408,1.0,0.9927133
wc_group,4.599764e-15,0.975228,0.963372,0.992713,1.0


# Эмбеддинги

In [15]:
import numpy as np
import mmap
embeddings_path = 'fasttext.wiki-news-300d-1M.vec'

def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

embeddings_dict = {}
with open(embeddings_path, 'r', encoding='utf-8') as file:
    for line in tqdm_notebook(file, total=get_num_lines(embeddings_path)):
        values = line.split()
        word = values[0].lower()
        if word in embeddings_dict:
            continue
        
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vector

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


  0%|          | 0/999995 [00:00<?, ?it/s]

## Split dataframes

In [16]:
def split_dataframe(dataframe):
    test = dataframe.sample(n=500)
    train = dataframe.loc[~dataframe.index.isin(test.index)].sample(frac=1)
    
    return (train, test)

def get_train_test_sets(dataframe):
    positive_sentiment_df = dataframe[dataframe['score'] == 1]
    negative_sentiment_df = dataframe[dataframe['score'] == 0]
    
    positive_split = split_dataframe(positive_sentiment_df)
    negative_split = split_dataframe(negative_sentiment_df)

    train = positive_split[0].append(negative_split[0])
    test = positive_split[1].append(negative_split[1])
    return (train, test)

## Naive approach - bag of word representation

In [17]:
def average_vectorizations(row):
    vectors = []
    for word in row.split():
        if word in embeddings_dict:
            vectors.append(embeddings_dict[word])
    
    result_vector = np.mean(vectors, axis=0)
    return result_vector

In [18]:
naive_df = balanced_df.copy()
naive_df['vector'] = naive_df['preprocessed'].progress_apply(average_vectorizations)

100%|██████████| 9364/9364 [00:00<00:00, 15421.20it/s]


In [22]:
train, test = get_train_test_sets(naive_df)

def get_arrayed_data(df_set):
    setX = np.stack(df_set['vector'].values, axis=0)
    setY = np.stack(df_set['score'].values, axis=0)        
    return (setX, setY)

trainX, trainY = get_arrayed_data(train)
testX, testY = get_arrayed_data(test)

# Different models comparison

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import NearestCentroid

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import f1_score

In [21]:
# We can take the whole dataset since we use cross-validation based approach (HalvingGridSearchCV)
trainX, trainY = get_arrayed_data(naive_df)
print(trainX.shape, trainY.shape)

(9364, 300) (9364,)


In [22]:
def perform_search(classifier, params):
    gsh = HalvingGridSearchCV(estimator=classifier, param_grid=params, scoring='f1_weighted', verbose=1)
    gsh.fit(trainX, trainY)
    print(f'Best score: {gsh.best_score_} for config {gsh.best_params_}')
    return gsh

In [29]:
svc_linear = SVC(kernel='linear')
svc_linear_params = {
    'kernel': ['linear'],
    'C': list(np.logspace(-3, 0, 5)) + [1, 2, 5, 10, 15]
}

gsh = perform_search(svc_linear, svc_linear_params)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 1040
max_resources_: 9364
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 10
n_resources: 1040
Fitting 5 folds for each of 10 candidates, totalling 50 fits
----------
iter: 1
n_candidates: 4
n_resources: 3120
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 2
n_candidates: 2
n_resources: 9360
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best score: 0.827887275033086 for config {'C': 2, 'kernel': 'linear'}


In [30]:
svc_rbf = SVC(kernel='rbf')
svc_rbf_params = {
    'kernel': ['rbf'],
    'C': list(np.logspace(-3, 0, 5)) + [1, 2, 5, 10, 20, 50, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 2, 3, 5]
}

svc_rbf_gsh = perform_search(svc_rbf, svc_rbf_params)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 115
max_resources_: 9364
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 84
n_resources: 115
Fitting 5 folds for each of 84 candidates, totalling 420 fits
----------
iter: 1
n_candidates: 28
n_resources: 345
Fitting 5 folds for each of 28 candidates, totalling 140 fits
----------
iter: 2
n_candidates: 10
n_resources: 1035
Fitting 5 folds for each of 10 candidates, totalling 50 fits
----------
iter: 3
n_candidates: 4
n_resources: 3105
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 4
n_candidates: 2
n_resources: 9315
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best score: 0.8450569995182512 for config {'C': 2, 'gamma': 5, 'kernel': 'rbf'}


In [31]:
decision_tree = DecisionTreeClassifier()
decision_tree_params = {
    'criterion': ['gini'],
    'max_depth': [None, 3, 5, 10, 15, 20, 30, 50],
    'min_samples_split': [7, 10, 12, 18, 25, 50],
    'min_samples_leaf': [15, 20, 25, 50, 100]
}

decision_tree_gsh = perform_search(decision_tree, decision_tree_params)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 115
max_resources_: 9364
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 240
n_resources: 115
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
----------
iter: 1
n_candidates: 80
n_resources: 345
Fitting 5 folds for each of 80 candidates, totalling 400 fits
----------
iter: 2
n_candidates: 27
n_resources: 1035
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 3
n_candidates: 9
n_resources: 3105
Fitting 5 folds for each of 9 candidates, totalling 45 fits
----------
iter: 4
n_candidates: 3
n_resources: 9315
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best score: 0.6862362775912602 for config {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 15, 'min_samples_split': 12}


In [32]:
ada_boost = AdaBoostClassifier()
ada_boost_params = {
    'n_estimators': [75, 100, 125, 150, 200],
    'learning_rate': [0.01, 0.03, 0.1, 0.16, 0.215443469, 0.27, 0.5]
}

ada_boost_gsh = perform_search(ada_boost, ada_boost_params)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 346
max_resources_: 9364
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 35
n_resources: 346
Fitting 5 folds for each of 35 candidates, totalling 175 fits
----------
iter: 1
n_candidates: 12
n_resources: 1038
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 4
n_resources: 3114
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 9342
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best score: 0.7989568993838501 for config {'learning_rate': 0.27, 'n_estimators': 200}


In [33]:
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [3, 4, 5, 7, 10, 15, 20, 25],
    'weights': ['distance', 'uniform'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40],
    'p': [1, 2]
}

knn_gsh = perform_search(knn, knn_params)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 38
max_resources_: 9364
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 384
n_resources: 38
Fitting 5 folds for each of 384 candidates, totalling 1920 fits
----------
iter: 1
n_candidates: 128
n_resources: 114
Fitting 5 folds for each of 128 candidates, totalling 640 fits
----------
iter: 2
n_candidates: 43
n_resources: 342
Fitting 5 folds for each of 43 candidates, totalling 215 fits
----------
iter: 3
n_candidates: 15
n_resources: 1026
Fitting 5 folds for each of 15 candidates, totalling 75 fits
----------
iter: 4
n_candidates: 5
n_resources: 3078
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 5
n_candidates: 2
n_resources: 9234
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best score: 0.7377511788164492 for config {'algorithm': 'kd_tree', 'leaf_size': 40, 'n_neighbors': 15, 'p': 1, 'weights': 'distance'}


In [36]:
random_forest = RandomForestClassifier()
random_forest_params = {
    'n_estimators': [100, 150, 200],
    'criterion': ['gini'],
    'max_depth': [3, 5, 10, 20],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [5, 10, 20]
}

random_forest_gsh = perform_search(random_forest, random_forest_params)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 115
max_resources_: 9364
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 108
n_resources: 115
Fitting 5 folds for each of 108 candidates, totalling 540 fits
----------
iter: 1
n_candidates: 36
n_resources: 345
Fitting 5 folds for each of 36 candidates, totalling 180 fits
----------
iter: 2
n_candidates: 12
n_resources: 1035
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 3
n_candidates: 4
n_resources: 3105
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 4
n_candidates: 2
n_resources: 9315
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best score: 0.7903357525775175 for config {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 150}


In [23]:
trainX, trainY = get_arrayed_data(train)
testX, testY = get_arrayed_data(test)

## Sklearn models comparison

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    ]

classifiers = [
    KNeighborsClassifier(algorithm='kd_tree', leaf_size=40, n_neighbors=15, p=1, weights='distance'),
    SVC(kernel="linear", C=2),
    SVC(kernel="rbf", gamma=5, C=2),
    DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=15, min_samples_split=12),
    RandomForestClassifier(criterion='gini', max_depth=20, n_estimators=150, min_samples_leaf=5, min_samples_split=15),
    AdaBoostClassifier(n_estimators=200, learning_rate=0.27)
]

In [27]:
for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(trainX, trainY)
    
    train_pred = clf.predict(trainX)
    f1_train = f1_score(trainY, train_pred, average='weighted')

    test_pred = clf.predict(testX)
    f1_test = f1_score(testY, test_pred, average='weighted')

    print(f'Train F1: {f1_train}, Test F1: {f1_test}')
    print()

Nearest Neighbors
Train F1: 1.0, Test F1: 0.7468154284473381

Linear SVM
Train F1: 0.8428968413252231, Test F1: 0.8349998349998349

RBF SVM
Train F1: 0.9544475799681845, Test F1: 0.8519976319621114

Decision Tree
Train F1: 0.7288412938592259, Test F1: 0.6694168513257387

Random Forest
Train F1: 0.9947393579551174, Test F1: 0.7929948248706219

AdaBoost
Train F1: 0.8317695084839465, Test F1: 0.7949833936548862

