In [1]:
import pandas as pd
import sys
import warnings
import nltk
import seaborn as sns
import re
import plotly
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest, SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from skopt import BayesSearchCV

if not sys.warnoptions:
    warnings.simplefilter("ignore")

#Read data from file
path_to_file="./googleplaystore_user_reviews.csv"
data = pd.read_csv(path_to_file, encoding="utf-8")
data.head

<bound method NDFrame.head of                                App  \
0            10 Best Foods for You   
1            10 Best Foods for You   
2            10 Best Foods for You   
3            10 Best Foods for You   
4            10 Best Foods for You   
5            10 Best Foods for You   
6            10 Best Foods for You   
7            10 Best Foods for You   
8            10 Best Foods for You   
9            10 Best Foods for You   
10           10 Best Foods for You   
11           10 Best Foods for You   
12           10 Best Foods for You   
13           10 Best Foods for You   
14           10 Best Foods for You   
15           10 Best Foods for You   
16           10 Best Foods for You   
17           10 Best Foods for You   
18           10 Best Foods for You   
19           10 Best Foods for You   
20           10 Best Foods for You   
21           10 Best Foods for You   
22           10 Best Foods for You   
23           10 Best Foods for You   
24           10 Best

In [2]:
# Neded fields: Translated_Review abd Sentiment
data = data[["Translated_Review", "Sentiment"]]
data.shape

(64295, 2)

In [3]:
# dropping nan's and duplicates
data = data.dropna()
data = data.drop_duplicates()
print(data.shape)

(27994, 2)


In [4]:
# resetting indexes of dataframe rows
data = data.reset_index(drop=True)

In [5]:
# Now we need to remove all symbols from rows like: !@#$%^&*()...
# I will write some function for that using a little of regexp

def tokenize(text):
    text = re.split('\W+', text)
    tokens = []
    for word in text:
        if word.isalpha():
            tokens.append(word.lower())
    return tokens

data["Translated_Review"] = data["Translated_Review"].apply(lambda x: tokenize(x))
data.loc[:10]

Unnamed: 0,Translated_Review,Sentiment
0,"[i, like, eat, delicious, food, that, s, i, m,...",Positive
1,"[this, help, eating, healthy, exercise, regula...",Positive
2,"[works, great, especially, going, grocery, store]",Positive
3,"[best, idea, us]",Positive
4,"[best, way]",Positive
5,[amazing],Positive
6,"[looking, forward, app]",Neutral
7,"[it, helpful, site, it, help, foods, get]",Neutral
8,"[good, you]",Positive
9,"[useful, information, the, amount, spelling, e...",Positive


In [6]:
# After that I will remove all stop words from Translated_Review column
stopwords = nltk.corpus.stopwords.words("english")

def remove_stopwords(token_list):
    return [word for word in token_list if word not in stopwords]

data["Translated_Review"] = data["Translated_Review"].apply(lambda x: remove_stopwords(x))
data.loc[:10]

Unnamed: 0,Translated_Review,Sentiment
0,"[like, eat, delicious, food, cooking, food, ca...",Positive
1,"[help, eating, healthy, exercise, regular, basis]",Positive
2,"[works, great, especially, going, grocery, store]",Positive
3,"[best, idea, us]",Positive
4,"[best, way]",Positive
5,[amazing],Positive
6,"[looking, forward, app]",Neutral
7,"[helpful, site, help, foods, get]",Neutral
8,[good],Positive
9,"[useful, information, amount, spelling, errors...",Positive


In [7]:
# Making lemmatization of Translated_Review column
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize(token_list):
    return [lemmatizer.lemmatize(word) for word in token_list]

data["Translated_Review"] = data["Translated_Review"].apply(lambda x: lemmatize(x))
data.loc[:10]

Unnamed: 0,Translated_Review,Sentiment
0,"[like, eat, delicious, food, cooking, food, ca...",Positive
1,"[help, eating, healthy, exercise, regular, basis]",Positive
2,"[work, great, especially, going, grocery, store]",Positive
3,"[best, idea, u]",Positive
4,"[best, way]",Positive
5,[amazing],Positive
6,"[looking, forward, app]",Neutral
7,"[helpful, site, help, food, get]",Neutral
8,[good],Positive
9,"[useful, information, amount, spelling, error,...",Positive


In [8]:
# I think that the best transformer would be CountVectorizer, so we need to make our reviews string again

def make_string_again(token_list):
    return " ".join(word for word in token_list)

data["Translated_Review"] = data["Translated_Review"].apply(lambda x: make_string_again(x))
data.loc[: 10]

Unnamed: 0,Translated_Review,Sentiment
0,like eat delicious food cooking food case best...,Positive
1,help eating healthy exercise regular basis,Positive
2,work great especially going grocery store,Positive
3,best idea u,Positive
4,best way,Positive
5,amazing,Positive
6,looking forward app,Neutral
7,helpful site help food get,Neutral
8,good,Positive
9,useful information amount spelling error quest...,Positive


In [9]:
# As I said above I will CountVectorizer as a transformator
# I can also make it with TfidfVectorizer to compare it with results of CountVectorizer

vector = CountVectorizer()
x_vect = vector.fit_transform(data["Translated_Review"])

# After fitting data to CountVectorizer let's see what model will have best score out of all
# But before that, we need to split data into train set and test set.
# I wiil use a simple train_test_split method from sklearn.model_selection

X_train, X_test, y_train, y_test = train_test_split(x_vect, data["Sentiment"], train_size=0.8, random_state=0)
models = [LogisticRegression(), MultinomialNB(), LinearSVC(), SVC(kernel="linear"), RandomForestClassifier()]
for model in models:
    model.fit(X_train, y_train)
    print("Score for model: {} is {}".format(model.__class__.__name__, model.score(X_test, y_test)))

Score for model: LogisticRegression is 0.8933738167529917
Score for model: MultinomialNB is 0.7497767458474728
Score for model: LinearSVC is 0.8873013038042508
Score for model: SVC is 0.8962314699053402
Score for model: RandomForestClassifier is 0.8231827111984283


In [10]:
# Not bad results. But let's first of all see the number of features of our reviews
print(X_train.shape, X_test.shape)

(22395, 17557) (5599, 17557)


In [11]:
# Wow. 18.5k features. Thats a big number. 
# Let's try to reduce the number of features using SelectPercentile and test it on models

for model in models:
    selector = SelectPercentile(percentile=5)
    X_train_sel = selector.fit_transform(X_train, y_train)
    X_test_sel = selector.transform(X_test)
    model.fit(X_train_sel, y_train)
    print("Score for model: {} is {}".format(model.__class__.__name__, model.score(X_test_sel, y_test)))

Score for model: LogisticRegression is 0.9021253795320593
Score for model: MultinomialNB is 0.7490623325593856
Score for model: LinearSVC is 0.906590462582604
Score for model: SVC is 0.9076620825147348
Score for model: RandomForestClassifier is 0.8528308626540454


In [12]:
# Now it is better and also it reduces overall time of execution
# Let's try it with TfidfVectorizer

tfidf_vect = TfidfVectorizer()
x_vect = tfidf_vect.fit_transform(data["Translated_Review"])
X_train, X_test, y_train, y_test = train_test_split(x_vect, data["Sentiment"], train_size=0.8, random_state=0)
for model in models:
    selector = SelectPercentile(percentile=5)
    X_train_sel = selector.fit_transform(X_train, y_train)
    X_test_sel = selector.transform(X_test)
    model.fit(X_train_sel, y_train)
    print("Score for model: {} is {}".format(model.__class__.__name__, model.score(X_test_sel, y_test)))

Score for model: LogisticRegression is 0.8622968387212002
Score for model: MultinomialNB is 0.6906590462582604
Score for model: LinearSVC is 0.8962314699053402
Score for model: SVC is 0.8814073941775317
Score for model: RandomForestClassifier is 0.8510448294338274


In [None]:
# TfidfVectorizer a bit worst than CountVectorizer
# Now let fine-tune model using GridSearchCV

param_grid = {'C': np.arange(0.01, 100, 10)}
linearSVC = GridSearchCV(LinearSVC(), param_grid, cv=5, return_train_score=True)
linearSVC.fit(X_train, y_train)
print(linearSVC.best_params_)

bestlinearSVC = linearSVC.best_estimator_
bestlinearSVC.fit(X_train,y_train)
bestlinearSVC.coef_ = bestlinearSVC.named_steps['SVC'].coef_
bestlinearSVC.score(X_train,y_train)