In [1]:
import pandas as pd

In [2]:
# Read the review dataset from the CSV file in the working directory.
csv_path = 'games_600k.csv'
df = pd.read_csv(csv_path)

In [3]:
# Show the loaded frame; as the cell's last expression it is rendered as output.
df

Unnamed: 0,reviewText,sentiment
0,Choose your career which sets your money for t...,pos
1,It took a few hours to get this up and running...,pos
2,I oredered this for a daughter who is now 33 a...,pos
3,Well I thought since this idem didn't have any...,pos
4,I got this to show my kids what I used to play...,pos
...,...,...
599995,Item broke after one charging cycle . when I r...,neg
599996,Terrible I don't recommend,neg
599997,Very poor quality. They do not hold a charge. ...,neg
599998,it broke :( where you plug the USB to charge. ...,neg


In [4]:
# Count missing values per column.
df.isna().sum()

reviewText    178
sentiment       0
dtype: int64

In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [6]:
# Re-count missing values per column after the rows with NaN were dropped.
df.isna().sum()

reviewText    0
sentiment     0
dtype: int64

In [7]:
from IPython.display import Markdown, display

# Render the review stored under index label 259000 as a Markdown block quote.
sample_review = df.loc[259000, 'reviewText']
display(Markdown('> ' + sample_review))

> When I bough this under a recommendation from a friend I've never heard of it. What I got was a fabulous games. The graphics are good and the gameplay is awesome. The replay value is also high due to the karma and it had one of the easiest platinums to get. The story was baffling and I won't spoil it. The powers are the real attraction in the game and the range from your normal lightning bolt to shock grenades and lightning storms.

The only negative points I could add are the lack of diversity among the side missions (the main missions are totally different though) and the lack of variety among the NPCs - There seemed to be about 10-15 kinds who alway roam around.

If you won' really get stuck up on these issues you've got yourself a great game.

In [8]:
# Collect the index labels of reviews that are empty or contain only whitespace.
# The text column is iterated by name: unpacking df.itertuples() positionally
# yields (index, reviewText, sentiment), so testing the third field would
# inspect the sentiment label instead of the review text.
blanks = [
    idx
    for idx, review in df['reviewText'].items()
    if isinstance(review, str) and not review.strip()  # skip NaN / non-string values
]

print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [9]:
# Class balance: number of rows per sentiment label.
sentiment_counts = df['sentiment'].value_counts()
sentiment_counts

neg    299922
pos    299900
Name: sentiment, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the sentiment labels.
X, y = df['reviewText'], df['sentiment']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes pipeline: TF-IDF features fed into a multinomial NB classifier.
text_clf_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Linear SVC pipeline: TF-IDF features fed into a linear support-vector classifier.
text_clf_lsvc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Naïve Bayes

In [12]:
# Fit the Naïve Bayes pipeline (vectorizer + classifier) on the training split.
text_clf_nb.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [14]:
# Predict sentiment labels for the held-out reviews with the Naïve Bayes pipeline.
predictions = text_clf_nb.predict(X_test)

In [16]:
from sklearn import metrics
# Confusion matrix of the current predictions against the held-out labels.
conf_matrix = metrics.confusion_matrix(y_test, predictions)
print(conf_matrix)

[[88397 10616]
 [10024 88905]]


In [17]:
# Per-class precision / recall / F1 for the current predictions.
report = metrics.classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         neg       0.90      0.89      0.90     99013
         pos       0.89      0.90      0.90     98929

    accuracy                           0.90    197942
   macro avg       0.90      0.90      0.90    197942
weighted avg       0.90      0.90      0.90    197942



In [18]:
# Overall fraction of held-out reviews classified correctly.
accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

0.8957270311505391


# Linear SVC

In [15]:
# Fit the Linear SVC pipeline (vectorizer + classifier) on the training split.
text_clf_lsvc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [19]:
# Overwrite `predictions` with the Linear SVC pipeline's labels for the held-out reviews.
predictions = text_clf_lsvc.predict(X_test)

In [20]:
# Confusion matrix of the current predictions against the held-out labels.
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[93782  5231]
 [ 5194 93735]]


In [21]:
# Per-class precision / recall / F1 for the current predictions.
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

         neg       0.95      0.95      0.95     99013
         pos       0.95      0.95      0.95     98929

    accuracy                           0.95    197942
   macro avg       0.95      0.95      0.95    197942
weighted avg       0.95      0.95      0.95    197942



In [22]:
# Overall fraction of held-out reviews classified correctly.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.9473330571581574


In [23]:
# Custom stop-word list. Line breaks inside the brackets continue the
# expression implicitly, so no backslashes are needed.
stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can',
    'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his',
    'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or',
    'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this',
    'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you',
]

In [24]:
# Linear SVC pipeline whose vectorizer is given the custom stop-word list,
# fitted on the training split right away.
text_clf_lsvc2 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LinearSVC()),
])
text_clf_lsvc2.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['a', 'about', 'an', 'and', 'are',
                                             'as', 'at', 'be', 'been', 'but',
                                             'by', 'can', 'even', 'ever', 'for',
                                             'from', 'get', 'had', 'has',
                                             'have', 'he', 'her', 'hers', 'his',
                                             'how', 'i', 'if', 'in', 'into',
                                             'is', ...])),
                ('clf', LinearSVC())])

In [25]:
# Overwrite `predictions` with the stop-word pipeline's labels, then print its confusion matrix.
predictions = text_clf_lsvc2.predict(X_test)
conf_matrix = metrics.confusion_matrix(y_test, predictions)
print(conf_matrix)

[[93659  5354]
 [ 5400 93529]]


In [26]:
# Per-class precision / recall / F1 for the current predictions.
report = metrics.classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         neg       0.95      0.95      0.95     99013
         pos       0.95      0.95      0.95     98929

    accuracy                           0.95    197942
   macro avg       0.95      0.95      0.95    197942
weighted avg       0.95      0.95      0.95    197942



In [27]:
# Overall fraction of held-out reviews classified correctly.
accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

0.9456709541178729


In [28]:
import joblib

In [29]:
# Persist the fitted Linear SVC pipeline (the one built without the custom stop-word list) to disk.
joblib.dump(text_clf_lsvc,'text_clf_lsvc.joblib')

['text_clf_lsvc.joblib']

In [30]:
# Reload the saved pipeline from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
model = joblib.load('text_clf_lsvc.joblib')

In [31]:
# Hand-written review used to try out the reloaded model.
myreview = (
    'What a disappointment this game is after Dying Light 1.'
)

In [35]:
# predict() is given a one-element list wrapping the single review string.
predicted_label = model.predict([myreview])
print(predicted_label)

['neg']
