In [30]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import svm

from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from datetime import datetime
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler 
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN


In [2]:
# Load the review dump into `data`.
# The parser options are collected first so they can be read (and tweaked)
# independently of the call itself.
_read_csv_options = dict(
    encoding='utf-8',
    engine='python',
    memory_map=True,
    error_bad_lines=False,  # skip malformed rows instead of raising
    na_values='NaN',
)
data = pd.read_csv('rewrite_video_game_.csv', **_read_csv_options)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [3]:
# Drop every row that contains at least one missing value.
# The result is bound back to `data`, so the unfiltered frame is no longer reachable.
data=data.dropna()

In [None]:
# Down-sample to at most 100,000 reviews.
# * `n` is capped at the number of available rows: DataFrame.sample raises
#   ValueError when asked for more rows than exist (without replacement), and the
#   recorded output of a later cell shows only 99,994 rows in the frame.
# * `random_state` pins the draw so that a re-run selects the same reviews.
data = data.sample(n=min(100000, len(data)), random_state=100)

In [None]:
# Binary sentiment label derived from the star rating:
# strictly more than 3 stars -> 'Positive', everything else -> 'Negative'.
data['pos_neg'] = data.star_rating.map(
    lambda rating: 'Positive' if rating > 3 else 'Negative'
)

In [None]:
# Keep only the four columns used by the rest of the notebook.
data=data[['review_body','review_headline','star_rating','pos_neg']]

In [None]:
# Persist the reduced frame.
# index=False: the row index is not data; writing it would add an unnamed extra
# column that shows up as `Unnamed: 0` when the file is read back.
# NOTE(review): this is the same path the notebook loads from at the top, so the
# source file is overwritten in place - confirm that this is intended.
data.to_csv('rewrite_video_game_.csv', encoding='utf-8', index=False)

In [26]:
# Features: the review texts as a NumPy array.
X=data['review_body'].values
# Target: the star rating column (kept as a pandas Series).
y=data['star_rating']

In [33]:
# Switch the target to the binary 'Positive'/'Negative' label.
# This rebinds `y`, replacing whatever target was assigned before.
y=data['pos_neg']

In [15]:
# Sanity check: display the shapes of X and y side by side.
X.shape, y.shape

((99994,), (99994,))

In [17]:
# TF-IDF features: learn the vocabulary from the texts in X and replace X with
# the resulting document-term matrix.
# NOTE(review): this overwrites the raw-text X, so the cell cannot simply be run
# twice, and any later cell that expects raw strings in X (e.g. a Pipeline that
# starts with CountVectorizer, or balance_classes followed by a vectorizer) needs
# X to be re-created first - confirm the intended execution order.
vect=TfidfVectorizer()
X = vect.fit_transform(X)
print(X.shape)

(99994, 82300)


In [18]:
# Perform SMOTE: oversample (X, y) into (X_res, y_res) and report the class
# counts before and after, plus the elapsed wall-clock time.
# The stored output below was produced with the five star ratings as `y`.
t1 = datetime.now()
print('Original dataset shape {}'.format(Counter(y)))
# random_state fixes the resampling so repeated runs give the same result.
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_sample(X, y) 
print('Resampled dataset shape {}'.format(Counter(y_res)))
print(datetime.now() - t1)

Original dataset shape Counter({5: 57501, 4: 17684, 1: 10703, 3: 8743, 2: 5363})
Resampled dataset shape Counter({1: 57501, 2: 57501, 3: 57501, 4: 57501, 5: 57501})
0:10:35.437773


In [19]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Two candidate estimators; only `nbclf` is evaluated in this cell
# (`clf` is created but not used here).
clf = svm.SVC(kernel='linear', C=1)
nbclf=MultinomialNB(alpha=1)

# 10 random 70/30 splits with a fixed seed; the cell displays the array of
# per-split scores returned by cross_val_score.
# NOTE(review): X must already be a numeric feature matrix for MultinomialNB - confirm.
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
cross_val_score(nbclf, X, y, cv=cv)
                                                   

array([0.57535251, 0.57845262, 0.57648588, 0.57755259, 0.57581919,
       0.5760192 , 0.57571919, 0.58055269, 0.57028568, 0.57348578])

In [20]:
from sklearn.cross_validation import cross_val_predict

from sklearn import cross_validation
from sklearn.model_selection import cross_val_score

In [34]:
from sklearn.pipeline import Pipeline

# Stand-alone estimators; the pipeline below builds its own MultinomialNB and
# does not use either of these.
nbclf = MultinomialNB(alpha=1)
svmclf = svm.SVC(C=1.0, kernel='linear')

# Pipeline steps: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# `score` holds the cross-validated (10-fold) prediction for every sample,
# which is then compared against the true labels.
score = cross_validation.cross_val_predict(text_clf, X, y, cv=10)
print(classification_report(y, score))

             precision    recall  f1-score   support

   Negative       0.98      0.07      0.13     24809
   Positive       0.76      1.00      0.87     75185

avg / total       0.82      0.77      0.68     99994



In [None]:
# Display the training document-term matrix.
# NOTE(review): `train_dtm` is only assigned in cells further down the notebook,
# so this fails on a fresh top-to-bottom run - confirm the intended cell order.
train_dtm

In [None]:
# SMOTE: oversample (train_dtm, y) into (X_res, y_res) and report the class
# counts before and after, plus the elapsed wall-clock time.
# NOTE(review): `train_dtm` and `y` must have the same number of rows; confirm
# that `train_dtm` was built from the full data set and not from a train split.
t1 = datetime.now()
print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=100)
X_res, y_res = sm.fit_sample(train_dtm, y) 
print('Resampled dataset shape {}'.format(Counter(y_res)))
print(datetime.now() - t1)

In [None]:
from sklearn.model_selection import train_test_split

# Split FIRST, oversample AFTER, and only the training part.
# Splitting the already-oversampled (X_res, y_res) puts synthetic samples that
# were interpolated from test rows into the training set (and vice versa), which
# inflates every score computed on X_test. Here the held-out rows are untouched
# originals with their natural class distribution.
X_train, X_test, y_train, y_test = train_test_split(train_dtm, y, test_size=0.3, random_state=100)

# Balance the training portion only, with the same seed used elsewhere in the notebook.
X_train, y_train = SMOTE(random_state=100).fit_sample(X_train, y_train)

In [None]:
# Display the held-out feature set.
X_test

In [None]:
# Display the training feature set.
X_train

In [None]:

from sklearn.metrics import confusion_matrix, classification_report

# LogisticRegression with default settings, timed from construction to report.
t1 = datetime.now()
logreg = LogisticRegression()
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_class = logreg.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test,y_pred_class))
print(datetime.now() - t1)

In [None]:
# Logistic Regression CV: same evaluation as the plain LogisticRegression cell,
# but using LogisticRegressionCV with 10 internal cross-validation folds.
# Note that this rebinds `logreg`.
from sklearn.linear_model import LogisticRegressionCV
t1 = datetime.now()

logreg = LogisticRegressionCV(cv=10)
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)

# Per-class precision/recall/F1 on the held-out split, then the elapsed time.
print(classification_report(y_test,y_pred_class))
print(datetime.now() - t1)

In [None]:
from sklearn.svm import LinearSVC

# LinearSVC with default settings, timed from construction to report.
t1 = datetime.now()
lin_clf = LinearSVC()
lin_clf.fit(X_train, y_train)
# fit() returns the estimator itself, so `clf` is simply a second name for it.
clf = lin_clf
y_pred_class = clf.predict(X_test)

print(classification_report(y_test,y_pred_class))
print(datetime.now() - t1)

In [None]:
# CountVectorizer: learn the vocabulary on X_train and build document-term
# matrices for the train and test texts.
# NOTE(review): this requires X_train / X_test to hold raw text at this point - confirm.
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

# Bare expression in the middle of a cell: evaluated but not displayed.
train_dtm

# Split the resampled data 70/30 with a fixed seed.
# NOTE(review): X_res / y_res used here are whatever an earlier cell left behind;
# they are only re-assigned by the under-sampling step further down in this cell.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res , test_size=0.3, random_state=100)

# RandomUnderSampler: resample (train_dtm, y) into (X_res, y_res) and report the
# class counts before and after, plus the elapsed wall-clock time.
from imblearn.under_sampling import RandomUnderSampler

t1 = datetime.now()
print('Original dataset shape {}'.format(Counter(y)))
rus = RandomUnderSampler(random_state=100)
# NOTE(review): train_dtm was built from X_train while `y` is the full label
# vector - verify that both have the same number of rows.
X_res, y_res = rus.fit_sample(train_dtm, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))
print(datetime.now() - t1)

In [None]:
from collections import Counter

def balance_classes(xs, ys):
    """Down-sample every class to the size of the rarest class.

    Walks ``xs`` and ``ys`` in parallel and keeps, for each distinct label,
    only the first ``k`` samples encountered, where ``k`` is the number of
    occurrences of the least frequent label in ``ys``. The relative order of
    the kept samples is the input order.

    Args:
        xs: Samples, iterable in the same order as ``ys``.
        ys: Labels, one per sample. Labels must be hashable.

    Returns:
        A ``(new_xs, new_ys)`` tuple of lists holding the kept samples and
        their labels. Both lists are empty when ``ys`` is empty.
    """
    freqs = Counter(ys)
    if not freqs:
        # Nothing to balance; avoids indexing into an empty frequency table.
        return [], []

    # The size of the least common class is the cap applied to every class.
    max_allowable = min(freqs.values())
    num_added = dict.fromkeys(freqs, 0)
    new_xs = []
    new_ys = []
    # Iterate in parallel instead of indexing xs[i]: positional iteration also
    # works for containers whose [] operator is label-based (e.g. a pandas
    # Series with a non-default index).
    for x, y in zip(xs, ys):
        if num_added[y] < max_allowable:
            new_xs.append(x)
            new_ys.append(y)
            num_added[y] += 1
    return new_xs, new_ys

In [None]:
# split the new DataFrame into training and testing sets

In [None]:
# Class counts before balancing.
print(Counter(y))
# Down-sample every class to the size of the rarest one.
# NOTE(review): the vectorizer cells that follow expect raw text, so X must
# still hold the review strings here (not a TF-IDF matrix) - confirm.
balanced_x, balanced_y = balance_classes(X, y)
# Class counts after balancing.
print(Counter(balanced_y))

In [None]:
# Display the balanced samples (note: this echoes the whole list).
balanced_x

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split of the balanced samples and labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(balanced_x, balanced_y, test_size=0.3, random_state=100)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training texts only...
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)

# ...and the test texts are mapped onto that same vocabulary.
test_dtm = vect.transform(X_test)


In [None]:
# Display the held-out samples.
X_test

In [None]:
# Display the test document-term matrix.
test_dtm

In [None]:
# Display the training document-term matrix.
train_dtm

In [None]:
from sklearn.svm import LinearSVC

# LinearSVC on the count features.
lin_clf = LinearSVC()
lin_clf.fit(train_dtm, y_train)
# Predict with the model fitted in this cell. The previous code called
# `logreg.predict`, i.e. it reported the scores of a different estimator
# (or failed with NameError when `logreg` had not been created yet).
y_pred_class = lin_clf.predict(test_dtm)
print(classification_report(y_test,y_pred_class))

In [None]:
# LogisticRegression on the count features. C is the inverse regularisation
# strength, so C=1e9 leaves the model almost unregularised.
# fit() returns the estimator itself, so it can be bound directly.
logreg = LogisticRegression(C=1e9).fit(train_dtm, y_train)
y_pred_class = logreg.predict(test_dtm)
print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Confusion matrix for the most recent predictions in `y_pred_class`.
print(confusion_matrix(y_test,y_pred_class ))
# Blank separator lines between the two reports.
print('\n')
# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred_class))

In [None]:
# Multinomial Naive Bayes on the count features: train, predict the label of
# every held-out review, and print the per-class report.
nb = MultinomialNB()
# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_class = nb.fit(train_dtm, y_train).predict(test_dtm)
print(classification_report(y_test,y_pred_class))

In [None]:
# Review texts (NumPy array) and star ratings (pandas Series) taken from `data`.
# NOTE(review): no later cell in this notebook reads `texts` or `stars`; the
# following cells work on balanced_x / balanced_y instead - confirm these are needed.
texts=data['review_body'].values
stars=data['star_rating']

In [None]:
#balance_class

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime

In [None]:
from sklearn.model_selection import train_test_split

# This vectorizer breaks text into single words and bi-grams
# and then calculates the TF-IDF representation.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
t1 = datetime.now()

# The 'fit' builds up the vocabulary from all the balanced reviews,
# while the 'transform' step turns each individual text into
# a row of the resulting matrix.
vectors = vectorizer.fit_transform(balanced_x)
print(datetime.now() - t1)

# Derive the train/test split from the TF-IDF features. Without this step
# `vectors` was never used, and the classifier cells below fitted on whatever
# X_train / X_test an earlier section had left behind (raw text or
# CountVectorizer-era data) instead of the features built here.
# NOTE(review): vocabulary and IDF weights are learned from all rows before the
# split; fit the vectorizer on the training texts only if that matters.
X_train, X_test, y_train, y_test = train_test_split(
    vectors, balanced_y, test_size=0.3, random_state=100)

In [None]:
# Inspect the training features that the classifier below will be fitted on.
print(X_train)

In [None]:

from sklearn.svm import LinearSVC
# initialise the SVM classifier
classifier = LinearSVC()
 
# train the classifier and print how long the fit took
# NOTE(review): X_train must be a numeric feature matrix here (raw text cannot
# be fitted directly) - confirm it comes from the TF-IDF features.
t1 = datetime.now()
classifier.fit(X_train, y_train)
print(datetime.now() - t1)

In [None]:
# Predict the held-out samples and eyeball the first ten predictions
# against the first ten true labels.
preds = classifier.predict(X_test)
print(list(preds[:10]))
print(y_test[:10])


In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the `classifier` predictions stored in `preds`.
print(classification_report(y_test, preds))
 
