In [114]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
df = pd.read_csv("scraping.csv")

In [115]:
df.head()

Unnamed: 0,Comment,Rating,r
0,From last 5 years my younger brother was using...,5,1
1,excellent phone camera is very nice and the st...,4,1
2,I have been using the earlier versions of iPho...,4,1
3,IMPORTANT NOTICEIf you buy some apple device o...,5,1
4,"Well, what can I say... iPhone is awesome as e...",5,1


In [116]:
df.shape

(827, 3)

In [117]:
df['r'].value_counts()

1    774
0     53
Name: r, dtype: int64

In [118]:
from sklearn.model_selection import train_test_split
X_train_data,x_test_data,Y_train_data,y_test_data = train_test_split(df["Comment"],df["r"],test_size=0.2)


# Data Transformation

In [119]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words="english")
tfidf_vector.fit(X_train_data)
X_train_data_new=tfidf_vector.transform(X_train_data)
x_test_data_new=tfidf_vector.transform(x_test_data)


In [120]:
print(x_test_data_new.shape)
print(X_train_data_new.shape)

(166, 2166)
(661, 2166)


# Smote

In [121]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_data_new.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))


print("Before OverSampling, counts of label '1': {}".format(sum(Y_train_data == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(Y_train_data == 0)))
  
# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='minority',random_state = 2)
X_train_res, y_train_res = sm.fit_sample(X_train_data_new, Y_train_data.ravel())
  
print('After OverSampling, the shape of train_X: {}'.format(X_train_data_new.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

print('\n')
print(X_train_data_new.shape)
print("res :-",X_train_res.shape)
print("res :-",y_train_res.shape)
print('\n')
print(x_test_data_new.shape)
print(y_test_data.shape)

After OverSampling, the shape of train_X: (661, 2166)
After OverSampling, the shape of train_y: (1240,) 

Before OverSampling, counts of label '1': 617
Before OverSampling, counts of label '0': 44 

After OverSampling, the shape of train_X: (661, 2166)
After OverSampling, the shape of train_y: (1234,) 

After OverSampling, counts of label '1': 617
After OverSampling, counts of label '0': 617


(661, 2166)
res :- (1234, 2166)
res :- (1234,)


(166, 2166)
(166,)


# Model

# Logistic Regression

In [122]:
predictions = dict()
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()

lr_model.fit(X_train_res,y_train_res)
predictions["LogisticRegression"] = lr_model.predict(x_test_data_new)
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
accuracy_score(y_test_data,predictions["LogisticRegression"])


0.9337349397590361

In [123]:
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

# Wow our scores are getting even high scores even when applying cross validation.
from sklearn.model_selection import cross_val_score

for key, classifier in classifiers.items():
    print(key,'\n',classifier)
    classifier.fit(X_train_res,y_train_res)
    training_score = cross_val_score(classifier, X_train_data_new, Y_train_data, cv=5)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")

LogisiticRegression 
 LogisticRegression()
Classifiers:  LogisticRegression Has a training score of 93.0 % accuracy score
KNearest 
 KNeighborsClassifier()
Classifiers:  KNeighborsClassifier Has a training score of 92.0 % accuracy score
Support Vector Classifier 
 SVC()
Classifiers:  SVC Has a training score of 95.0 % accuracy score
DecisionTreeClassifier 
 DecisionTreeClassifier()
Classifiers:  DecisionTreeClassifier Has a training score of 95.0 % accuracy score


In [124]:
# Using Grid SearchCV to find the best parameters.

# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV


# Logistic Regression 
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params,cv=5)
grid_log_reg.fit(X_train_res,y_train_res)
# We automatically get the logistic regression with the best parameters.
log_reg = grid_log_reg.best_estimator_
log_reg_param = grid_log_reg.param_grid


# KNN Algorithm
knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params,cv=5)
grid_knears.fit(X_train_res,y_train_res)
# KNears best estimator
knears_neighbors = grid_knears.best_estimator_
knears_neighbors_param_grid = grid_knears.param_grid


# Support Vector Classifier
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params,cv=5)
grid_svc.fit(X_train_res,y_train_res)
svc = grid_svc.best_estimator_
svc_param_grid = grid_svc.param_grid


# DecisionTree Classifier
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), 
              "min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params,cv=5)
grid_tree.fit(X_train_res,y_train_res)
tree_clf = grid_tree.best_estimator_
tree_clf_param_grid = grid_tree.param_grid

X_train_res, y_train_res

(<1234x2166 sparse matrix of type '<class 'numpy.float64'>'
 	with 31969 stored elements in Compressed Sparse Row format>,
 array([1, 1, 1, ..., 0, 0, 0], dtype=int64))

In [125]:
# Overfitting Case

log_reg_score = cross_val_score(log_reg, X_train_res, y_train_res, cv=5)
print('Logistic Regression Cross Validation Score: ', round(log_reg_score.mean() * 100, 2).astype(str) + '%')


knears_score = cross_val_score(knears_neighbors, X_train_res, y_train_res, cv=5)
print('Knears Neighbors Cross Validation Score', round(knears_score.mean() * 100, 2).astype(str) + '%')

svc_score = cross_val_score(svc, X_train_res, y_train_res, cv=5)
print('Support Vector Classifier Cross Validation Score', round(svc_score.mean() * 100, 2).astype(str) + '%')

tree_score = cross_val_score(tree_clf, X_train_res, y_train_res, cv=5)
print('DecisionTree Classifier Cross Validation Score', round(tree_score.mean() * 100, 2).astype(str) + '%')

Logistic Regression Cross Validation Score:  99.27%
Knears Neighbors Cross Validation Score 75.53%
Support Vector Classifier Cross Validation Score 99.68%
DecisionTree Classifier Cross Validation Score 87.84%


In [126]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict

log_reg_pred = cross_val_predict(log_reg, X_train_res, y_train_res, cv=5, method="decision_function")

knears_pred = cross_val_predict(knears_neighbors,X_train_res, y_train_res, cv=5)

svc_pred = cross_val_predict(svc, X_train_res, y_train_res,cv=5,method="decision_function")

tree_pred = cross_val_predict(tree_clf,X_train_res, y_train_res,cv=5)

# Logistic Regression fitted using SMOTE technique
y_pred_log_reg = log_reg.predict(x_test_data_new)

# Other models fitted with UnderSampling
y_pred_knear = knears_neighbors.predict(x_test_data_new)
y_pred_svc = svc.predict(x_test_data_new)
y_pred_tree = tree_clf.predict(x_test_data_new)

In [127]:
log_reg_cf = confusion_matrix(y_test_data, y_pred_log_reg)
kneighbors_cf = confusion_matrix(y_test_data, y_pred_knear)
svc_cf = confusion_matrix(y_test_data, y_pred_svc)
tree_cf = confusion_matrix(y_test_data, y_pred_tree)

In [130]:
from sklearn.metrics import classification_report


print('Logistic Regression:')
print(classification_report(y_test_data, y_pred_log_reg))

print('KNears Neighbors:')
print(classification_report(y_test_data, y_pred_knear))

print('Support Vector Classifier:')
print(classification_report(y_test_data, y_pred_svc))

print('Support Vector Classifier:')
print(classification_report(y_test_data, y_pred_tree))

Logistic Regression:
              precision    recall  f1-score   support

           0       0.50      0.56      0.53         9
           1       0.97      0.97      0.97       157

    accuracy                           0.95       166
   macro avg       0.74      0.76      0.75       166
weighted avg       0.95      0.95      0.95       166

KNears Neighbors:
              precision    recall  f1-score   support

           0       0.14      1.00      0.24         9
           1       1.00      0.64      0.78       157

    accuracy                           0.66       166
   macro avg       0.57      0.82      0.51       166
weighted avg       0.95      0.66      0.75       166

Support Vector Classifier:
              precision    recall  f1-score   support

           0       0.44      0.44      0.44         9
           1       0.97      0.97      0.97       157

    accuracy                           0.94       166
   macro avg       0.71      0.71      0.71       166
weighted

#### Logistic Regression is better accuracy , precision , recall comared with other algorithm