In [21]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import time
from sklearn.model_selection import cross_val_score
import warnings 
warnings.filterwarnings('ignore')

In [22]:
data=pd.read_csv('preprocessed_file.csv')
data.head()

Unnamed: 0,Text,Score,Time
0,witty little book makes son laugh loud recite ...,1,939340800
1,grew reading sendak books watching really rosi...,1,1194739200
2,fun way children learn months year learn poems...,1,1191456000
3,great little book read aloud nice rhythm well ...,1,1076025600
4,book poetry months year goes month cute little...,1,1018396800


In [23]:
data.isnull().any()

Text      True
Score    False
Time     False
dtype: bool

In [24]:
data=data.dropna()

In [25]:
data.isnull().any()

Text     False
Score    False
Time     False
dtype: bool

In [26]:
data.head()

Unnamed: 0,Text,Score,Time
0,witty little book makes son laugh loud recite ...,1,939340800
1,grew reading sendak books watching really rosi...,1,1194739200
2,fun way children learn months year learn poems...,1,1191456000
3,great little book read aloud nice rhythm well ...,1,1076025600
4,book poetry months year goes month cute little...,1,1018396800


In [27]:
data['Time']=pd.to_datetime(data['Time'],unit='s')
data=data.sort_values('Time')
data=data.reset_index(drop=True)
data.head()

Unnamed: 0,Text,Score,Time
0,witty little book makes son laugh loud recite ...,1,1999-10-08
1,remember seeing show aired television years ag...,1,1999-10-25
2,beetlejuice well written movie everything exce...,1,1999-12-02
3,twist rumplestiskin captured film starring mic...,1,1999-12-06
4,beetlejuice excellent funny movie keaton hilar...,1,2000-01-03


In [28]:
data.shape

(363180, 3)

In [29]:
data=data.sample(100000,random_state=42,replace=True)

In [30]:
x=data["Text"]
y=data["Score"]
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.3,random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(70000,) (70000,)
(30000,) (30000,)


In [31]:
y.value_counts()

1    84496
0    15504
Name: Score, dtype: int64

## Applying Bag of Words

In [32]:
count_vect = CountVectorizer().fit(x_train) #in scikit-learn

In [33]:
X_train_bow=count_vect.transform(x_train)
X_test_bow=count_vect.transform(x_test)

In [34]:
print("Features Name:",count_vect.get_feature_names()[:10])

Features Name: ['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaaah', 'aaaaah', 'aaaahhhhhh', 'aaaallll', 'aaaand', 'aaaarrrrghh']


In [35]:
scalar = StandardScaler(with_mean=False)
scalar.fit(X_train_bow)
X_train_stand = scalar.transform(X_train_bow)
X_test_stand = scalar.transform(X_test_bow)
print("The shape of the X_train_vectors is : {}".format(X_train_stand.shape))
print("The shape of the X_test_vectors is : {}".format(X_test_stand.shape))

The shape of the X_train_vectors is : (70000, 48469)
The shape of the X_test_vectors is : (30000, 48469)


In [36]:
start = time.time()
# creating list of C
C_values = np.linspace(0.1,1,10)

cv_scores = [] # empty list that will hold cv scores

# Try each value of alpha in the below loop
for c in C_values:
    # Create an object of the class Logistic Regression with balanced class weights
    clf = LogisticRegression(C = c, class_weight = 'balanced',max_iter=5,solver='saga')
    # perform 5-fold cross validation
    # It returns the cv accuracy for each fold in a list
    scores = cross_val_score(clf,X_train_stand, y_train, cv=5, scoring='accuracy')
    # Store the mean of the accuracies from all the 5 folds
    cv_scores.append(scores.mean())

# calculate misclassification error from accuracy (error = 1 - accuracy)
cv_error = [1 - x for x in cv_scores]

# optimal (best) C is the one for which error is minimum (or accuracy is maximum)
optimal_C = C_values[cv_error.index(min(cv_error))]
print('\nThe optimal alpha is', optimal_C)

end = time.time()
print("Total time in minutes = ", (end-start)/60)


The optimal alpha is 0.6
Total time in minutes =  0.5409922122955322


In [37]:
optimal_c =LogisticRegression(C=optimal_C)
# fitting the model
optimal_c.fit(X_train_stand,y_train)
# predict the response
pred = optimal_c.predict(X_test_stand)

# evaluate accuracy
acc = accuracy_score(y_test, pred) * 100
print('\nThe accuracy of the classifier for k = %d is %f%%' % (optimal_C, acc))


The accuracy of the classifier for k = 0 is 89.543333%


In [38]:
confusion_matrix(y_test,pred)

array([[ 3206,  1445],
       [ 1692, 23657]], dtype=int64)