In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import pickle 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
import re
import nltk
from nltk.corpus import stopwords

In [5]:
#Loading the dataset 

In [7]:
review_data = pd.read_csv(r'/Users/nandu/Desktop/ds/archive/Reviews.csv')

In [7]:
#Checking the first 5 datasets

In [8]:
review_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [9]:
review_data['Text'][0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [10]:
#Checking the size of the dataset

In [11]:
review_data.shape

(568454, 10)

In [12]:
#Since we are going to classify the data as negative or positive we will remove the neutral score i.e. 3

In [13]:
filtered_data = review_data[review_data['Score'] != 3]

In [14]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [15]:
#Since we have to partition the output as negative or positive we convert the values 4, 5 as positive and 1,2 as negative

In [16]:
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

In [17]:
#We apply the above function on our dataset

In [18]:
filtered_data['Score'] = filtered_data['Score'].map(partition)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Score'] = filtered_data['Score'].map(partition)


In [19]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [20]:
#We sort the data based on ProductID

In [21]:
sorted_data = filtered_data.sort_values('ProductId')

In [22]:
#We remove the duplicate values 

In [23]:
final_data=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"})
final_data.shape

(364173, 10)

In [24]:
#We check whether our dataset is balanced or impalanced that is whether we have similar amount of positive and negative reviews 

In [25]:
final_data['Score'].value_counts()

positive    307063
negative     57110
Name: Score, dtype: int64

In [26]:
#Since our dataset is extremely inbalanced, it can affect the performance of the model so we make it balanced

In [27]:
#Since we want to balance the dataset we will take 10000 of positive and 10000 of negative values

In [28]:
a = final_data[final_data['Score'] == 'positive']

In [29]:
data = a.iloc[0:10000]

In [30]:
data.shape

(10000, 10)

In [31]:
b = final_data[final_data['Score'] == 'negative']

In [32]:
data1 = b.iloc[0:10000]

In [33]:
result = pd.concat([data, data1])

In [34]:
result.shape

(20000, 10)

In [35]:
result.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
150523,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...
150505,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc..."
150506,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...
150507,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...
150508,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...


In [36]:
#We now create Bag of Words for the Text column

In [37]:
bow_vect = CountVectorizer()
bow = bow_vect.fit_transform(result['Text'].values)

In [38]:
bow.get_shape()

(20000, 30956)

In [39]:
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(result['Text'].values)

In [40]:
tf_idf.shape

(20000, 30956)

In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/nandu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
stop = set(stopwords.words('english'))

In [43]:
print(stop)

{'own', 'my', 'themselves', 'do', 'our', 'not', "needn't", 'which', 'him', "aren't", 'the', 'few', 'same', 'hasn', 'theirs', 'while', 'than', 'now', 'these', 'all', 'shouldn', 'into', 'had', 'being', 'nor', 'no', 'should', 'mustn', 've', 'wasn', "you've", 'm', 'yourself', 'during', 'here', 'mightn', 'to', 'further', 'where', "weren't", 'then', 're', 'won', "wouldn't", 'does', 'shan', 'with', 'some', 'any', 'his', 'over', 'very', 'o', 'couldn', 'has', 'can', 'myself', "you'll", 'by', 'once', 'out', "hasn't", "haven't", 'be', 'only', "didn't", 'what', 'yourselves', 'their', 'an', 'doing', 'how', "she's", 'didn', 'you', 'under', 'there', 'ain', 'herself', 'at', 'hers', 'because', 'me', 'against', 'through', 'again', "doesn't", 'were', 'weren', 'she', 'those', 'from', 'most', 'just', "you're", 'ours', 'll', 'so', 'or', 'them', 'each', "mustn't", 'they', 'whom', 'until', "should've", "mightn't", 'and', "wasn't", 'up', 'aren', 'haven', 'as', 'did', "shouldn't", 'will', 'd', "isn't", 't', "th

In [44]:
#Implement stemming 

In [45]:
sno = nltk.stem.SnowballStemmer('english') 

In [46]:
#The function below is used to clean HTML tages from the sentences
#What it does is checks for the tags using regular expression and then replaces the tags with a space 

In [47]:
def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')    
    cleantext = re.sub(cleanr, ' ', sentence) 
    return cleantext

In [48]:
#To check whether the above function works correct

In [49]:
cleanhtml("hello<br /br>World")

'hello World'

In [50]:
#Function to remove punctuation and special characters from sentence

In [51]:
def cleanpunc(sentence):
    cleaned = re.sub(r'[?|@|!|^|%|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

In [52]:
#Trying out the function on a random example

In [53]:
cleanpunc("H?e^l@l/o #W(o!r%l.d,)")

'Hell o W orl d  '

In [54]:
#Now we apply the above two functions to pre process the dataset and also apply stemming

In [55]:
'''
final_string - List of cleaned sentences
'''

'\nfinal_string - List of cleaned sentences\n'

In [56]:
def preprocessing(series):
    i = 0
    str1=" "
    final_string = []    
    list_of_sent = []    
    
    all_positive_words=[] 
    all_negative_words=[] 
    
    
    for sent in series.values:
        filtered_sent = []
        sent = cleanhtml(sent)    
        sent = cleanpunc(sent)    
        
        for cleaned_words in sent.split():
            #Only consider non-numeric words with length at least 3
            if((cleaned_words.isalpha()) and (len(cleaned_words) > 2)):
                #Only consider words which are not stopwords and convert them to lowet case
                if(cleaned_words.lower() not in stop):
                    #Apply snowball stemmer and add them to the filtered_sent list
                    s = (sno.stem(cleaned_words.lower()))
                    filtered_sent.append(s)    #This contains all the cleaned words for a sentence
                    if (result['Score'].values)[i] == 'positive':
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(result['Score'].values)[i] == 'negative':
                        all_negative_words.append(s) #list of all words used to describe negative reviews
        #Below list is a list of lists used as input to W2V model later
        list_of_sent.append(filtered_sent)
        #Join back all the words belonging to the same sentence
        str1 = " ".join(filtered_sent)
        #Finally add the cleaned sentence in the below list
        final_string.append(str1)
        i += 1
    return final_string, list_of_sent

In [57]:
final_string, list_of_sent = preprocessing(result['Text'])

In [58]:
#We add a new column called cleaned text in the dataset which is obtained after removing all the stopwords, irrelevant words from the sentence

In [59]:
result['CleanedText']=final_string

In [60]:
result.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
150523,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...
150505,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read sendak book watch realli rosi movi i...
150506,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn month year learn poem t...
150507,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,great littl book read nice rhythm well good re...
150508,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,book poetri month year goe month cute littl po...


In [61]:
#Bag of Words on the cleaned text

In [62]:
bow_vect = CountVectorizer()
bow = bow_vect.fit_transform(result['CleanedText'].values)
bow.shape

(20000, 19298)

In [63]:
#TF-IDF on the cleaned text

In [64]:
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(result['CleanedText'].values)
tf_idf.shape

(20000, 19298)

In [65]:
#We have to shuffle the dataset before training the model

In [66]:
from sklearn.utils import shuffle

In [67]:
result = shuffle(result, random_state=42)
result.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
446252,446253,B0000DG8AR,AUZSFK6UCP8SN,AdoboRepublic,0,0,negative,1336780800,pot is broken,I bought this beautiful plant for Mother's Day...,bought beauti plant mother day arriv right tim...
412880,412881,B0000CDAY2,A1R6DKN9YQ2YU1,"Mercedes L. Johnmeyer ""The Most Happy""",1,1,positive,1242432000,Just Like Home...,Being born and bred in Southeastern Connecticu...,born bred southeastern connecticut know lobste...
559259,559260,B0001CXUF4,A17Y465B1033Y0,H. Ellerton,4,4,positive,1247443200,Excellent cocoa flavor,I love cocoa powder and I use it in many bakin...,love cocoa powder use mani bake item love flav...
86144,86145,B000084E6V,A28588OPKE5KO7,tulsadoberman,0,0,positive,1322697600,My Doberman loves them & helps keep her teeth ...,I have a 4.5 year old 70 LBS. Doberman Pinsche...,year old lbs doberman pinscher absolut love de...
446565,446566,B0007WKDTO,A1SWDJHLR2X8WA,Nicole Pavlik,0,1,negative,1350259200,Not butterfinger bbs,This product came up when I searched for butte...,product came search butterfing bbs order dissa...


In [68]:
bow_vect = CountVectorizer()
bow = bow_vect.fit_transform(result['CleanedText'].values)
bow.shape

(20000, 19298)

In [69]:
tf_idf_vect = TfidfVectorizer()
tf_idf = tf_idf_vect.fit_transform(result['CleanedText'].values)
tf_idf.shape

(20000, 19298)

In [70]:
#Since we are classifying the dataset we set all the positive values as 1 and negative values as o

In [71]:
def polarity(x):
    if x=='positive':
        return 1
    return 0

In [72]:
#We apply the above function on our dataset

In [73]:
result['Score']=result['Score'].map(polarity)

In [74]:
result.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
446252,446253,B0000DG8AR,AUZSFK6UCP8SN,AdoboRepublic,0,0,0,1336780800,pot is broken,I bought this beautiful plant for Mother's Day...,bought beauti plant mother day arriv right tim...
412880,412881,B0000CDAY2,A1R6DKN9YQ2YU1,"Mercedes L. Johnmeyer ""The Most Happy""",1,1,1,1242432000,Just Like Home...,Being born and bred in Southeastern Connecticu...,born bred southeastern connecticut know lobste...
559259,559260,B0001CXUF4,A17Y465B1033Y0,H. Ellerton,4,4,1,1247443200,Excellent cocoa flavor,I love cocoa powder and I use it in many bakin...,love cocoa powder use mani bake item love flav...
86144,86145,B000084E6V,A28588OPKE5KO7,tulsadoberman,0,0,1,1322697600,My Doberman loves them & helps keep her teeth ...,I have a 4.5 year old 70 LBS. Doberman Pinsche...,year old lbs doberman pinscher absolut love de...
446565,446566,B0007WKDTO,A1SWDJHLR2X8WA,Nicole Pavlik,0,1,0,1350259200,Not butterfinger bbs,This product came up when I searched for butte...,product came search butterfing bbs order dissa...


In [75]:
#We convert the Bag of Words into array

In [76]:
X_bow=bow.toarray()

In [77]:
#We need the score column from our dataset as it is the output column 

In [78]:
df = result[['Time','Score']]
df.head(20)

Unnamed: 0,Time,Score
446252,1336780800,0
412880,1242432000,1
559259,1247443200,1
86144,1322697600,1
446565,1350259200,0
126108,1350086400,0
530808,1235952000,0
438096,1158192000,1
458702,1276560000,0
477406,1285286400,1


In [79]:
df=df.reset_index(drop=True)
df.head()

Unnamed: 0,Time,Score
0,1336780800,0
1,1242432000,1
2,1247443200,1
3,1322697600,1
4,1350259200,0


In [80]:
#We create a dataframe of the Bag of Words output 

In [81]:
df1=pd.DataFrame(X_bow)
df1.head(10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19288,19289,19290,19291,19292,19293,19294,19295,19296,19297
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
#We concatenate the Bag of Words output with the dataframe which has Time and Score column so we have output column in this dataframe

In [83]:
new_df=pd.concat([df,df1], axis = 1)

In [84]:
new_df.shape

(20000, 19300)

In [85]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score

In [86]:
x=new_df.iloc[:,2:].values #removing score and time columns


In [87]:
y=new_df.iloc[:,1].values #just selecting Score& Time was just needed to sort

In [88]:
s = StandardScaler()
x=s.fit_transform(x)

In [89]:
#We use train test split to split the data into training and testing, wherein 67% of data is for training and 33% data is for testing

In [90]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [91]:
from sklearn.model_selection import cross_val_score

In [92]:
#We find the optimum number of clusters 

In [128]:
# creating odd list of K for KNN
neighbors = list(range(1,20,2))
# empty list that will hold cv scores
cv_scores = []

# perform 10-fold cross validation
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

# changing to misclassification error
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print('\nThe optimal number of neighbors is %d.' % optimal_k)

KeyboardInterrupt: 

In [129]:
#We observed that the optimal number of clusters is 5

In [94]:
#We train the KNN model

In [95]:
classifier = KNeighborsClassifier(n_neighbors = 5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [96]:
#Get the predictions on test data to check the accuracy of prediction

In [97]:
y_pred = classifier.predict(X_test)

In [98]:
test_list = []
pred_list = []
for i in range(0, 10):
    test_list.append(y_test[i])
    pred_list.append(y_pred[i])
print('Actual output', test_list)
print('Predicted output', pred_list)

Actual output [1, 0, 1, 0, 0, 0, 0, 1, 1, 1]
Predicted output [1, 0, 1, 0, 1, 0, 1, 1, 0, 1]


In [99]:
#Check the accuracy of KNN model

In [100]:
acc = accuracy_score(y_test, y_pred, normalize=True) * float(100)
acc


66.56060606060606

In [101]:
from sklearn.linear_model import LogisticRegression

In [102]:
lreg=LogisticRegression()

In [103]:
lreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [104]:
y_pred = lreg.predict(X_test)

In [105]:
test_list = []
pred_list = []
for i in range(0, 10):
    test_list.append(y_test[i])
    pred_list.append(y_pred[i])
print('Actual output', test_list)
print('Predicted output', pred_list)

Actual output [1, 0, 1, 0, 0, 0, 0, 1, 1, 1]
Predicted output [1, 0, 0, 0, 0, 1, 1, 1, 1, 1]


In [106]:
acc = accuracy_score(y_test, y_pred, normalize=True) * float(100)
acc

81.0909090909091

In [107]:
X_tfidf=tf_idf.toarray()

In [108]:
df = result[['Time','Score']]
df.head(20)

Unnamed: 0,Time,Score
446252,1336780800,0
412880,1242432000,1
559259,1247443200,1
86144,1322697600,1
446565,1350259200,0
126108,1350086400,0
530808,1235952000,0
438096,1158192000,1
458702,1276560000,0
477406,1285286400,1


In [109]:
df=df.reset_index(drop=True)
df.head()

Unnamed: 0,Time,Score
0,1336780800,0
1,1242432000,1
2,1247443200,1
3,1322697600,1
4,1350259200,0


In [110]:
df1=pd.DataFrame(X_tfidf)
df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19288,19289,19290,19291,19292,19293,19294,19295,19296,19297
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
new_df=pd.concat([df,df1], axis = 1)

In [112]:
new_df.shape

(20000, 19300)

In [113]:
x=new_df.iloc[:,2:].values #removing score and time columns


In [114]:
y=new_df.iloc[:,1].values #just selecting Score& Time was just needed to sort

In [115]:
s = StandardScaler()
x=s.fit_transform(x)

In [116]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [117]:
classifier = KNeighborsClassifier(n_neighbors = 5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [118]:
y_pred = classifier.predict(X_test)

In [119]:
test_list = []
pred_list = []
for i in range(0, 10):
    test_list.append(y_test[i])
    pred_list.append(y_pred[i])
print('Actual output', test_list)
print('Predicted output', pred_list)

Actual output [1, 0, 1, 0, 0, 0, 0, 1, 1, 1]
Predicted output [1, 0, 1, 1, 0, 0, 1, 1, 1, 1]


In [120]:
acc = accuracy_score(y_test, y_pred, normalize=True) * float(100)
acc

60.909090909090914

In [121]:
lreg=LogisticRegression()

In [122]:
lreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [123]:
y_pred = lreg.predict(X_test)

In [124]:
test_list = []
pred_list = []
for i in range(0, 10):
    test_list.append(y_test[i])
    pred_list.append(y_pred[i])
print('Actual output', test_list)
print('Predicted output', pred_list)

Actual output [1, 0, 1, 0, 0, 0, 0, 1, 1, 1]
Predicted output [1, 0, 0, 0, 0, 1, 1, 1, 1, 1]


In [125]:
acc = accuracy_score(y_test, y_pred, normalize=True) * float(100)
acc

80.13636363636364

In [126]:
#Comparing all the four models we observe that the model Logistic Regression with BOW which gave an accuracy of 81.09%

In [127]:
#We select the Logistic Regression with BOW as our final model.