In [2]:
import os
import pandas as pd
import sklearn as sk
import sklearn
from bs4 import BeautifulSoup
import re,string,unicodedata
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mat\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
#import the data
data = pd.read_csv("../res/Dataset.csv")
data.head(10)

print(data.shape)
print(data['sentiment'].value_counts())


(50000, 2)
positive    25000
negative    25000
Name: sentiment, dtype: int64


In [4]:
def cleanup(text):
    # Remove html tags
    text = BeautifulSoup(text, "html.parser").getText()
    # Remove square brackets 
    text = re.sub('\[[^]]*\]', '', text)
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    # Remove Stopwords
    #text = text.lower().split()
    # Stemming
    ps = nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])

    return text
    
tokenizer = nltk.tokenize.toktok.ToktokTokenizer()
data['review'] = data['review'].apply(cleanup)



In [5]:
stops = nltk.corpus.stopwords.words("english")

#removing the stopwords
def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stops]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
data['review'] = data['review'].apply(remove_stopwords)

In [6]:
#shuffle the data
data_norm_shuffle = data.sample(frac=1).reset_index(drop=True)

#split the data
split = 30000

train_reviews = data_norm_shuffle['review'][:split]
train_sentiments = data_norm_shuffle['sentiment'][:split]

test_reviews = data_norm_shuffle['review'][split:].reset_index(drop=True)
test_sentiments = data_norm_shuffle['sentiment'][split:].reset_index(drop=True)


print(train_reviews.shape, train_sentiments.shape)
print(train_reviews[0][0:50]," : ", train_sentiments[0])
print(test_reviews.shape, test_sentiments.shape)
print(test_reviews[0][0:50]," : ", test_sentiments[0])

(30000,) (30000,)
thought wa origin stori veri nice told think peopl  :  positive
(20000,) (20000,)
call thi sunday school movi might gener becaus eve  :  negative


In [7]:
#Count vectorizer for bag of words
cv = sk.feature_extraction.text.CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))
train_reviews_vector = cv.fit_transform(train_reviews)
test_reviews_vector = cv.transform(test_reviews)

print(train_reviews_vector.shape)
print(test_reviews_vector.shape)

(30000, 4824833)
(20000, 4824833)


In [8]:
#Tfidf vectorizer
tv = sk.feature_extraction.text.TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))
train_reviews_tvector = tv.fit_transform(train_reviews)
test_reviews_tvector = tv.transform(test_reviews)

print(train_reviews_tvector.shape)
print(test_reviews_tvector.shape)

(30000, 4824833)
(20000, 4824833)


In [9]:
#vectorize the sentient data
lb = sk.preprocessing.LabelBinarizer()
train_sentiments_vector = lb.fit_transform(data_norm_shuffle['sentiment'])

#split the data
train_sentiments = train_sentiments_vector[:split]
test_sentiments = train_sentiments_vector[split:]
print(train_sentiments.shape)
print(test_sentiments.shape)

(30000, 1)
(20000, 1)


In [10]:
def learingmethod(string, model, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments):
    model.fit(train_reviews_vector, train_sentiments)
    predictions = model.predict(test_reviews_vector)
    print(string)
    print("Accuracy: ", sk.metrics.accuracy_score(test_sentiments, predictions))
    print("Precision: ", sk.metrics.precision_score(test_sentiments, predictions))
    print("Recall: ", sk.metrics.recall_score(test_sentiments, predictions))
    print("F1: ", sk.metrics.f1_score(test_sentiments, predictions))
    print("Confusion Matrix: ", sk.metrics.confusion_matrix(test_sentiments, predictions))
    print("Classification Report: ", sk.metrics.classification_report(test_sentiments, predictions))
    #write to file
    fileout = open("../output/" + string + ".txt", "w")
    with fileout as file:
        file.write(string + "\naccuracy_score: "  + str(sk.metrics.accuracy_score(test_sentiments, predictions)) + "\nprecision_score: " + str(sk.metrics.precision_score(test_sentiments, predictions)) + "\nrecall_score: " + str(sk.metrics.recall_score(test_sentiments, predictions)) + "\nf1_score: " + str(sk.metrics.f1_score(test_sentiments, predictions)) + "\nconfusion_matrix: "  + str(sk.metrics.confusion_matrix(test_sentiments, predictions)) + "\nclassification_report\n" + str(sk.metrics.classification_report(test_sentiments, predictions)))
    

In [11]:
lr = sk.linear_model.LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)
sgd = sk.linear_model.SGDClassifier(loss='hinge', max_iter=500, random_state=42)
br = sk.linear_model.BayesianRidge()
lars = sk.linear_model.Lars()
lasso = sk.linear_model.Lasso()
ridge = sk.linear_model.Ridge()
en = sk.linear_model.ElasticNet()
perceptron = sk.linear_model.Perceptron(max_iter=500, random_state=42)
passive_aggressive = sk.linear_model.PassiveAggressiveClassifier(max_iter=500, random_state=42)
svc = sk.svm.SVC(kernel='linear', C=1, random_state=42)
linear_svc = sk.svm.LinearSVC(max_iter=500, random_state=42)
nu_svc = sk.svm.NuSVC(kernel='linear', random_state=42)

In [12]:
learingmethod("LinearRegression BOW", lr, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("LinearRegression TFIDF", lr, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

  y = column_or_1d(y, warn=True)


LinearRegression BOW
Accuracy:  0.62695
Precision:  0.8640023001725129
Recall:  0.3007104973481437
F1:  0.44614356766387053
Confusion Matrix:  [[9534  473]
 [6988 3005]]
Classification Report:                precision    recall  f1-score   support

           0       0.58      0.95      0.72     10007
           1       0.86      0.30      0.45      9993

    accuracy                           0.63     20000
   macro avg       0.72      0.63      0.58     20000
weighted avg       0.72      0.63      0.58     20000



  y = column_or_1d(y, warn=True)


LinearRegression TFIDF
Accuracy:  0.7365
Precision:  0.7382706084148926
Recall:  0.7322125487841489
F1:  0.7352290996784565
Confusion Matrix:  [[7413 2594]
 [2676 7317]]
Classification Report:                precision    recall  f1-score   support

           0       0.73      0.74      0.74     10007
           1       0.74      0.73      0.74      9993

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000



In [13]:
learingmethod("SGDClassifier BOW", sgd, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("SGDClassifier TFIDF", sgd, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

  y = column_or_1d(y, warn=True)


SGDClassifier BOW
Accuracy:  0.55105
Precision:  0.9478798586572438
Recall:  0.10737516261382968
F1:  0.1928988764044944
Confusion Matrix:  [[9948   59]
 [8920 1073]]
Classification Report:                precision    recall  f1-score   support

           0       0.53      0.99      0.69     10007
           1       0.95      0.11      0.19      9993

    accuracy                           0.55     20000
   macro avg       0.74      0.55      0.44     20000
weighted avg       0.74      0.55      0.44     20000



  y = column_or_1d(y, warn=True)


SGDClassifier TFIDF
Accuracy:  0.55005
Precision:  0.5262962962962963
Recall:  0.9953967777444211
F1:  0.6885404769321288
Confusion Matrix:  [[1054 8953]
 [  46 9947]]
Classification Report:                precision    recall  f1-score   support

           0       0.96      0.11      0.19     10007
           1       0.53      1.00      0.69      9993

    accuracy                           0.55     20000
   macro avg       0.74      0.55      0.44     20000
weighted avg       0.74      0.55      0.44     20000



In [14]:
learingmethod("BayesianRidge BOW", br, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("BayesianRidge TFIDF", br, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
learingmethod("Lars BOW", lars, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("Lars TFIDF", lars, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
learingmethod("Lasso BOW", lasso, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("Lasso TFIDF", lasso, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

Lasso BOW


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
learingmethod("Ridge BOW", ridge, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("Ridge TFIDF", ridge, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

Ridge BOW


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [15]:
learingmethod("ElasticNet BOW", en, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("ElasticNet TFIDF", en, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

ElasticNet BOW


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [16]:
learingmethod("Perceptron BOW", perceptron, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("Perceptron TFIDF", perceptron, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

  y = column_or_1d(y, warn=True)


Perceptron BOW
Accuracy:  0.7335
Precision:  0.7472166260205705
Recall:  0.7051936355448815
F1:  0.7255971993410215
Confusion Matrix:  [[7623 2384]
 [2946 7047]]
Classification Report:                precision    recall  f1-score   support

           0       0.72      0.76      0.74     10007
           1       0.75      0.71      0.73      9993

    accuracy                           0.73     20000
   macro avg       0.73      0.73      0.73     20000
weighted avg       0.73      0.73      0.73     20000



  y = column_or_1d(y, warn=True)


Perceptron TFIDF
Accuracy:  0.73575
Precision:  0.7500531123858084
Recall:  0.706594616231362
F1:  0.7276755809759365
Confusion Matrix:  [[7654 2353]
 [2932 7061]]
Classification Report:                precision    recall  f1-score   support

           0       0.72      0.76      0.74     10007
           1       0.75      0.71      0.73      9993

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000



In [17]:
learingmethod("PassiveAggressiveClassifier BOW", passive_aggressive, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("PassiveAggressiveClassifier TFIDF", passive_aggressive, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

  y = column_or_1d(y, warn=True)


PassiveAggressiveClassifier BOW
Accuracy:  0.72035
Precision:  0.7096036585365854
Recall:  0.7453217252076454
F1:  0.7270242569183464
Confusion Matrix:  [[6959 3048]
 [2545 7448]]
Classification Report:                precision    recall  f1-score   support

           0       0.73      0.70      0.71     10007
           1       0.71      0.75      0.73      9993

    accuracy                           0.72     20000
   macro avg       0.72      0.72      0.72     20000
weighted avg       0.72      0.72      0.72     20000



  y = column_or_1d(y, warn=True)


PassiveAggressiveClassifier TFIDF
Accuracy:  0.7361
Precision:  0.7301122498779893
Recall:  0.7485239667767437
F1:  0.7392034786046052
Confusion Matrix:  [[7242 2765]
 [2513 7480]]
Classification Report:                precision    recall  f1-score   support

           0       0.74      0.72      0.73     10007
           1       0.73      0.75      0.74      9993

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000



In [18]:
learingmethod("SVC BOW", svc, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("SVC TFIDF", svc, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

  y = column_or_1d(y, warn=True)


SVC BOW
Accuracy:  0.50625
Precision:  1.0
Recall:  0.011808265786050235
F1:  0.023340915834239934
Confusion Matrix:  [[10007     0]
 [ 9875   118]]
Classification Report:                precision    recall  f1-score   support

           0       0.50      1.00      0.67     10007
           1       1.00      0.01      0.02      9993

    accuracy                           0.51     20000
   macro avg       0.75      0.51      0.35     20000
weighted avg       0.75      0.51      0.35     20000



  y = column_or_1d(y, warn=True)


SVC TFIDF
Accuracy:  0.73625
Precision:  0.7329186413902053
Recall:  0.7428199739817872
F1:  0.7378360916455444
Confusion Matrix:  [[7302 2705]
 [2570 7423]]
Classification Report:                precision    recall  f1-score   support

           0       0.74      0.73      0.73     10007
           1       0.73      0.74      0.74      9993

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000



In [19]:
learingmethod("LinearSVC BOW", linear_svc, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("LinearSVC TFIDF", linear_svc, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

  y = column_or_1d(y, warn=True)


LinearSVC BOW
Accuracy:  0.51215
Precision:  0.9836065573770492
Recall:  0.024016811768237768
F1:  0.046888736934648824
Confusion Matrix:  [[10003     4]
 [ 9753   240]]
Classification Report:                precision    recall  f1-score   support

           0       0.51      1.00      0.67     10007
           1       0.98      0.02      0.05      9993

    accuracy                           0.51     20000
   macro avg       0.74      0.51      0.36     20000
weighted avg       0.74      0.51      0.36     20000



  y = column_or_1d(y, warn=True)


LinearSVC TFIDF
Accuracy:  0.73545
Precision:  0.7483100971694128
Recall:  0.7089962974081857
F1:  0.72812291249165
Confusion Matrix:  [[7624 2383]
 [2908 7085]]
Classification Report:                precision    recall  f1-score   support

           0       0.72      0.76      0.74     10007
           1       0.75      0.71      0.73      9993

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000



In [20]:
learingmethod("NuSVC BOW", nu_svc, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("NuSVC TFIDF", nu_svc, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

  y = column_or_1d(y, warn=True)


NuSVC BOW
Accuracy:  0.723
Precision:  0.7368365067546006
Recall:  0.6931852296607626
F1:  0.7143446426729916
Confusion Matrix:  [[7533 2474]
 [3066 6927]]
Classification Report:                precision    recall  f1-score   support

           0       0.71      0.75      0.73     10007
           1       0.74      0.69      0.71      9993

    accuracy                           0.72     20000
   macro avg       0.72      0.72      0.72     20000
weighted avg       0.72      0.72      0.72     20000



  y = column_or_1d(y, warn=True)


NuSVC TFIDF
Accuracy:  0.73675
Precision:  0.7397079699858041
Recall:  0.7300110077053937
F1:  0.7348274993704356
Confusion Matrix:  [[7440 2567]
 [2698 7295]]
Classification Report:                precision    recall  f1-score   support

           0       0.73      0.74      0.74     10007
           1       0.74      0.73      0.73      9993

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000



In [21]:
gnb = sk.naive_bayes.GaussianNB()
bnb = sk.naive_bayes.BernoulliNB()
mnb = sk.naive_bayes.MultinomialNB()
conb = sk.naive_bayes.ComplementNB()
canb = sk.naive_bayes.CategoricalNB()

AttributeError: module 'sklearn' has no attribute 'naive_bayes'

In [None]:
learingmethod("GaussianNB BOW", gnb, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("GaussianNB TFIDF", gnb, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

In [None]:
learingmethod("BernoulliNB BOW", bnb, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("BernoulliNB TFIDF", bnb, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

In [None]:
learingmethod("MultinomialNB BOW", mnb, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("MultinomialNB TFIDF", mnb, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

In [None]:
learingmethod("ComplementNB BOW", conb, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("ComplementNB TFIDF", conb, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)

In [None]:
learingmethod("CategoricalNB BOW", canb, train_reviews_vector, train_sentiments, test_reviews_vector, test_sentiments)
learingmethod("CategoricalNB TFIDF", canb, train_reviews_tvector, train_sentiments, test_reviews_tvector, test_sentiments)