In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")
data_solution = pd.read_csv("solution.csv")

In [3]:
data_test["label"] = data_solution["Predicted"]
dataset = pd.concat([data_train, data_test])

In [4]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,review,label
0,0,I think they really let the quality of the DVD...,0
1,1,I'm sorry but this is just awful. I have told ...,0
2,2,"The Japenese sense of pacing, editing and musi...",0
3,3,"In the '60's/'70's, David Jason was renowned f...",1
4,4,"""Hail The Woman"" is one of the most moving fil...",1


In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    dataset["review"], dataset["label"], test_size=0.3, random_state=13
)

In [6]:
cv = CountVectorizer(max_features=2000)
cv.fit(x_train)

CountVectorizer(max_features=2000)

In [7]:
x_train_cv = cv.transform(x_train)
x_test_cv = cv.transform(x_test)

In [8]:
def algo(algo_name, algo):
    algo.fit(x_train_cv.toarray(), y_train)
    y_pred = algo.predict(x_test_cv.toarray())
    print(algo_name + " Accuracy: ", accuracy_score(y_pred, y_test))
    print(algo_name + " confusion_matrix: ")
    print(confusion_matrix(y_pred, y_test))

In [10]:
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()
dtc = DecisionTreeClassifier(min_samples_split=7)
knn = KNeighborsClassifier(n_neighbors=7)

algo("MultinomialNB", mnb)
algo("BernoulliNB", bnb)
algo("GaussianNB", gnb)
algo("DecisionTreeClassifier", dtc)
algo("KNeighborsClassifier", knn)

MultinomialNB Accuracy:  0.8320666666666666
MultinomialNB confusion_matrix: 
[[6319 1357]
 [1162 6162]]
BernoulliNB Accuracy:  0.8417333333333333
BernoulliNB confusion_matrix: 
[[6142 1035]
 [1339 6484]]
GaussianNB Accuracy:  0.7927333333333333
GaussianNB confusion_matrix: 
[[6442 2070]
 [1039 5449]]
DecisionTreeClassifier Accuracy:  0.7115333333333334
DecisionTreeClassifier confusion_matrix: 
[[5378 2224]
 [2103 5295]]
KNeighborsClassifier Accuracy:  0.6386
KNeighborsClassifier confusion_matrix: 
[[4026 1966]
 [3455 5553]]
