In [48]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
# read data
df_train = pd.read_csv("../dataset/reviews/train.csv")
df_test = pd.read_csv("../dataset/reviews/test.csv")

In [6]:
df_train.isna().sum()

text     0
label    0
dtype: int64

In [8]:
df_test.isna().sum()

text     0
label    0
dtype: int64

In [11]:
df_train.text = df_train.text.str.lower()
df_test.text = df_test.text.str.lower()

In [12]:
df_train.text = df_train.text.replace("[^a-zA-Z ]", "", regex=True)
df_test.text = df_test.text.replace("[^a-zA-Z ]", "", regex=True)

In [15]:
df_train.text = df_train.text.str.split()
df_test.text = df_test.text.str.split()

In [16]:
df_test.text

0       [lovingly, photographed, in, the, manner, of, ...
1                [consistently, clever, and, suspenseful]
2       [its, like, a, big, chill, reunion, of, the, b...
3       [the, story, gives, ample, opportunity, for, l...
4                     [red, dragon, never, cuts, corners]
                              ...                        
1061    [a, terrible, movie, that, some, people, will,...
1062    [there, are, many, definitions, of, time, wast...
1063    [as, it, stands, crocodile, hunter, has, the, ...
1064    [the, thing, looks, like, a, madeforhomevideo,...
1065    [enigma, is, wellmade, but, its, just, too, dr...
Name: text, Length: 1066, dtype: object

In [18]:
# create batches
ps = nltk.stem.PorterStemmer()
stopwords = nltk.corpus.stopwords.words("english")

In [19]:
def apply_function(x):
    d = [ps.stem(kelime) for kelime in x if kelime not in stopwords]
    return " ".join(d)

In [20]:
df_train.text = df_train.text.apply(apply_function)

In [22]:
df_test.text = df_test.text.apply(apply_function)

In [23]:
# target astype
df_train.label = df_train.label.astype(int)
df_test.label = df_test.label.astype(int)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv = CountVectorizer(max_features=2000)

In [26]:
import joblib

In [27]:
joblib.dump(cv, "../count_vectorizer.pkl")

['../count_vectorizer.pkl']

In [28]:
X_train = cv.fit_transform(df_train.text).toarray()
X_test = cv.transform(df_test.text).toarray()
y_train = np.array(df_train.label)
y_test = np.array(df_test.label)

In [61]:
# random forests model
rf = RandomForestClassifier(n_estimators=600, n_jobs=4, max_depth=300)
rf.fit(X_train, y_train)

In [62]:
rf_predictions = rf.predict(X_test)

In [33]:
logistic = LogisticRegression(C=2)
logistic.fit(X_train, y_train)

In [34]:
logistic_predicted = logistic.predict(X_test)

In [36]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

In [37]:
nb_predict = naive_bayes.predict(X_test)

In [45]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)

In [46]:
knn_pred = knn.predict(X_test)

In [49]:
svm = SVC(C=1, kernel="rbf")
svm.fit(X_train, y_train)

In [50]:
svm_predictions = svm.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score, f1_score

In [41]:
def evaluate(predictions, modelname):
    print(f"{modelname} results\n----------------------")
    print(accuracy_score(y_test, predictions))
    print(f1_score(y_test, predictions))

In [63]:
evaluate(rf_predictions, "random forests")

random forests results
----------------------
0.725140712945591
0.7190795781399808


In [43]:
evaluate(logistic_predicted, "logistic")

logistic results
----------------------
0.7354596622889306
0.7354596622889306


In [44]:
evaluate(nb_predict, "naive bayes")

naive bayes results
----------------------
0.7138836772983115
0.7345517841601392


In [47]:
evaluate(knn_pred, "knn")

knn results
----------------------
0.550656660412758
0.5609532538955087


In [51]:
evaluate(svm_predictions, "svm")

svm results
----------------------
0.7373358348968105
0.7348484848484849


In [74]:
import re
def preprocess(sentence):
    sentence = sentence.lower()
    result = apply_function(sentence.split())
    return [result]

In [86]:
# custom input
my_input = "It was absolutely lovely, for childrens the best choice"
my_input = preprocess(my_input)
my_input = cv.transform(my_input)

In [87]:
logistic.predict_proba(my_input)

array([[0.41585988, 0.58414012]])

In [88]:
logistic.classes_

array([0, 1])

In [89]:
knn.predict(my_input)

array([1])

In [92]:
naive_bayes.predict(my_input.toarray())

array([1])

In [95]:
svm.predict(my_input.toarray())

array([1])