# Classification Using NB and SVM
In this project, we classify IMDB review comments with Naive Bayes and SVM classifiers, using the scikit-learn (sklearn) library.

In [1]:
import pandas as pd
import numpy as np
import string
import math
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
#read dataset by path and file's name
def read_dataset(path, name):
    """Open a dataset file for reading in text mode.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated with ``name`` as-is, so it must
        already end with a path separator.
    name : str
        File name inside ``path``.

    Returns
    -------
    file object
        The opened file. The caller is responsible for closing it.
    """
    full_path = f"{path}{name}"
    return open(full_path, mode="r")

In [3]:
# Load every line of the labelled review file into memory. The context
# manager closes the file handle as soon as the lines are read, instead of
# leaving it open for the lifetime of the kernel.
with read_dataset("datasets/", "IMDB_review_labels.txt") as myFile:
    lines = myFile.readlines()

In [4]:
def tokenizer(doc):
    """Normalize a raw document and return its stemmed tokens.

    Punctuation characters (``string.punctuation``) are removed, the text is
    lower-cased and split on whitespace, and every resulting word is passed
    through a ``PorterStemmer``.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    list
        The stemmed tokens, in their original order.
    """
    # Normalization: the third argument of maketrans lists characters to delete.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    words = doc.translate(strip_punctuation).lower().split()

    # Tokenization + stemming.
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

In [5]:
# Split every line into its stemmed word tokens and its class label.
# NOTE(review): assumes each line has at least one token and that the last
# token is a numeric label — confirm against the dataset file.
len_docs = len(lines)
labels = np.zeros(len_docs)
tokensPerDoc = []

# enumerate() yields each line's own position. lines.index(line) would return
# the FIRST line with that text, so a duplicated line would never get its own
# label written (it would stay 0), and every lookup rescans the whole list.
for doc_id, line in enumerate(lines):
    tokens = tokenizer(line)
    # The trailing token is the label (NumPy parses the string when storing it
    # in the float array); what remains are the document's words.
    labels[doc_id] = tokens.pop()
    tokensPerDoc.append(tokens)

# Flatten the per-document token lists into one list in a single pass.
# (Rebuilding the list with `merged = merged + doc` copies the growing list
# on every iteration, which is quadratic in the total number of tokens.)
mergedTokens = [token for doc_tokens in tokensPerDoc for token in doc_tokens]

# Vocabulary: sorted array of the distinct tokens across all documents.
terms = np.unique(mergedTokens)

# Weight matrix with one row per document and one column per term,
# initialised to zero.
inverted_index = np.zeros(shape=(len_docs, len(terms)))

# Lookup table from a term to its column position in the matrix.
terms_dict = {term: column for column, term in enumerate(terms)}

#calculate weights using tfidf
#tf: log-scaled term frequency, log10(1 + count), stored per (document, term)
for doc_id, doc_tokens in enumerate(tokensPerDoc):
    doc_terms, doc_counts = np.unique(doc_tokens, return_counts=True)

    # Pair each term with its count directly. Stacking terms and counts into
    # a single array would cast the counts to strings and force a parse back.
    for term, count in zip(doc_terms, doc_counts):
        if term in terms_dict:
            # math.log10 is usually more accurate than math.log(x, 10).
            inverted_index[doc_id, terms_dict[term]] = math.log10(1 + int(count))

#idf: scale every term column by ln(number of documents / document frequency)
# Document frequency = number of documents with a non-zero weight for the term.
document_frequency = np.count_nonzero(inverted_index, axis=0)
for column, doc_freq in enumerate(document_frequency):
    idf = math.log(len_docs / int(doc_freq))
    inverted_index[:, column] *= idf


### Classifying Using Naive Bayes

In [6]:
# Hold out 20% of the documents for testing; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inverted_index,
    labels,
    test_size=0.2,
    random_state=0,
)

# Fit Gaussian Naive Bayes on the training split, then predict the held-out part.
gNB = GaussianNB()
gNB.fit(X_train, y_train)
y_pred = gNB.predict(X_test)

# Report accuracy and its complement as percentages.
accuracy = accuracy_score(y_test, y_pred)
error = 1 - accuracy
print("accuracy: ", accuracy*100)
print("error: ", error*100)



accuracy:  67.5
error:  32.49999999999999


### Classifying Using SVM

In [7]:
# Same 80/20 split as for Naive Bayes (identical random_state), so both
# classifiers are scored on the same held-out documents.
X_train, X_test, y_train, y_test = train_test_split(
    inverted_index,
    labels,
    test_size=0.2,
    random_state=0,
)

# Fit a support vector classifier with sklearn's default settings, then
# predict the held-out part.
mySvm = svm.SVC()
mySvm.fit(X_train, y_train)
y_pred = mySvm.predict(X_test)

# Report accuracy and its complement as percentages.
accuracy = accuracy_score(y_test, y_pred)
error = 1 - accuracy
print("accuracy: ", accuracy*100)
print("error: ", error*100)



accuracy:  70.5
error:  29.500000000000004


#### Comparison
As we can see, SVM predicted with a higher accuracy (70.5% vs. 67.5%). Both SVM and NB are good classification methods, but NB is very sensitive to feature selection, which can affect its accuracy. Moreover, NB is very simple and assumes that the attributes are independent, which may decrease its accuracy. However, that simplicity is also what makes it so fast and easy to use. On the other hand, SVM is one of the best classification methods for both linear and non-linear datasets, but it has a long and computationally heavy training process.