In [32]:
import pandas as pd

In [2]:
data = pd.read_csv("Data/train.csv")
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [4]:
data["label"].unique()

array([0, 1], dtype=int64)

In [5]:
data.shape

(40000, 2)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [7]:
def preprocess_data(X):
    X["text"] = data["text"].str.strip().str.lower()
    return X

In [8]:
x = data["text"]
y = data["label"]

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.25)

In [10]:
%%time
# Vectorize text to numbers
vec = CountVectorizer(stop_words="english")
x_train = vec.fit_transform(x_train).toarray()
x_test = vec.transform(x_test).toarray()

Wall time: 10.6 s


In [11]:
%%time
base_model = MultinomialNB()
base_model.fit(x_train, y_train)

Wall time: 3min 51s


MultinomialNB()

In [12]:
base_line = base_model.score(x_test, y_test)

In [13]:
base_line

0.857

# Part - 2 Naive Bayes From scratch

In [37]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import numpy as np

In [9]:
# Note to myself - Either use C++ or look for any GPU library for text cleaning.
class TextProcess:

    def __init__(self, data):
        self.stem = PorterStemmer()
        self.remove_punc = RegexpTokenizer(r"\w+")
        self.stop_words = set(stopwords.words("english"))
        self.data = data
        self.labels = data["label"]
        self.x = []
        self.specific = {"br"}

    def __clean_text__(self):
        for i in data["text"]:
            self.x.append(self.__clean__(i))

    def __clean__(self, string):
        text = list(self.remove_punc.tokenize(string.lower()))
        processed_text = []
        for i in text:
            if i in self.stop_words and i != "not" or i in self.specific or i.isdigit():
                continue
            processed_text.append(self.stem.stem(i))
        return processed_text

    def get_data(self):
        self.__clean_text__()
        return self.x, self.labels

In [10]:
cleaned = TextProcess(data)
x, y = cleaned.get_data()

In [11]:
print(x[0])

['grew', 'b', 'watch', 'love', 'thunderbird', 'mate', 'school', 'watch', 'play', 'thunderbird', 'school', 'lunch', 'school', 'want', 'virgil', 'scott', 'one', 'want', 'alan', 'count', 'becam', 'art', 'form', 'took', 'children', 'see', 'movi', 'hope', 'would', 'get', 'glimps', 'love', 'child', 'bitterli', 'disappoint', 'high', 'point', 'snappi', 'theme', 'tune', 'not', 'could', 'compar', 'origin', 'score', 'thunderbird', 'thank', 'earli', 'saturday', 'morn', 'one', 'televis', 'channel', 'still', 'play', 'rerun', 'seri', 'gerri', 'anderson', 'wife', 'creat', 'jonatha', 'frake', 'hand', 'director', 'chair', 'version', 'complet', 'hopeless', 'wast', 'film', 'utter', 'rubbish', 'cgi', 'remak', 'may', 'accept', 'replac', 'marionett', 'homo', 'sapien', 'subsp', 'sapien', 'huge', 'error', 'judgment']


In [48]:
# Note to myself - Optimize this with CUDA.

class Model:
    """
    Extract freq for pos, neg separately.
    count unique words, neg words, pos words.
    Calculate laplacian smoothing.
    Calculate log prior, log likelihood
    """
    def __init__(self):
        self.pos, self.neg = {}, {}
        self.unique = 0.0
        self.pos_val, self.neg_val = 0.0, 0.0
        self.prior = None
        self.sen = None
        self.labels = None

    def build(self, sen, labels):
        self.sen = sen
        self.labels = labels
        for i, texts in enumerate(sen):
            for j in texts:
                if labels[i] == 1:
                    if j not in self.pos:
                        self.pos[j] = 1
                        self.pos_val += 1
                        self.unique += 1
                    else:
                        self.pos[j] += 1
                if labels[i] == 0:
                    if j not in self.neg:
                        self.neg[j] = 1
                        self.neg_val += 1
                        self.unique += 1
                    else:
                        self.neg[j] += 1
        self.prior = self.pos_val / self.neg_val
        print("Build Complete (prior value: {})".format(self.prior))

    def laplacian_smoothing(self, val, n_class, total):
        return (val + 1) / (n_class + total)

    def calculate_lambda(self, text):
        val = 0.0
        for i in text:
            up = self.pos[i] if i in self.pos else 0
            down = self.neg[i] if i in self.neg else 0
            up = self.laplacian_smoothing(up, self.pos_val, self.unique)
            down = self.laplacian_smoothing(down, self.neg_val, self.unique)
            val = np.log(val) + np.log(up / down)
        return val + self.prior


    def predict(self, sentences, labels):
        predict_labels = []
        for i in sentences:
            val = self.calculate_lambda(i)
            if val > 0:
                predict_labels.append(1)
            else:
                predict_labels.append(0)
        res = 0
        for i in range(len(labels)):
            if labels[i] == predict_labels[i]:
                res += 1
        return res / len(labels)

In [49]:
# Testing
model_1 = Model()
model_1.build(x, y)

Build Complete (prior value: 1.046548885229852)


In [None]:
preds = model_1.predict(x, y)

In [51]:
print(preds)

0.500475


## Part 3 - Implementing Deep Learning Model (BERT) without Transfer learning