In [5]:
import pandas as pd

In [6]:
# Load the training set from a path relative to the notebook's working directory.
data = pd.read_csv("Data/train.csv")
# Preview the first rows; as the last expression this becomes the cell output.
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [4]:
# Show the distinct values present in the label column.
data["label"].unique()

array([0, 1], dtype=int64)

In [5]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(40000, 2)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [7]:
def preprocess_data(X):
    """Return a copy of ``X`` whose ``"text"`` column is stripped and lower-cased.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a string column named ``"text"``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; ``"text"`` has leading/trailing
        whitespace removed and is converted to lower case.

    Notes
    -----
    The normalisation is computed from ``X`` itself (not from the notebook-level
    ``data`` frame), and the caller's frame is left unmodified so that re-running
    the cell is idempotent.
    """
    # Work on a copy so the frame passed in by the caller is not mutated.
    X = X.copy()
    X["text"] = X["text"].str.strip().str.lower()
    return X

In [8]:
# Raw review text is the model input; the label column is the target.
x = data["text"]
y = data["label"]

In [9]:
# Stratified 75/25 split. random_state is fixed so that the split, and therefore
# the baseline score reported below, is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.25, random_state=42
)

In [10]:
%%time
# Vectorize text to numbers
vec = CountVectorizer(stop_words="english")
x_train = vec.fit_transform(x_train).toarray()
x_test = vec.transform(x_test).toarray()

Wall time: 10.6 s


In [11]:
%%time
# Baseline: scikit-learn's multinomial Naive Bayes with its default parameters,
# fitted on the vectorized training split.
base_model = MultinomialNB()
base_model.fit(x_train, y_train)

Wall time: 3min 51s


MultinomialNB()

In [12]:
# Score of the baseline model on the held-out test split (mean accuracy for sklearn classifiers).
base_line = base_model.score(x_test, y_test)

In [13]:
# Display the baseline score as the cell output.
base_line

0.857

# Part 2 - Naive Bayes from Scratch

In [2]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [46]:
class TextProcess:
    """Tokenise, filter and stem the ``"text"`` column of a labelled dataset.

    Each document is lower-cased, split into ``\\w+`` tokens (which drops
    punctuation), filtered, and Porter-stemmed.

    Parameters
    ----------
    data : mapping / pandas.DataFrame
        Must provide a ``"text"`` entry (iterable of strings) and a
        ``"label"`` entry.

    Attributes
    ----------
    x : list[list[str]]
        Cleaned token lists, one per document; filled by ``get_data``.
    labels
        ``data["label"]``, returned unchanged alongside ``x``.
    """

    def __init__(self, data):
        self.stem = PorterStemmer()
        # r"\w+" keeps runs of word characters only, discarding punctuation.
        self.remove_punc = RegexpTokenizer(r"\w+")
        self.stop_words = set(stopwords.words("english"))
        self.data = data
        self.labels = data["label"]
        self.x = []
        # Extra tokens to drop; "br" is presumably left over from "<br />"
        # HTML tags in the reviews -- confirm against the raw data.
        self.specific = {"br"}

    def __clean_text__(self):
        """Clean every document of ``self.data["text"]`` into ``self.x``.

        ``self.x`` is rebuilt from scratch on each call so repeated calls do
        not accumulate duplicate documents, and the text is read from the
        instance's own data rather than from a notebook-level global.
        """
        self.x = [self.__clean__(text) for text in self.data["text"]]

    def __clean__(self, string):
        """Return the list of stemmed tokens kept from a single document.

        A token is dropped when it is an English stop word (except ``"not"``,
        which is always kept), when it is listed in ``self.specific``, or when
        it consists only of digits.
        """
        processed_text = []
        for token in self.remove_punc.tokenize(string.lower()):
            # "not" is exempt from stop-word removal (presumably to preserve
            # negation for sentiment classification).
            is_stop_word = token in self.stop_words and token != "not"
            if is_stop_word or token in self.specific or token.isdigit():
                continue
            processed_text.append(self.stem.stem(token))
        return processed_text

    def get_data(self):
        """Clean all documents and return ``(token_lists, labels)``."""
        self.__clean_text__()
        return self.x, self.labels

In [47]:
# Run the custom text-cleaning pipeline over the full dataset.
# NOTE: this rebinds x and y, which earlier cells used for the raw text/label columns.
cleaned = TextProcess(data)
x, y = cleaned.get_data()

In [48]:
# Sanity check: show the cleaned, stemmed tokens of the first document.
print(x[0])

['grew', 'b', 'watch', 'love', 'thunderbird', 'mate', 'school', 'watch', 'play', 'thunderbird', 'school', 'lunch', 'school', 'want', 'virgil', 'scott', 'one', 'want', 'alan', 'count', 'becam', 'art', 'form', 'took', 'children', 'see', 'movi', 'hope', 'would', 'get', 'glimps', 'love', 'child', 'bitterli', 'disappoint', 'high', 'point', 'snappi', 'theme', 'tune', 'not', 'could', 'compar', 'origin', 'score', 'thunderbird', 'thank', 'earli', 'saturday', 'morn', 'one', 'televis', 'channel', 'still', 'play', 'rerun', 'seri', 'gerri', 'anderson', 'wife', 'creat', 'jonatha', 'frake', 'hand', 'director', 'chair', 'version', 'complet', 'hopeless', 'wast', 'film', 'utter', 'rubbish', 'cgi', 'remak', 'may', 'accept', 'replac', 'marionett', 'homo', 'sapien', 'subsp', 'sapien', 'huge', 'error', 'judgment']


In [None]:
class Model:
    """
    Naive Bayes text classifier (skeleton; ``fit`` is not implemented yet).

    Planned steps:
    Extract freq for pos, neg separately.
    Count unique words, neg words, pos words.
    Calculate Laplacian smoothing.
    Calculate log prior, log likelihood.
    """
    def __init__(self):
        # Per-class dictionaries; per the class docstring these are meant to
        # hold word frequencies for the positive / negative class.
        self.pos, self.neg = {}, {}
        # Presumably the vocabulary size (number of unique words) -- TODO confirm.
        self.v = 0
        # Both counters are stored on the instance so that later methods can
        # read self.neg_counts as well as self.pos_counts.
        self.pos_counts, self.neg_counts = 0, 0
        self.log_prior = 0
        # Presumably per-word log-likelihood ratios -- TODO confirm once fit exists.
        self.ratios = {}

    def fit(self, x, y):
        # TODO: not implemented yet; currently a no-op that returns None.
        pass