In [36]:
import warnings
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
warnings.filterwarnings('ignore')

In [37]:
import os
def load_test_data():
    directory = os.path.dirname(os.path.realpath('COMP237_GroupProject'))
    directory = os.path.join(directory,"TestData")
    files = [f for f in os.listdir(directory)]

    # Create dataframe
    # Columns: COMMENT_ID, AUTHOR, DATE, CONTENT, TAG
    result = pd.concat(
        (pd.read_csv(os.path.join(directory, f)) for f in files))

    # Content and class matter, keep two columns only
    result = result[['CONTENT', 'CLASS']]
    return result


def load_train_data():
    directory = os.path.dirname(os.path.realpath('COMP237_GroupProject'))
    directory = os.path.join(directory, "TrainData")
    files = [f for f in os.listdir(directory)]

    # Create dataframe
    # Columns: COMMENT_ID, AUTHOR, DATE, CONTENT, TAG
    result = pd.concat(
        (pd.read_csv(os.path.join(directory, f)) for f in files))

    # Content and class matter, keep two columns only
    result = result[['CONTENT', 'CLASS']]
    return result


data = load_train_data()


In [38]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   CONTENT  350 non-null    object
 1   CLASS    350 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.6+ KB


In [39]:
data

Unnamed: 0,CONTENT,CLASS
0,i love this so much. AND also I Generate Free ...,1
1,http://www.billboard.com/articles/columns/pop-...,1
2,Hey guys! Please join me in my fight to help a...,1
3,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1
4,Hey everyone. Watch this trailer!!!!!!!! http...,1
...,...,...
345,This song means so much to me thank you soooo...,0
346,&lt;3﻿,0
347,"KATY PERRY, I AM THE ""DÉCIO CABELO"", ""DECIO HA...",1
348,Honestly speaking except taylor swift and adel...,0


In [40]:
from nltk import WordNetLemmatizer, PorterStemmer, SnowballStemmer, LancasterStemmer
def Lemmatizer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x: ''.join([WordNetLemmatizer().lemmatize(word=x)for x in x['CONTENT']]), axis=1)
    return tmp


def PorterStemmer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x: ''.join([PorterStemmer().stem(word=x)for x in x['CONTENT']]), axis=1)
    return tmp


def SnowballStemmer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x:  ''.join([SnowballStemmer(language='english').stem(x)for x in x['CONTENT']]), axis=1)
    return tmp


def LancasterStemmer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x: ''.join([LancasterStemmer().stem(word=x)for x in x['CONTENT']]), axis=1)
    return tmp


Train data

In [41]:
data['CLASS']


0      1
1      1
2      1
3      1
4      1
      ..
345    0
346    0
347    1
348    0
349    0
Name: CLASS, Length: 350, dtype: int64

In [46]:
from sklearn.model_selection import train_test_split
count_vectorizer = CountVectorizer(stop_words='english', min_df=1)

data_vectorized = count_vectorizer.fit_transform(data['CONTENT'].to_numpy())

x_train, x_test, y_train, y_test = train_test_split(
    data_vectorized, data['CLASS'].to_numpy(), test_size=0.25, random_state=420)


In [49]:
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(x_train)
classifier = MultinomialNB().fit(train_tfidf, y_train)


# Test data

In [50]:
test_tfidf = tfidf.fit_transform(x_test)
predictions = classifier.predict(test_tfidf)


In [51]:
for data, isSpam in zip(x_test, predictions):
    print('\nTest Data', "---->" ,data,'\nPrediction', "---->" , isSpam==0)


Test Data ---->   (0, 104)	1
  (0, 360)	1
  (0, 538)	1
  (0, 662)	1
  (0, 816)	1
  (0, 857)	1
  (0, 874)	1
  (0, 1072)	1
  (0, 1375)	1
  (0, 1395)	1 
Prediction ----> True

Test Data ---->   (0, 144)	1
  (0, 903)	1
  (0, 939)	1
  (0, 1526)	1 
Prediction ----> True

Test Data ---->   (0, 665)	1
  (0, 816)	1
  (0, 1286)	1 
Prediction ----> True

Test Data ---->   (0, 23)	1
  (0, 35)	1
  (0, 48)	1
  (0, 81)	1
  (0, 94)	1
  (0, 166)	2
  (0, 172)	1
  (0, 293)	1
  (0, 315)	1
  (0, 322)	1
  (0, 336)	1
  (0, 375)	1
  (0, 436)	2
  (0, 437)	1
  (0, 480)	1
  (0, 489)	1
  (0, 501)	1
  (0, 556)	1
  (0, 686)	2
  (0, 709)	1
  (0, 724)	1
  (0, 785)	1
  (0, 816)	2
  (0, 1029)	1
  (0, 1072)	2
  (0, 1157)	1
  (0, 1264)	1
  (0, 1287)	1
  (0, 1387)	1
  (0, 1424)	1
  (0, 1475)	1
  (0, 1546)	1
  (0, 1555)	1 
Prediction ----> True

Test Data ---->   (0, 181)	1
  (0, 249)	1
  (0, 256)	1
  (0, 336)	1
  (0, 489)	1
  (0, 630)	1
  (0, 640)	1
  (0, 712)	1
  (0, 816)	1
  (0, 854)	1
  (0, 882)	1
  (0, 915)	1
  (0, 1