In [10]:
import warnings
import pandas as pd
import numpy as np
import nltk
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
warnings.filterwarnings('ignore')

In [4]:
def load_data():
    directory = os.path.dirname(os.path.realpath(''))
    directory = os.path.join(directory, 'COMP237_GroupProject',"YouTube-Spam-Collection-v1")
    files = [f for f in os.listdir(directory)]

    # Create dataframe
    # Columns: COMMENT_ID, AUTHOR, DATE, CONTENT, TAG
    result = pd.concat(
        (pd.read_csv(os.path.join(directory, f)) for f in files))

    # Content and class matter, keep two columns only
    result = result[['CONTENT', 'CLASS']]

    # Show is any missing data
    print("============ Load File ============")
    print("Has na: data", result.isna().any())
    print("Has null data: ", result.isnull().any())
    print("Preview")
    print(result.head())
    return result


data = load_data()


Has na: data CONTENT    False
CLASS      False
dtype: bool
Has null data:  CONTENT    False
CLASS      False
dtype: bool
Preview
                                             CONTENT  CLASS
0  <a href="http://www.youtube.com/watch?v=KQ6zr6...      0
1                                   wierd but funny﻿      0
2  Hey guys, I&#39;m a human.<br /><br /><br />Bu...      1
3       Party Rock....lol...who wants to shuffle!!!﻿      0
4                                        Party rock﻿      0


In [5]:
def word_Lemmatizer(texts):
    return ''.join([nltk.WordNetLemmatizer().lemmatize(word=x)for x in texts])

def word_PorterStemmer(texts):
    return ''.join([nltk.PorterStemmer().stem(word=x)for x in texts])

def word_SnowballStemmer( texts):
    return ''.join([nltk.SnowballStemmer(language='english').stem(x)for x in texts])

def word_LancasterStemmer( texts):
    return ''.join([nltk.LancasterStemmer().stem(word=x)for x in texts])

def Lemmatizer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x: word_Lemmatizer(x['CONTENT']), axis=1)
    return tmp


def PorterStemmer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x: word_PorterStemmer(x['CONTENT']), axis=1)
    return tmp


def SnowballStemmer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x: word_SnowballStemmer(x['CONTENT']), axis=1)
    return tmp


def LancasterStemmer(data):
    tmp = data.copy()
    tmp['CONTENT'] = tmp.apply(
        lambda x: word_LancasterStemmer(x['CONTENT']), axis=1)
    return tmp


In [43]:
# Change mid_df for adjust nmber of vocabulary will be use, if the word frequency lower than min_df, it won't showw
count_vectorizer = CountVectorizer(stop_words='english', min_df=1)
data_vectorized = count_vectorizer.fit_transform(
    [content for content in data['CONTENT']])
data_vectorized =pd.DataFrame(data_vectorized.toarray(
), columns=count_vectorizer.get_feature_names_out())


In [7]:
data_vectorized


Unnamed: 0,00,000,10,100,15,17,19,20,200,2013,...,yall,yeah,year,years,yo,young,youtu,youtube,youtuber,zonepa
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1952,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1953,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Category Prediction

In [14]:
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(data_vectorized)
classifier = MultinomialNB().fit(train_tfidf, data['CLASS'])


In [29]:
test_data = ['New Sony TV is released this week', 'I bought a new car',
             'New drug has been invented for COVID19',
             'I hate maple leaf'
            ]

In [44]:
test_cnt = count_vectorizer.fit_transform(test_data)
test_cnt


<4x13 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [41]:
test_cnt

<4x13 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [58]:
for data, category in zip(test_data, predictions):
    print('\nTest Data', "---->" ,data,'\nPrediction', "---->" ,
          category_map[train_data.target_names[category]])


Test Data ----> New Sony TV is released this week 
Prediction ----> Electronics

Test Data ----> I bought a new car 
Prediction ----> Autos

Test Data ----> New drug has been invented for COVID19 
Prediction ----> Medicine

Test Data ----> I hate maple leaf 
Prediction ----> Hockey
