In [28]:
import neurolab as nl
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from Model import Model
import neurolab as nl
from enum import Enum
import nltk
from pandas import DataFrame

RANDOMS_SEED = 420
TEST_SIZE = 0.4


In [26]:

class PreprocessName(Enum):
    LEMMATIZE = "lemm"
    PORTER_STEMMER = "porter"
    SNOWBALL_STEMMER = "snowball"
    LANCASTER_STEMMER = "lancaster"


class Preprocess:

    def __init__(self):
        self.data = self.load_data()

    def load_data(self):
        directory = os.path.dirname(os.path.realpath(''))
        directory = os.path.join(
            directory, "COMP237_GroupProject", "YouTube-Spam-Collection-v1")
        files = [f for f in os.listdir(directory)]

        # Create dataframe
        # Columns: COMMENT_ID, AUTHOR, DATE, CONTENT, TAG
        result = pd.concat(
            (pd.read_csv(os.path.join(directory, f)) for f in files))

        # Content and class matter, keep two columns only
        result = result[['CONTENT', 'CLASS']]
        return result

    def word_Lemmatizer(self, texts):
        return ''.join([nltk.WordNetLemmatizer().lemmatize(word=x)for x in texts])

    def word_PorterStemmer(self, texts):
        return ''.join([nltk.PorterStemmer().stem(word=x)for x in texts])

    def word_SnowballStemmer(self, texts):
        return ''.join([nltk.SnowballStemmer(language='english').stem(x)for x in texts])

    def word_LancasterStemmer(self, texts):
        return ''.join([nltk.LancasterStemmer().stem(word=x)for x in texts])

    def dataframe_Lemmatizer(self):
        tmp = self.data.copy()
        tmp['CONTENT'] = tmp.apply(
            lambda x: self.word_Lemmatizer(x['CONTENT']), axis=1)
        return tmp

    def dataframe_PorterStemmer(self):
        tmp = self.data.copy()
        tmp['CONTENT'] = tmp.apply(
            lambda x: self.word_PorterStemmer(x['CONTENT']), axis=1)
        return tmp

    def dataframe_SnowballStemmer(self):
        tmp = self.data.copy()
        tmp['CONTENT'] = tmp.apply(
            lambda x: self.word_SnowballStemmer(x['CONTENT']), axis=1)
        return tmp

    def dataframe_LancasterStemmer(self):
        tmp = self.data.copy()
        tmp['CONTENT'] = tmp.apply(
            lambda x: self.word_LancasterStemmer(x['CONTENT']), axis=1)
        return tmp

    def count_vectorizer(self, data):
        # Change mid_df for adjust nmber of vocabulary will be use, if the word frequency lower than min_df, it won't showw
        count_vectorizer = CountVectorizer(stop_words='english', min_df=6)
        bag_of_words = count_vectorizer.fit_transform(
            [content for content in data['CONTENT']])
        return pd.DataFrame(bag_of_words.toarray(
        ), columns=count_vectorizer.get_feature_names_out())

    def padding():
        return ''

    def get_tokenlized_df(self, preprocess: PreprocessName):
        df = None
        if preprocess == PreprocessName.LANCASTER_STEMMER:
            df = self.dataframe_LancasterStemmer()
        elif preprocess == PreprocessName.LEMMATIZE:
            df = self.dataframe_Lemmatizer()
        elif preprocess == PreprocessName.PORTER_STEMMER:
            df = self.dataframe_PorterStemmer()
        elif preprocess == PreprocessName.SNOWBALL_STEMMER:
            df = self.dataframe_SnowballStemmer()
        return self.count_vectorizer(df)


In [27]:
class Model:

    def __init__(self, token_data: DataFrame, class_data: DataFrame):
        self.token_data = token_data
        self.class_data = class_data
        self.x_test, self.x_train, self.y_test, self.y_train = self.train_test_split()

    def train_test_split(self, test_size: float = 0.3, random_seed: int = 420):
        return train_test_split(
            self.token_data, self.class_data, test_size=test_size, random_state=random_seed)

    def min_max_pair(self):
        return [[self.token_data.loc[:, col].min(), self.token_data.loc[:, col].max()] for col in self.token_data.columns]


In [14]:
# Preprocess
preprocess = Preprocess()
lancaster_df = preprocess.get_tokenlized_df(PreprocessName.LANCASTER_STEMMER)
# lemmatize_df = preprocess.get_tokenlized_df(PreprocessName.LEMMATIZE)
# porter_df = preprocess.get_tokenlized_df(PreprocessName.PORTER_STEMMER)
# snowball_df = preprocess.get_tokenlized_df(PreprocessName.SNOWBALL_STEMMER)



In [22]:
preprocess.data['CLASS']


0      0
1      0
2      1
3      0
4      0
      ..
345    0
346    0
347    1
348    1
349    0
Name: CLASS, Length: 1956, dtype: int64

In [23]:
# Split datam into train and test
model = Model(lancaster_df, preprocess.data['CLASS'])

# Create new network
nn_ex1 = nl.net.newff(model.min_max_pair(), [100, 50, 25])


In [25]:
error_progress_ex1 = nn_ex1.train(model.x_train, model.y_train,
                                  epochs=1000, show=15, goal=0.00001)


AssertionError: 