In [43]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

from nltk.stem.snowball import SnowballStemmer
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from sklearn.feature_extraction import text


#### Functions

In [24]:
def clean_text(text):
    """Tokenize a message with NLTK and drop every token found in STOP_WORDS.

    Parameters
    ----------
    text : str
        Message to clean. Tokens are compared to STOP_WORDS exactly as
        tokenized (no case folding happens here).

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single spaces.
    """
    kept_tokens = (
        token
        for token in nltk.word_tokenize(text)
        if token not in STOP_WORDS
    )
    return " ".join(kept_tokens)

#### Constants

In [25]:
# English stop words shipped with scikit-learn
STOP = text.ENGLISH_STOP_WORDS
# Tokens to discard: stop words plus every single ASCII punctuation character.
# Kept as a frozenset so the per-token `in STOP_WORDS` membership test is a
# hash lookup instead of a linear scan over a list.
STOP_WORDS = frozenset(STOP).union(string.punctuation)

### Read two .csv files and merge them into one dataset 

In [26]:
# Forward slashes resolve on Windows, Linux and macOS alike, whereas a
# backslash separator only works on Windows.
data_folder = 'data/'
channel1 = 'iitztimmy'
channel2 = 'pgl'

channel1_csv = data_folder + channel1 + '.csv'
channel2_csv = data_folder + channel2 + '.csv'

# The first CSV column is read as the row index
df1 = pd.read_csv(channel1_csv, index_col=0)
df2 = pd.read_csv(channel2_csv, index_col=0)

# Stack both channels with a fresh 0..n-1 index, encode the channel name as a
# numeric class (0 = #iitztimmy, 1 = #pgl) and drop the 'datetime' column.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .replace(to_replace={'channel': {'#iitztimmy': 0, '#pgl': 1}})
    .drop(columns='datetime')
)

print(df.shape)

(6000, 3)


### Preprocessing

In [27]:
# Lower-case everything so that Sleep = SLEEP = SlEEp = sleeP etc.
df.loc[:, 'message'] = df.loc[:, 'message'].str.lower()

# Drop rows with a missing label or a missing message
df.dropna(inplace=True, subset=['channel', 'message'])

# Remove stop words (can, could, will, been, would...) and punctuation tokens
df.loc[:, 'message'] = df.loc[:, 'message'].apply(clean_text)

# Stem each whitespace-separated word
stemmer = SnowballStemmer("english")
df.loc[:, 'message'] = df.loc[:, 'message'].astype(str).str.split()
df.loc[:, 'message'] = df.loc[:, 'message'].apply(lambda x: [stemmer.stem(word) for word in x])

# Remove rows whose word list ended up empty. The explicit .copy() makes the
# result an independent frame, so the assignment below does not write into a
# slice of the unfiltered frame (which raises SettingWithCopyWarning).
df = df[df['message'].astype(bool)].copy()

# Rejoin each list of stemmed words into a single string separated by <space>
df.loc[:, 'message'] = df.loc[:, 'message'].apply(' '.join)

df

Unnamed: 0,username,channel,message
0,cgtiwnl,0,mfker got gang lmao
1,out_smoked,0,’ d timmi
2,fourthhokage20,0,ouch
3,streamelements,0,new youtub channel arriv iitzaaaa subscrib sec...
4,thedarky5,0,sad
...,...,...,...
5995,blessed909,1,gg
5996,bydrop,1,astrali realis awara
5997,bloombird,1,lol konfig push middl xd gogogo
5998,emzee17,1,astrali fan 2022 kekw kekw


#### Rename columns to fit PySpark convention (Probably not necessary) and get features and targets

In [28]:
# The target column is called 'label' from here on
df = df.rename(columns={'channel': 'label'})

# Keep only the text feature and its target, in that order
final_df = df[['message', 'label']]
final_df

Unnamed: 0,message,label
0,mfker got gang lmao,0
1,’ d timmi,0
2,ouch,0
3,new youtub channel arriv iitzaaaa subscrib sec...,0
4,sad,0
...,...,...
5995,gg,1
5996,astrali realis awara,1
5997,lol konfig push middl xd gogogo,1
5998,astrali fan 2022 kekw kekw,1


### Convert pandas DataFrame to PySpark DataFrame

In [29]:
# `spark` is not defined anywhere else in this notebook. getOrCreate() reuses
# the active session when the kernel already provides one and starts a new
# one otherwise, so the cell also runs on a fresh plain-Python kernel.
spark = SparkSession.builder.getOrCreate()

spark_df = spark.createDataFrame(final_df)

spark_df.printSchema()
# Uncomment if you want to be as excited as I am right now
# spark_df.show()

root
 |-- message: string (nullable = true)
 |-- label: long (nullable = true)



In [30]:
spark_df.printSchema()

# Kept as the last expression of the cell so the first five rows are actually
# displayed; as a non-final statement its result was silently discarded.
spark_df.head(5)

root
 |-- message: string (nullable = true)
 |-- label: long (nullable = true)



In [31]:
# Feature stages: message -> words -> hashed term counts -> TF-IDF weights
tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashing_TF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=2000)
idf = IDF(inputCol='rawFeatures', outputCol='features')

# Break each sentence into a list of words
words_data = tokenizer.transform(spark_df)

# TF section: hash every word into one of 2000 buckets
featurized_data = hashing_TF.transform(words_data)

# IDF section
# NOTE(review): the IDF is fit on the full dataset, before the train/test
# split done in a later cell, so its weights also see the test rows - confirm
# whether that is intended.
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

rescaled_data.select('label', 'features').show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(2000,[10,1126,17...|
|    0|(2000,[501,890,13...|
|    0|(2000,[346],[6.73...|
|    0|(2000,[344,371,38...|
|    0|(2000,[438],[5.63...|
|    0|(2000,[555],[6.88...|
|    0|(2000,[689],[4.99...|
|    0|(2000,[1064],[13....|
|    0|(2000,[424,526],[...|
|    0|(2000,[157],[4.20...|
|    0|(2000,[564,837,11...|
|    0|(2000,[606],[6.28...|
|    0|(2000,[104],[17.3...|
|    0|(2000,[1531],[5.4...|
|    0|(2000,[7,22,159,2...|
|    0|(2000,[1375],[4.2...|
|    0|(2000,[149,370,44...|
|    0|(2000,[1620],[6.8...|
|    0|(2000,[1620],[6.8...|
|    0|(2000,[124],[7.07...|
+-----+--------------------+
only showing top 20 rows



### Classifier training and evaluation

In [32]:
# Multinomial Naive Bayes with smoothing 0.1
nb = NaiveBayes(modelType='multinomial', smoothing=0.1)

# 80 % train / 20 % test, with a fixed seed for the split
splits = rescaled_data.randomSplit(seed=42, weights=[0.8, 0.2])
train, test = splits

model = nb.fit(train)

In [33]:
# Accuracy computed from the 'label' and 'prediction' columns
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)

# Apply the fitted model to the test split
predictions = model.transform(test)
predictions.show()

accuracy = evaluator.evaluate(predictions)
print(f'Test set accuracy = {accuracy!s}')

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             message|label|               words|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|'s 10 min 3 day i...|    0|['s, 10, min, 3, ...|(2000,[159,206,11...|(2000,[159,206,11...|[-259.68579862189...|[1.0,3.8743596863...|       0.0|
|            's oiler|    0|         ['s, oiler]|(2000,[441,1388],...|(2000,[441,1388],...|[-45.199009860193...|[1.0,2.2737497084...|       0.0|
|                 0-0|    0|               [0-0]|  (2000,[831],[1.0])|(2000,[831],[6.88...|[-56.424825093964...|[0.53532440454198...|       0.0|
|              10 min|    0|           [10, min]|(2000,[159,1953],...|(2000,[159,1953],...|[-50.388218426707...|[0.99994570713437.

In [36]:
# Persist the fitted model through the writer with overwrite() enabled
# (used here instead of the plain model.save(path) call).
model_path = "models/multinomialNB"
model.write().overwrite().save(model_path)

In [37]:
# Reload the persisted model (same forward-slash path as used when saving)
# NaiveBayesModel.load('models/multinomialNB')

NaiveBayesModel: uid=NaiveBayes_af335a176991, modelType=multinomial, numClasses=2, numFeatures=2000