In [177]:
import os
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

from nltk.stem.snowball import SnowballStemmer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from sklearn.feature_extraction import text

#### Functions

In [178]:
def clean_text(text):
    """Remove stop words and punctuation tokens from a message.

    The input string is split into tokens with ``nltk.word_tokenize``. Every
    token that appears in the module-level ``STOP_WORDS`` collection is
    discarded (the comparison is exact, so it is case-sensitive), and the
    remaining tokens are joined back together with single spaces.

    Parameters
    ----------
    text : str
        The message to clean. Inside this function the name shadows the
        ``text`` module imported from scikit-learn.

    Returns
    -------
    str
        The surviving tokens separated by one space each; an empty string
        when no token survives.
    """
    kept_tokens = filter(
        lambda token: token not in STOP_WORDS,
        nltk.word_tokenize(text),
    )
    return " ".join(kept_tokens)

#### Constants

In [179]:
# Tokens dropped by clean_text: scikit-learn's English stop-word set followed
# by every character of string.punctuation (each one as its own entry).
STOP = text.ENGLISH_STOP_WORDS
STOP_WORDS = [*STOP, *string.punctuation]

### Read two .csv files and merge them into one dataset 

In [180]:
# Build the folder prefix with the platform's own separator. The previous
# literal 'data\\' only resolved on Windows; os.path.join('data', '') yields
# 'data\\' there and 'data/' on POSIX systems, so the string concatenation
# below keeps working everywhere.
data_folder = os.path.join('data', '')
channel1 = 'iitztimmy'
channel2 = 'pgl'

channel1_csv = data_folder + channel1 + '.csv'
channel2_csv = data_folder + channel2 + '.csv'

# The first CSV column is used as the row index of each frame.
df1 = pd.read_csv(channel1_csv, index_col=0)
df2 = pd.read_csv(channel2_csv, index_col=0)

# Stack both chat logs and renumber the rows 0..n-1.
df = pd.concat([df1, df2], ignore_index=True)

# Encode the channel name as the numeric class label (0 / 1) and discard the
# timestamp column, which is not used further in this section.
df.replace(to_replace={'channel': {'#iitztimmy': 0, '#pgl': 1}}, inplace=True)
df.drop(columns='datetime', inplace=True)

print(df.shape)

(6000, 3)


### Preprocessing

In [181]:
# Lower-case first so that Sleep == SLEEP == SlEEp == sleeP, etc.
# (.str.lower() leaves missing values as NaN; they are dropped next.)
df['message'] = df['message'].str.lower()

# Drop rows with a missing label or a missing message.
df.dropna(inplace=True, subset=['channel', 'message'])

# Remove stop words (can, could, will, been, would...) and punctuation tokens.
df['message'] = df['message'].apply(clean_text)

# Stem every word separately: split each message into a list of words and
# replace each word by its Snowball stem.
stemmer = SnowballStemmer("english")
df['message'] = df['message'].astype(str).str.split()
df['message'] = df['message'].apply(lambda words: [stemmer.stem(word) for word in words])

# Remove rows whose message is now an empty list. The explicit .copy() makes
# the filtered frame independent of the original, so the assignment below
# writes to a real frame instead of a slice (avoids SettingWithCopyWarning).
df = df[df['message'].astype(bool)].copy()

# Rejoin each list of stems into a single string separated by <space>.
df['message'] = df['message'].apply(' '.join)

df

Unnamed: 0,username,channel,message
0,cgtiwnl,0,mfker got gang lmao
1,out_smoked,0,’ d timmi
2,fourthhokage20,0,ouch
3,streamelements,0,new youtub channel arriv iitzaaaa subscrib sec...
4,thedarky5,0,sad
...,...,...,...
5995,blessed909,1,gg
5996,bydrop,1,astrali realis awara
5997,bloombird,1,lol konfig push middl xd gogogo
5998,emzee17,1,astrali fan 2022 kekw kekw


#### Rename the target column to `label` (the column name the PySpark estimator and evaluator below use) and select the feature and target columns

In [182]:
# Rename the target column in place: the evaluator further down is configured
# with labelCol='label'.
df.rename({'channel': 'label'}, axis='columns', inplace=True)

# Keep only the text feature and its target for the Spark hand-off.
final_df = df[['message', 'label']]
final_df

Unnamed: 0,message,label
0,mfker got gang lmao,0
1,’ d timmi,0
2,ouch,0
3,new youtub channel arriv iitzaaaa subscrib sec...,0
4,sad,0
...,...,...
5995,gg,1
5996,astrali realis awara,1
5997,lol konfig push middl xd gogogo,1
5998,astrali fan 2022 kekw kekw,1


### Convert pandas DataFrame to PySpark DataFrame

In [183]:
# `spark` was never created in this notebook, so the cell only worked when the
# kernel happened to inject it. getOrCreate() returns the already-active
# session when there is one and starts a new one otherwise, which keeps the
# notebook runnable after Restart & Run All.
spark = SparkSession.builder.getOrCreate()

spark_df = spark.createDataFrame(final_df)

# Print the inferred column types; uncomment the last line to preview rows too.
spark_df.printSchema()
# spark_df.show()

root
 |-- message: string (nullable = true)
 |-- label: long (nullable = true)



In [184]:
# Preview the first five rows. head(5) was not the last expression of the
# cell, so its result was computed and silently discarded; show(5) prints the
# rows regardless of its position in the cell.
spark_df.show(5)
spark_df.printSchema()

root
 |-- message: string (nullable = true)
 |-- label: long (nullable = true)



In [185]:
# Size of the hashed term-frequency vector, named so it can be tuned in one
# place instead of being buried in the HashingTF call.
# NOTE(review): the saved output of this cell shows 20-dimensional vectors,
# which does not match the value used here - re-run the cell to refresh it.
NUM_FEATURES = 2000

# Break each message into a list of words.
tokenizer = Tokenizer(inputCol="message", outputCol="words")
words_data = tokenizer.transform(spark_df)

# TF section: hash every word into one of NUM_FEATURES buckets and count.
hashing_TF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=NUM_FEATURES)
featurized_data = hashing_TF.transform(words_data)

# IDF section: fit the inverse-document-frequency weights on the whole data
# set, then rescale the raw term counts with them.
idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)

rescaled_data = idf_model.transform(featurized_data)

rescaled_data.select('label', 'features').show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(20,[3,6,10,11],[...|
|    0|(20,[1,10,12],[2....|
|    0|(20,[6],[1.930570...|
|    0|(20,[2,4,5,7,11,1...|
|    0|(20,[18],[2.31060...|
|    0|(20,[15],[2.14526...|
|    0|(20,[9],[2.492237...|
|    0|(20,[4],[4.431827...|
|    0|(20,[4,6],[2.2159...|
|    0|(20,[17],[2.09157...|
|    0|(20,[1,4,12,17],[...|
|    0|(20,[6],[1.930570...|
|    0|(20,[4],[6.647740...|
|    0|(20,[11],[2.30207...|
|    0|(20,[2,3,7,12,19]...|
|    0|(20,[15],[2.14526...|
|    0|(20,[2,3,5,6,7,8,...|
|    0|(20,[0],[2.295307...|
|    0|(20,[0],[2.295307...|
|    0|(20,[4],[2.215913...|
+-----+--------------------+
only showing top 20 rows



### Classifier training and evaluation

In [190]:
# Random 80/20 split of the TF-IDF rows; the seed is fixed so the split can be
# repeated.
splits = rescaled_data.randomSplit(seed=42, weights=[0.8, 0.2])
train, test = splits

# Multinomial Naive Bayes with smoothing 0.1, fitted on the training portion.
nb = NaiveBayes(modelType='multinomial', smoothing=0.1)
model = nb.fit(train)

In [191]:
# Score the held-out rows and print the resulting frame.
predictions = model.transform(test)
predictions.show()

# Accuracy: compares the 'prediction' column against the 'label' column.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)

print(f'Test set accuracy = {str(accuracy)}')

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             message|label|               words|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|'s 10 min 3 day i...|    0|['s, 10, min, 3, ...|(20,[1,6,8,13,19]...|(20,[1,6,8,13,19]...|[-42.292691079385...|[0.99912316806066...|       0.0|
|            's oiler|    0|         ['s, oiler]|(20,[1,8],[1.0,1.0])|(20,[1,8],[2.1152...|[-13.857129226593...|[0.59994793880206...|       0.0|
|                 0-0|    0|               [0-0]|     (20,[11],[1.0])|(20,[11],[2.30207...|[-8.5005188499021...|[0.32760197849977...|       1.0|
|              10 min|    0|           [10, min]|(20,[13,19],[1.0,...|(20,[13,19],[2.08...|[-13.610436642826...|[0.86865954862321.