In [16]:
import string
import nltk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction import text
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [17]:
spark

#### Functions

In [18]:
def clean_text (text) :
    words_List = nltk.word_tokenize(text)
    final_list = [elto for elto in words_List if elto not in STOP_WORDS]
    return " ".join(final_list)

#### Constants

In [19]:
STOP = text.ENGLISH_STOP_WORDS
STOP_WORDS = list(STOP) + list(string.punctuation)

### Read two .csv files and merge them into one dataset 

In [20]:
data_folder = 'data\\'
channel1 = 'loltyler1'
channel2 = 'gothamchess'

channel1_csv = data_folder + channel1 + '.csv'
channel2_csv = data_folder + channel2 + '.csv'

df1 = pd.read_csv(channel1_csv, index_col=0)
df2 = pd.read_csv(channel2_csv, index_col=0)

df = pd.concat([df1, df2], ignore_index=True) 

df.replace(to_replace={'channel': {'#loltyler1':0, '#gothamchess':1}}, inplace=True)
df.drop(columns='datetime', inplace=True)

print(df.shape)

(10005, 3)


### Preprocessing

In [21]:
# Notice that we want Sleep = SLEEP = SlEEp = sleeP ETC
df.loc[:, 'message'] = df.loc[:, 'message'].str.lower()

# Drop NaN values
df.dropna(inplace=True, subset=['channel', 'message'])

# Remove words like: can, could, will, been, would...
df.loc[:, 'message'] = df.loc[:, 'message'].apply(clean_text)

# stem separate words
stemmer = SnowballStemmer("english")
df.loc[:, 'message'] = df.loc[:, 'message'].astype(str).str.split()
df.loc[:, 'message'] = df.loc[:, 'message'].apply(lambda x: [stemmer.stem(word) for word in x])

# Remove rows with empty messages
df = df[df['message'].astype(bool)]

# Rejoin list of messages to single string message separated by <space>
df.loc[:, 'message'] = df.loc[:, 'message'].apply(lambda x: ' '.join(x))

df

Unnamed: 0,username,channel,message
1,valakino,0,kekw heal
2,pidhiii,0,t1 belong d1 t1 belong d1 t1 belong d1 t1 belo...
3,w_clooney1,0,bigbroth hahashrugright curselit mingle curselit
4,river_shen_main,0,poppi n't win trade 's just bad clueless
5,mrp_ositive,0,omegalul kr joke
...,...,...,...
10000,lukagaric03,1,petpet feelsokayman
10001,clumsyrook,1,meetbal sub
10002,prestigedgamer,1,windmillll
10003,mastoblood,1,subway overr


#### Add username to message to increase accuracy

In [22]:
df.loc[:, 'message'] = df['message'] + ' ' + df['username']

#### Rename columns to fit PySpark convention (Probably not necessary) and get features and targets

In [23]:
df.rename(columns={'channel': 'label'}, inplace=True)

final_df = df.loc[:, ['message', 'label']]
final_df

Unnamed: 0,message,label
1,kekw heal valakino,0
2,t1 belong d1 t1 belong d1 t1 belong d1 t1 belo...,0
3,bigbroth hahashrugright curselit mingle cursel...,0
4,poppi n't win trade 's just bad clueless river...,0
5,omegalul kr joke mrp_ositive,0
...,...,...
10000,petpet feelsokayman lukagaric03,1
10001,meetbal sub clumsyrook,1
10002,windmillll prestigedgamer,1
10003,subway overr mastoblood,1


### Convert pandas DataFrame to PySpark DataFrame

In [24]:
spark_df = spark.createDataFrame(final_df)

# Uncomment if you want to be as excited as I am right now 
spark_df.printSchema()
# spark_df.show()

root
 |-- message: string (nullable = true)
 |-- label: long (nullable = true)



In [25]:
spark_df.head(5)
spark_df.printSchema()

root
 |-- message: string (nullable = true)
 |-- label: long (nullable = true)



In [26]:
# break the sentence into a list of words
tokenizer = Tokenizer(inputCol="message", outputCol="words")
words_data = tokenizer.transform(spark_df)

# TF section
hashing_TF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=200000)
featurized_data = hashing_TF.transform(words_data)

# IDF section
idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)

rescaled_data = idf_model.transform(featurized_data)

rescaled_data.select('label', 'features').show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(200000,[18826,17...|
|    0|(200000,[61628,11...|
|    0|(200000,[23576,71...|
|    0|(200000,[17080,21...|
|    0|(200000,[100757,1...|
|    0|(200000,[43144,46...|
|    0|(200000,[39348,60...|
|    0|(200000,[119345,1...|
|    0|(200000,[47396,12...|
|    0|(200000,[21263,11...|
|    0|(200000,[13846,67...|
|    0|(200000,[13258,12...|
|    0|(200000,[5931,268...|
|    0|(200000,[136498,1...|
|    0|(200000,[27243,80...|
|    0|(200000,[58539,62...|
|    0|(200000,[34051,36...|
|    0|(200000,[21869,19...|
|    0|(200000,[100757,1...|
|    0|(200000,[108697,1...|
+-----+--------------------+
only showing top 20 rows



### Classifier training and evaluation

In [27]:
splits = rescaled_data.randomSplit(weights=[0.8, 0.2], seed=42)

train = splits[0]
test = splits[1]

nb = NaiveBayes(smoothing=0.1, modelType='multinomial')

model = nb.fit(train)

In [28]:
predictions = model.transform(test)
predictions.show()

evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')

accuracy = evaluator.evaluate(predictions)
print(f'Test set accuracy = {str(accuracy)}')

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             message|label|               words|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|'d funni make vid...|    0|['d, funni, make,...|(200000,[9619,246...|(200000,[9619,246...|[-631.96558218836...|[2.59263589926952...|       1.0|
|'s just sad go wa...|    0|['s, just, sad, g...|(200000,[21388,34...|(200000,[21388,34...|[-475.58046557256...|[0.99999997156466...|       0.0|
|      -1 egusplosion|    0|   [-1, egusplosion]|(200000,[115979,1...|(200000,[115979,1...|[-130.36629473927...|[1.0,5.0349297644...|       0.0|
|-200 bigbroth bop...|    0|[-200, bigbroth, ...|(200000,[91560,10...|(200000,[91560,10...|[-184.10270538766...|[1.0,3.1029660591.

In [29]:
model.save("models/multinomialNB")
# model.write().overwrite().save("models/multinomialNB")

In [None]:
# NaiveBayesModel.load('models\\multinomialNB')