In [28]:
import string
import nltk
# EDIT
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction import text

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, when
from pyspark.sql.types import StringType
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)


In [29]:
sc

In [30]:
spark

In [31]:
import threading

# Helper thread to avoid the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    def __init__(self, ssc):
        super().__init__()
        self.ssc = ssc
    def run(self):
        self.ssc.start()
        self.ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [32]:
def clean_text (text) :
    words_List = nltk.word_tokenize(text)
    final_list = [elto for elto in words_List if elto not in STOP_WORDS]
    return " ".join(final_list)

In [33]:
def preprocess(df):
    # Include the username in the text to increase classification accuracy
    # From tests we can see that adding it before or after processing the text data doesn't matter
    df.loc[:, 'message'] = df['message'] + ' ' + df['username']

    # Notice that we want Sleep = SLEEP = SlEEp = sleeP ETC   
    df.loc[:, 'message'] = df.loc[:, 'message'].str.lower()

    # Drop NaN values
    df.dropna(inplace=True, subset=['channel', 'message'])

    # Remove words like: can, could, will, been, would...
    df.loc[:, 'message'] = df.loc[:, 'message'].apply(clean_text)

    # stem separate words
    stemmer = SnowballStemmer("english")
    df.loc[:, 'message'] = df.loc[:, 'message'].astype(str).str.split()
    df.loc[:, 'message'] = df.loc[:, 'message'].apply(lambda x: [stemmer.stem(word) for word in x])

    # Remove rows with empty messages
    df = df[df['message'].astype(bool)]

    # Rejoin list of messages to single string message separated by <space>
    df.loc[:, 'message'] = df.loc[:, 'message'].apply(lambda x: ' '.join(x))

    df.rename(columns={'channel': 'label'}, inplace=True)

    final_df = df.loc[:, ['message', 'label']]

    # print(final_df)

    return final_df

In [34]:
MODELS_PATH = 'models\\'
MODEL = 'multinomialNB'
STOP = text.ENGLISH_STOP_WORDS
STOP_WORDS = list(STOP) + list(string.punctuation)

# Create a list of predictions to concat all predictions and later save them into .csv format
# for further processing
predictions_list = []

mapping = {0:'#loltyler1', 1:'#gothamchess'}

globals()['models_loaded'] = False
globals()['my_model'] = None

globals()['my_model'] = NaiveBayesModel.load(MODELS_PATH+MODEL)
globals()['models_loaded'] = True

In [35]:
# globals()['models_loaded'] = False
# globals()['my_model'] = None

# Toy predict function. Normally you'd use your loaded globals()['my_model'] here
# def predict(df):
#     df.show()
#     print(globals()['my_model'])
#     predictions = globals()['my_model'].transform(df)
#     # predictions = predictions.withColumn('prediction', 
#     #                     when(col('prediction') == 0, lit(mapping[0])).otherwise(lit(mapping[1])))
    
#     predictions.show()
#     return predictions.prediction

# predict_udf = udf(predict, StringType())

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to PySpark DataFrame
    df = spark.read.json(rdd)
    
    # Convert to Pandas DataFrame for preprocessing
    df_pandas = df.toPandas()
    df_pandas = preprocess(df_pandas)
    # Reconvert to PySpark DataFrame (I'm sure there is a better way to do this)
    df = spark.createDataFrame(df_pandas)

    # break the sentence into a list of words
    tokenizer = Tokenizer(inputCol="message", outputCol="words")
    words_data = tokenizer.transform(df)

    # TF section
    hashing_TF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=200000)
    featurized_data = hashing_TF.transform(words_data)

    # IDF section
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idf_model = idf.fit(featurized_data)

    rescaled_data = idf_model.transform(featurized_data)
    
    # rescaled_data.show()

    # # Utilize our predict function
    # df_withpreds = df.withColumn("pred", predict_udf(
    #     struct([df[x] for x in df.columns])
    # ))
    # df_withpreds.show()
    
    # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict as we did here (you can)
    # but an MLlib model you've built and saved with Spark
    # In this case, you need to prevent loading your model in every call to "process" as follows:
    
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = NaiveBayesModel.load(MODELS_PATH+MODEL)
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model: 
    df_result = globals()['my_model'].transform(rescaled_data)
    df_result = df_result.withColumn('prediction', 
        when(col('prediction') == 0, lit(mapping[0])).otherwise(lit(mapping[1])))


    to_concat = df_result.toPandas()
    predictions_list.append(to_concat)

    try:
        df_result.select(['message', 'label', 'probability', 'prediction']).show()
    except Exception as e:
        pass
        # Uncomment to see what went wrong
        # print(e)

In [36]:
print(globals()['my_model'])

NaiveBayesModel: uid=NaiveBayes_b32a0b916ba2, modelType=multinomial, numClasses=2, numFeatures=200000


In [37]:
ssc = StreamingContext(sc, 10)

In [38]:
lines = ssc.socketTextStream("localhost", 8080)
lines.foreachRDD(process)

In [39]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

+-------------------+----------+--------------------+------------+
|            message|     label|         probability|  prediction|
+-------------------+----------+--------------------+------------+
|     lmfao based_on|#loltyler1|[0.71830478294058...|  #loltyler1|
|        josevandamm|#loltyler1|[0.51661046677587...|  #loltyler1|
|kekw gg rosenkreutz|#loltyler1|[0.77659288613692...|  #loltyler1|
| kekw leonboonkgang|#loltyler1|[0.63481813622216...|  #loltyler1|
|       kekw dedal57|#loltyler1|[0.63481813622216...|  #loltyler1|
|       lol jeano135|#loltyler1|[0.31829476007431...|#gothamchess|
|     kekw iokyruezi|#loltyler1|[0.63481813622216...|  #loltyler1|
+-------------------+----------+--------------------+------------+

+--------------------+----------+--------------------+------------+
|             message|     label|         probability|  prediction|
+--------------------+----------+--------------------+------------+
| rank 0nepunchm4ng0d|#loltyler1|[1.0,2.1687697220...|  #l

In [40]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+--------------------+----------+--------------------+------------+
|             message|     label|         probability|  prediction|
+--------------------+----------+--------------------+------------+
|bigfroob lulw tib...|#loltyler1|[0.99995412166098...|  #loltyler1|
|bigbroth want los...|#loltyler1|[0.99999998408746...|  #loltyler1|
|mercywing1 refund...|#loltyler1|[2.56421919832332...|#gothamchess|
|winnabl rooftopin...|#loltyler1|[0.99999156524256...|  #loltyler1|
|emoneylemon cuck ...|#loltyler1|[0.55957947061297...|  #loltyler1|
|donavalen ez doub...|#loltyler1|[0.99999999990964...|  #loltyler1|
|   uptimr trillstura|#loltyler1|[0.53651122823302...|  #loltyler1|
|fuck youo chat ca...|#loltyler1|[0.99995326437445...|  #loltyler1|
|     rank titans6064|#loltyler1|[0.99999995084731...|  #loltyler1|
|titans6064 accoun...|#loltyler1|[1.0,9.0424889081...|  #loltyler1|
|    uptim trillstura|#loltyler1|[0.95375507162000...|  #loltyle

In [41]:
predictions_list

[               message       label                    words  \
 0       lmfao based_on  #loltyler1        [lmfao, based_on]   
 1          josevandamm  #loltyler1            [josevandamm]   
 2  kekw gg rosenkreutz  #loltyler1  [kekw, gg, rosenkreutz]   
 3   kekw leonboonkgang  #loltyler1    [kekw, leonboonkgang]   
 4         kekw dedal57  #loltyler1          [kekw, dedal57]   
 5         lol jeano135  #loltyler1          [lol, jeano135]   
 6       kekw iokyruezi  #loltyler1        [kekw, iokyruezi]   
 
                                          rawFeatures  \
 0  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
 1  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
 2  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
 3  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
 4  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
 5  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
 6  (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
 
                            

In [43]:
temp_df = pd.concat(predictions_list, ignore_index=True)
temp_df.head(5)

Unnamed: 0,message,label,words,rawFeatures,features,rawPrediction,probability,prediction
0,lmfao based_on,#loltyler1,"[lmfao, based_on]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-30.608744457175206, -31.544812729035403]","[0.7183047829405899, 0.28169521705941003]",#loltyler1
1,josevandamm,#loltyler1,[josevandamm],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-20.04533518829217, -20.111801514015728]","[0.5166104667758784, 0.4833895332241217]",#loltyler1
2,kekw gg rosenkreutz,#loltyler1,"[kekw, gg, rosenkreutz]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-31.786637946078258, -33.032558473326446]","[0.7765928861369273, 0.2234071138630727]",#loltyler1
3,kekw leonboonkgang,#loltyler1,"[kekw, leonboonkgang]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-22.064097375408473, -22.617040447211533]","[0.6348181362221689, 0.3651818637778312]",#loltyler1
4,kekw dedal57,#loltyler1,"[kekw, dedal57]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-22.064097375408473, -22.617040447211533]","[0.6348181362221689, 0.3651818637778312]",#loltyler1


In [44]:
correct = temp_df['label'] == temp_df['prediction']
correct.sum()
len(correct)

7342

In [45]:
correct.sum() / len(correct) * 100

84.11876872786706

In [46]:
temp_df.to_csv('predictions\multinomialNB_predictions.csv')