# Imports

In [67]:
import pandas as pd
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Prepare Training Data

### Granulare Benchmarks:
- Speichergröße (Json, Excel, vs. Parquet)
- Inferenz Dauer (Sklearn, Spark ML)
- Troughput (Spark Streaming, Wie viele Tweets pro Sekunde verarbeiten)

### Architektur
Architektur Training:
- Data Lake (Annotierte Trainingsdaten ca. 1.2 Mio Tweets)
- Trainingspipeline (Spark ML) > Modell für Inferenz

Architektur Live-Hate-Classification:
- Data Source (Live simuliert mit Twitter Grabs ca. 20 GB lokal)
- Inferenzpipeline (Spark Streaming)
- Data Lake (Gefunde Hate Speech wird gespeichert)

Architektur Hate-Report
- Data Source (Gefunde Hate Speech)
- Analysen (Wörter, Nutzer, Ländern, ...)

Setup

In [1]:
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


MAX_MEMORY = "6g"
spark = SparkSession.builder \
                    .appName('multi_class_text_classifiter')\
                    .master("local[*]") \
                    .config("spark.executor.memory", MAX_MEMORY) \
                    .config("spark.driver.memory", MAX_MEMORY) \
                    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.3.1") \
                    .getOrCreate()

print("Apache Spark version: ", spark.version)
spark

Apache Spark version:  3.3.1


Load Data from Data Lake

In [3]:
spark_df = spark.read.parquet('../data/parquet_data')
spark_df = spark_df.withColumnRenamed("tweet_text","text")
spark_df.count()

1198584

In [6]:
# spark_df = spark_df.withColumn("text",regexp_replace(col('text'), '\d+', ''))
# spark_df.show(5)

+-------------------+--------------------+--------------+
|              index|          tweet_text|majority_label|
+-------------------+--------------------+--------------+
|1108866829991272448|@ finna fuck Pont...|             0|
|1058874314303320064|t don mind me, ’ ...|             1|
|1109486326477438976|A Law played jude...|             0|
|1062399239337140224|Review of: Heart ...|             0|
|1113926202006360064|Nigga when the yo...|             0|
+-------------------+--------------------+--------------+
only showing top 5 rows



In [6]:
def clean_text(c):
  c = lower(c)
  c = regexp_replace(c, "(https?\://)\S+", "") # Remove links
  c = regexp_replace(c, "(\\n)|\n|\r|\t", "") # Remove CR, tab, and LR
  c = regexp_replace(c, "(?:(?:[0-9]{2}[:\/,]){2}[0-9]{2,4})", "") # Remove dates
  c = regexp_replace(c, "@([A-Za-z0-9_]+)", "") # Remove usernames
  c = regexp_replace(c, "[0-9]", "") # Remove numbers
  c = regexp_replace(c, "\:|\/|\#|\.|\?|\!|\&|\"|\,", "") # Remove symbols
  return c

spark_df = spark_df.withColumn("text", clean_text(col("text")))

spark_df.show(5)

+-------------------+--------------------+--------------+
|              index|                text|majority_label|
+-------------------+--------------------+--------------+
|1108866829991272448|@ finna fuck pont...|             0|
|1058874314303320064|t don mind me ’ i...|             1|
|1109486326477438976|a law played jude...|             0|
|1062399239337140224|review of heart b...|             0|
|1113926202006360064|nigga when the yo...|             0|
+-------------------+--------------------+--------------+
only showing top 5 rows



In [38]:
# spark_df_sample =  spark_df
spark_df_sample = spark_df#.sample() #fraction=0.1
spark_df_sample.count()

1198584

# Tokenization und Word2Vec

In [24]:
# from pyspark.ml.feature import Word2Vec
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import Tokenizer
# from pyspark.ml.feature import StopWordsRemover

# # 'We hate religion' > 'We' 'hate' 'religion'
# tokenizer = Tokenizer(inputCol="text", outputCol="tokens")

# # 'We' > (0.000, 0.032432, ...) 300 Dimensionen
# w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="tokens", outputCol="features")

# doc2vec_pipeline = Pipeline(stages=[tokenizer, w2v])
# doc2vec_model = doc2vec_pipeline.fit(spark_df_sample)
# doc2vecs_df = doc2vec_model.transform(spark_df_sample)

# doc2vec_model.write().overwrite().save("../models/prep_tok2vec")

# Tokenization und TF

In [44]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer


# 'We hate religion' > 'We' 'hate' 'religion'
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W")

# Remove stop words
stopwordsRemover = StopWordsRemover(inputCol="tokens", outputCol="filtered")

# Term frequency
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

doc2tf_pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors])
doc2tf_model = doc2tf_pipeline.fit(spark_df_sample)
doc2tf_df = doc2tf_model.transform(spark_df_sample)

In [27]:
doc2tf_model.write().overwrite().save("../models/prep_tok2tf")

# Model Training

In [43]:
# Data from Tok2Vec
# hate_train_df, hate_test_df = doc2vecs_df.randomSplit([0.8, 0.2])

# Data from Tok2Tf
hate_train_df, hate_test_df = doc2tf_df.randomSplit([0.8, 0.2])

In [42]:
print("Training Dataset Count: " + str(hate_train_df.count()))
hate_train_df.show(5)
times_hate = hate_train_df.filter(hate_train_df['majority_label'] > 0.0).count()
print(f'Times hate in training: {times_hate}')
times_not_hate = hate_train_df.filter(hate_train_df['majority_label'] == 0.0).count()
print(f'Times not hate in training: {times_not_hate}')

print("Test Dataset Count: " + str(hate_test_df.count()))
hate_test_df.show(5)
times_hate = hate_test_df.filter(hate_test_df['majority_label'] > 0.0).count()
print(f'Times hate in test: {times_hate}')
times_not_hate = hate_test_df.filter(hate_test_df['majority_label'] == 0.0).count()
print(f'Times not hate in test: {times_not_hate}')

Training Dataset Count: 958927
+-------------------+--------------------+--------------+--------------------+--------------------+--------------------+
|              index|                text|majority_label|              tokens|            filtered|            features|
+-------------------+--------------------+--------------+--------------------+--------------------+--------------------+
|1023940590382268416|Eamon0303 @ @ CNN...|             0|[eamon0303, cnn, ...|[eamon0303, cnn, ...|(10000,[0,1,17,12...|
|1023940826882293760|@ meloIigya re yo...|             1|[meloiigya, re, y...|[meloiigya, re, b...|(10000,[0,1,10,43...|
|1023940897346658304|Some of y ’ just ...|             0|[some, of, y, jus...|[y, darts, throwi...|(10000,[0,1,42,47...|
|1023942214844657664|Ye ‘ either sound...|             0|[ye, either, soun...|[ye, either, soun...|(10000,[0,1,3,30,...|
|1023942220838264832|@ wannabwinehouse...|             0|[wannabwinehouse,...|[wannabwinehouse,...|(10000,[0,1,3,415...|
+

## Logistic Regression

In [29]:
# from pyspark.ml.classification import LogisticRegression

# lr_classifier = LogisticRegression(family="multinomial", labelCol="majority_label", featuresCol="features")

# lr_classifier_pipeline = Pipeline(stages=[lr_classifier])
# lr_trained_pipeline = lr_classifier_pipeline.fit(hate_train_df)
# predictions = lr_trained_pipeline.transform(hate_test_df)

# lr_model_evaluator = MulticlassClassificationEvaluator(
#     labelCol="majority_label", predictionCol="prediction", metricName="accuracy")

# accuracy = lr_model_evaluator.evaluate(predictions)
# print("Accuracy = %g" % (accuracy))

# times_hate = predictions.filter(predictions['prediction'] == 1.0).count()
# print(f'Times hate detected: {times_hate}')
# times_not_hate = predictions.filter(predictions['prediction'] == 0.0).count()
# print(f'Times not hate detected: {times_not_hate}')

Accuracy = 0.774575
Times hate detected: 4314
Times not hate detected: 19694


Save and Load Logistic Regression

In [30]:
# lr_trained_pipeline.write().overwrite().save("../models/model_lr")

## Naive Bayes

In [46]:
from pyspark.ml.classification import NaiveBayes
classifier = NaiveBayes(smoothing=1, labelCol="majority_label", featuresCol="features")

classifier_pipeline = Pipeline(stages=[classifier])
predictions = classifier_pipeline.fit(hate_train_df).transform(hate_test_df)

Evaluation

In [88]:
model_evaluator = MulticlassClassificationEvaluator(
    labelCol="majority_label", predictionCol="prediction", metricName="accuracy")

accuracy = model_evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))

times_hate = predictions.filter(predictions['prediction'] == 1.0).count()
print(f'Times hate detected: {times_hate}')
times_not_hate = predictions.filter(predictions['prediction'] == 0.0).count()
print(f'Times not hate detected: {times_not_hate}')

Accuracy = 0.75724
Times hate detected: 4691
Times not hate detected: 19308


Save and Load Naive Bayes

In [32]:
classifier_pipeline.write().overwrite().save("../models/model_nb")

In [33]:
predictions.filter(predictions['prediction'] == 1.0).show(5)

+-------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|              index|                text|majority_label|              tokens|            filtered|            features|       rawPrediction|         probability|prediction|
+-------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|1024172939191754752|' she dyke energy...|             1|[she, dyke, energ...|[dyke, energy, ra...|(10000,[0,1,6,7,2...|[-35.057839778119...|[0.15343380004494...|       1.0|
|1024173075607298048|called me he that...|             0|[called, me, he, ...|[called, day, wor...|(10000,[0,1,3,4,4...|[-47.258622461489...|[0.46748544250673...|       1.0|
|1024228972417220608|calls this anyone...|             0|[calls, this, any...|[calls, anyone, a...|(10000,[0,1,3,361...|[-45.19490

# MLlib implementation

In [89]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline

spark_df_sample = spark_df #.sample(fraction=0.1) 
spark_df_sample.count()

1198584

In [90]:
hate_train_df, hate_test_df = spark_df_sample.randomSplit([0.8, 0.2])

In [91]:
# 'We hate religion' > 'We' 'hate' 'religion'
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W")

# Remove stop words
stopwordsRemover = StopWordsRemover(inputCol="tokens", outputCol="filtered")

# Term frequency
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

# Classifier
classifier = NaiveBayes(smoothing=1, labelCol="majority_label", featuresCol="features")

inference_pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, classifier])
trained_inference_pipeline = inference_pipeline.fit(hate_train_df)
predictions = trained_inference_pipeline.transform(hate_test_df)

Save MLlib Pipeline

In [92]:
trained_inference_pipeline.write().overwrite().save("../models/mllib_model_nb")

In [85]:
model_evaluator = MulticlassClassificationEvaluator(
    labelCol="majority_label", predictionCol="prediction", metricName="accuracy")

accuracy = model_evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))

# times_hate = predictions.filter(predictions['prediction'] == 1.0).count()
# print(f'Times hate detected: {times_hate}')
# times_not_hate = predictions.filter(predictions['prediction'] == 0.0).count()
# print(f'Times not hate detected: {times_not_hate}')

Accuracy = 0.75724


# SKlearn implementation

In [93]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

pandas_df = spark_df.toPandas()
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1198584 entries, 0 to 1198583
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   index           1198584 non-null  int64 
 1   text            1198584 non-null  object
 2   majority_label  1198584 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 27.4+ MB


In [94]:
pandas_df_sample = pandas_df #.sample(frac=0.1)

In [95]:
X_train, X_test, y_train, y_test = train_test_split(pandas_df_sample['text'], pandas_df_sample['majority_label'], test_size=0.2)

In [96]:
inference_pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english', min_df=2, max_features=10000)),
    ('clf', MultinomialNB()),
])


trained_inference_pipeline = inference_pipeline.fit(X_train, y_train)
y_pred = trained_inference_pipeline.predict(X_test)

Save Sklearn Pipeline

In [98]:
import joblib
joblib.dump(trained_inference_pipeline, '../models/sklearn_model_nb.pkl')
# pipeline = joblib.load('pipeline.pkl')

['../models/sklearn_model_nb.pkl']

In [None]:
accuracy_score(y_test, y_pred)

0.7656849657934257