# Imports

In [1]:
import pandas as pd
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Prepare Training Data

### Architektur
Architektur Training:
- Data Lake (Annotierte Trainingsdaten ca. 1.2 Mio Tweets)
- Trainingspipeline (Spark ML) > Modell für Inferenz

Architektur Live-Hate-Classification:
- Data Source (Live simuliert mit Twitter Grabs ca. 20 GB lokal)
- Inferenzpipeline (Spark Streaming)
- Data Lake (Gefunde Hate Speech wird gespeichert)

Architektur Hate-Report
- Data Source (Gefunde Hate Speech)
- Analysen (Wörter, Nutzer, Ländern, ...)

Setup

In [2]:
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


MAX_MEMORY = "6g"
spark = SparkSession.builder \
                    .appName('multi_class_text_classifiter')\
                    .master("local[4]") \
                    .config("spark.executor.memory", MAX_MEMORY) \
                    .config("spark.driver.memory", MAX_MEMORY) \
                    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.3.1") \
                    .getOrCreate()

print("Apache Spark version: ", spark.version)
spark

Apache Spark version:  3.3.1


Trainingsdaten aus dem Data Lake (Parquet File) laden

In [3]:
spark_df = spark.read.parquet('../data/parquet_data')
spark_df = spark_df.withColumnRenamed("tweet_text","text")
spark_df.count()

1198584

# Data Preparation
Methode für die Bereinigung der Texte (z.B. Groß- und Kleinschreibung, Sondernzeichen, ...)

In [4]:
def clean_text(c):
  c = lower(c)
  c = regexp_replace(c, "(https?\://)\S+", "") # Remove links
  c = regexp_replace(c, "(\\n)|\n|\r|\t", "") # Remove CR, tab, and LR
  c = regexp_replace(c, "(?:(?:[0-9]{2}[:\/,]){2}[0-9]{2,4})", "") # Remove dates
  c = regexp_replace(c, "@([A-Za-z0-9_]+)", "") # Remove usernames
  c = regexp_replace(c, "[0-9]", "") # Remove numbers
  c = regexp_replace(c, "\:|\/|\#|\.|\?|\!|\&|\"|\,", "") # Remove symbols
  return c

spark_df = spark_df.withColumn("text", clean_text(col("text")))

spark_df.show(5)

+-------------------+--------------------+--------------+
|              index|                text|majority_label|
+-------------------+--------------------+--------------+
|1108866829991272448|@ finna fuck pont...|             0|
|1058874314303320064|t don mind me ’ i...|             1|
|1109486326477438976|a law played jude...|             0|
|1062399239337140224|review of heart b...|             0|
|1113926202006360064|nigga when the yo...|             0|
+-------------------+--------------------+--------------+
only showing top 5 rows



Den gesamten Datensatz oder einen Anteil des Datensatzes für das Training auswählen.

In [5]:
spark_df_sample = spark_df#.sample() #fraction=0.1
spark_df_sample.count()

1198584

# Feature Engineering
Für das Features Engineering wurden zwei Methoden für die Umwandlung der natürlichsprachlichen Texte in strukturierte Daten untersucht.
- Tokenization und Word2Vec (Embedding): Basierend auf den einzelnen Wörtern in einem Tweet, wird für jeden Satz ein Vektor mit 300 Dimensionen gebildet. 
- Tokenization und CountVectorizer: Nach der Aufteilung der Tweets in einzelne Wörter, werden die 10000 häufigsten Wörter in jedem Tweet gezählt. Es entsteht ein Vektor mit 10000 Dimensionen.

Die beste Leistung konnte mit dem CountVectorizer erzielt werden. 
Weitere Optionen, wie z.B. TF-IDF oder Embeddings von großen Sprachmodellen wurden nicht untersucht. 

## Tokenization und Word2Vec

In [6]:
# from pyspark.ml.feature import Word2Vec
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import Tokenizer
# from pyspark.ml.feature import StopWordsRemover

# # 'We hate religion' > 'We' 'hate' 'religion'
# tokenizer = Tokenizer(inputCol="text", outputCol="tokens")

# # 'We' > (0.000, 0.032432, ...) 300 Dimensionen
# w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="tokens", outputCol="features")

# doc2vec_pipeline = Pipeline(stages=[tokenizer, w2v])
# doc2vec_model = doc2vec_pipeline.fit(spark_df_sample)
# doc2vecs_df = doc2vec_model.transform(spark_df_sample)

# doc2vec_model.write().overwrite().save("../models/prep_tok2vec")

## Tokenization und CountVectorizer

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer


# 'We hate religion' > 'We' 'hate' 'religion'
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W")

# Remove stop words
stopwordsRemover = StopWordsRemover(inputCol="tokens", outputCol="filtered")

# Term frequency
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

doc2tf_pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors])
doc2tf_model = doc2tf_pipeline.fit(spark_df_sample)
doc2tf_df = doc2tf_model.transform(spark_df_sample)

In [8]:
doc2tf_model.write().overwrite().save("../models/prep_tok2tf")

# Model Training
Für das Training wurden die klassischen ML-Modelle Logistic Regression und Naive Bayes untersucht. Für das weitere Vorgehen wurde das Naive Bayes-Modell ausgewählt.

In [9]:
# Data from Tok2Tf
hate_train_df, hate_test_df = doc2tf_df.randomSplit([0.8, 0.2])

In [10]:
print("Training Dataset Count: " + str(hate_train_df.count()))
times_hate = hate_train_df.filter(hate_train_df['majority_label'] > 0.0).count()
print(f'Times hate in training: {times_hate}')
times_not_hate = hate_train_df.filter(hate_train_df['majority_label'] == 0.0).count()
print(f'Times not hate in training: {times_not_hate}')

print("Test Dataset Count: " + str(hate_test_df.count()))
times_hate = hate_test_df.filter(hate_test_df['majority_label'] > 0.0).count()
print(f'Times hate in test: {times_hate}')
times_not_hate = hate_test_df.filter(hate_test_df['majority_label'] == 0.0).count()
print(f'Times not hate in test: {times_not_hate}')

Training Dataset Count: 958538
Times hate in training: 236156
Times not hate in training: 722382
Test Dataset Count: 240046
Times hate in test: 59612
Times not hate in test: 180434


## Logistic Regression

In [11]:
# from pyspark.ml.classification import LogisticRegression

# lr_classifier = LogisticRegression(family="multinomial", labelCol="majority_label", featuresCol="features")

# lr_classifier_pipeline = Pipeline(stages=[lr_classifier])
# lr_trained_pipeline = lr_classifier_pipeline.fit(hate_train_df)
# predictions = lr_trained_pipeline.transform(hate_test_df)

# lr_model_evaluator = MulticlassClassificationEvaluator(
#     labelCol="majority_label", predictionCol="prediction", metricName="accuracy")

# accuracy = lr_model_evaluator.evaluate(predictions)
# print("Accuracy = %g" % (accuracy))

# times_hate = predictions.filter(predictions['prediction'] == 1.0).count()
# print(f'Times hate detected: {times_hate}')
# times_not_hate = predictions.filter(predictions['prediction'] == 0.0).count()
# print(f'Times not hate detected: {times_not_hate}')

Save and Load Logistic Regression

In [12]:
# lr_trained_pipeline.write().overwrite().save("../models/model_lr")

## Naive Bayes

In [13]:
from pyspark.ml.classification import NaiveBayes
classifier = NaiveBayes(smoothing=1, labelCol="majority_label", featuresCol="features")

classifier_pipeline = Pipeline(stages=[classifier])
predictions = classifier_pipeline.fit(hate_train_df).transform(hate_test_df)

Evaluation

In [14]:
model_evaluator = MulticlassClassificationEvaluator(
    labelCol="majority_label", predictionCol="prediction", metricName="accuracy")

accuracy = model_evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))

times_hate = predictions.filter(predictions['prediction'] == 1.0).count()
print(f'Times hate detected: {times_hate}')
times_not_hate = predictions.filter(predictions['prediction'] == 0.0).count()
print(f'Times not hate detected: {times_not_hate}')

Accuracy = 0.765616
Times hate detected: 50313
Times not hate detected: 189733


Save and Load Naive Bayes

In [15]:
classifier_pipeline.write().overwrite().save("../models/model_nb")

# Benchmarking

Den gesamten Datensatz oder einen Anteil des Datensatzes für das Training auswählen. Je größer der Trainingsdatensatz, desto länger benötigt die MLlib Pipeline für die Ausführung.

Benchmark der Trainingszeiten MLlib:
| Anzahl Trainingsdatensätze | Mean | Standard Deviation |
| --- | --- | --- |
| 120461 | 3.30 | 0.42 |
|599259 | 8.98 | 0.63 |
| 1198584 | 17.4 | 1.03 |

Benchmark der Trainingszeiten Sklearn:
| Anzahl Trainingsdatensätze | Mean | Standard Deviation |
| --- | --- | --- |
| 120461 | 2.96 | 0.31 |
| 599259 | 11.88 | 0.44 |
| 1198584 | 22.56 | 0.43 |

### Durchführung Benchmarks: 
Local Spark Environment (3.3.1)
- 4 Cores, 8 Threats (1.8 Ghz Base Clock)
- 8 GB RAM (shared with OS)

Ergebnisse:
- In unserem Test liefert Spark MLLib bei kleinen Datenmengen minimal schlechtere Ergebnisse. Dies könnte z.B. durch Computational Overhead für die Spark Infrastruktur sein. Allerdings ist die Abweichung von durchschnittlich 0,3s sehr gering.
- In unserem Test skaliert die Spark MLLib Trainingspipeline bei größer werdenen Datenmengen besser. Dies könnte durch die verteilte Berechnung in der Spark Infrastruktur erreicht werden.  

Einschränkungen:
- Trotz größter Sorgfalt können beeinflussende Faktoren bei der manuellen Durchführung der Tests (z.B. durch Hintergrundprozsse) nicht ausgeschlossen werden. 
- Zudem sind die Ergebnisse stark abhängig von der jeweiligen Implementierung mit dem Framework. 
- Für das schnelle Prototyping wurde eine lokale Spark-Installation gewählt. Diese kann maximal über die Ressourcen des Host-PCs verfügen. Vorteile durch die vertikale Skalierung können somit nicht mit ausgewertet werden.
- Die festgestellten Abweichungen lassen somit keine signifikanten, gesicherten Aussagen über den Vergleich der Leistun der beiden Bibliotheken zu.

## MLlib implementation

In [16]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline

spark_df_sample = spark_df.sample(fraction=0.1) 
spark_df_sample.count()

119883

In [17]:
hate_train_df, hate_test_df = spark_df_sample.randomSplit([0.8, 0.2])

In [18]:
#120461: 3.7s, 3.7s, 3.1s, 2.7s, 3.3s
#599259: 9.2s, 8.9s, 8.7s, 8.2s, 9.9s
#1198584: 17.6s, 17.5s, 15.9s, 17.2s, 18.8s

In [19]:
# 'We hate religion' > 'We' 'hate' 'religion'
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W")

# Remove stop words
stopwordsRemover = StopWordsRemover(inputCol="tokens", outputCol="filtered")

# Term frequency
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

# Classifier
classifier = NaiveBayes(smoothing=1, labelCol="majority_label", featuresCol="features")

In [20]:
inference_pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, classifier])
trained_inference_pipeline = inference_pipeline.fit(hate_train_df)
predictions = trained_inference_pipeline.transform(hate_test_df)

Save MLlib Pipeline

In [21]:
trained_inference_pipeline.write().overwrite().save("../models/mllib_model_nb")

In [22]:
model_evaluator = MulticlassClassificationEvaluator(
    labelCol="majority_label", predictionCol="prediction", metricName="accuracy")

accuracy = model_evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))

# times_hate = predictions.filter(predictions['prediction'] == 1.0).count()
# print(f'Times hate detected: {times_hate}')
# times_not_hate = predictions.filter(predictions['prediction'] == 0.0).count()
# print(f'Times not hate detected: {times_not_hate}')

Accuracy = 0.759202


## SKlearn implementation

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

pandas_df = spark_df.toPandas()
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1198584 entries, 0 to 1198583
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   index           1198584 non-null  int64 
 1   text            1198584 non-null  object
 2   majority_label  1198584 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 27.4+ MB


In [24]:
pandas_df_sample = pandas_df.sample(n=599259)
pandas_df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 599259 entries, 73519 to 1160324
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   index           599259 non-null  int64 
 1   text            599259 non-null  object
 2   majority_label  599259 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 18.3+ MB


In [25]:
X_train, X_test, y_train, y_test = train_test_split(pandas_df_sample['text'], pandas_df_sample['majority_label'], train_size = 0.8, test_size=0.2)

In [26]:
#120461: 2.9s, 2.8s, 2.9s, 3.5s, 2.7s
#599259: 11.4s, 11.8s, 11.7s, 11.9s, 12.6s
#1198584: 22.1s, 22.4s, 22.8s, 22.3s, 23.2s

In [27]:
countVec = CountVectorizer(stop_words='english', min_df=2, max_features=10000)
clf = MultinomialNB()

In [28]:
inference_pipeline = Pipeline([
    ('vect', countVec),
    ('clf', clf),
])

trained_inference_pipeline = inference_pipeline.fit(X_train, y_train)
y_pred = trained_inference_pipeline.predict(X_test)

Save Sklearn Pipeline

In [29]:
import joblib
joblib.dump(trained_inference_pipeline, '../models/sklearn_model_nb.pkl')
# pipeline = joblib.load('pipeline.pkl')

['../models/sklearn_model_nb.pkl']

In [30]:
accuracy_score(y_test, y_pred)

0.7683893468611287

In [31]:
from statistics import mean, stdev
small_mllib = [3.7, 3.7, 3.1, 2.7, 3.3]
print(f"Small MLLib mean: {mean(small_mllib)} stdev: {stdev(small_mllib)}")
medium_mllib = [9.2, 8.9, 8.7, 8.2, 9.9]
print(f"Medium MLLib mean: {mean(medium_mllib)} stdev: {stdev(medium_mllib)}")
complete_mllib = [17.6, 17.5, 15.9, 17.2, 18.8]
print(f"Complete MLLib mean: {mean(complete_mllib)} stdev: {stdev(complete_mllib)}")
small_sklearn = [2.9, 2.8, 2.9, 3.5, 2.7]
print(f"Small Sklearn mean: {mean(small_sklearn)} stdev: {stdev(small_sklearn)}")
medium_sklearn = [11.4, 11.8, 11.7, 11.9, 12.6]
print(f"Medium Sklearn mean: {mean(medium_sklearn)} stdev: {stdev(medium_sklearn)}")
complete_sklearn = [22.1, 22.4, 22.8, 22.3, 23.2]
print(f"Complete Sklearn mean: {mean(complete_sklearn)} stdev: {stdev(complete_sklearn)}")

Small MLLib mean: 3.3000000000000003 stdev: 0.4242640687119285
Medium MLLib mean: 8.98 stdev: 0.6300793600809349
Complete MLLib mean: 17.4 stdev: 1.0368220676663862
Small Sklearn mean: 2.96 stdev: 0.31304951684997057
Medium Sklearn mean: 11.88 stdev: 0.44384682042344276
Complete Sklearn mean: 22.56 stdev: 0.4393176527297754
