In [6]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from joblib import dump
import numpy as np
from sklearn.preprocessing import LabelEncoder

# read in data
data = pd.read_csv("/Users/jannis/ASUD_Cyberbullying /Notebooks/data/prepared_dataframe.csv")

# split data in training and test
data_train = data.sample(round(0.75 * len(data)), random_state = 1).reset_index()
data_test = data[~data.index.isin(data_train.index)].reset_index()



In [2]:
# load pretrained transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# create tweet embeddings
tweet_vectors = model.encode(data_train["tweets_clean"])
new_tweet_v = model.encode(data_test["tweets_clean"])



In [11]:
def encode_classes(classes: list) -> np.array:
    encoder = LabelEncoder()
    classes_trans = encoder.fit_transform(classes)
    return classes_trans, encoder

# decode classes
def decode_classes(preds: list, encoder: LabelEncoder) -> list:
    encoder_dict = dict(enumerate(encoder.classes_.flatten(), 0))
    preds = [encoder_dict[x] for x in preds]
    return preds

def calc_accuracy(preds: list, act_values: list) -> str:
    acc = 0
    for i in range(len(preds)):
        if preds[i] == act_values[i]:
            acc += 1
    accuracy = acc / len(preds)
    return f'Accuracy: {str(accuracy)}'

In [13]:
# Encode the classes
y_transformed, encoder = encode_classes(data_train["granulareKlassifikation"])

# Train a Random Forest
rfc = RandomForestClassifier(random_state=42)
# hyperparameter = {"max_depth": [4, 6, 8, 10, 12],
#                  "min_samples_split": [2, 3, 4, 5, 6]}
# ,
#                   "max_samples": [0.6, 0.7, 0.8, 0.9, 1]

# rfc_grid = GridSearchCV(rfc, hyperparameter, cv=5)
rfc.fit(tweet_vectors, y_transformed)

# make predictions
rfc_predictions = rfc.predict(new_tweet_v)

# Decode classes
rfc_predictions = decode_classes(rfc_predictions, encoder)

# calculate accuracy
acc = calc_accuracy(rfc_predictions, data_test["granulareKlassifikation"])
# print(acc)
# classification report
print(classification_report(data_test["granulareKlassifikation"], rfc_predictions))

# compare value counts
# print(RF_functions.compare_value_counts(rfc_predictions, data_test["granulareKlassifikation"]))

              precision    recall  f1-score   support

       ABUSE       1.00      0.80      0.89        86
      INSULT       1.00      0.77      0.87       171
       OTHER       0.88      1.00      0.94       474
   PROFANITY       1.00      0.74      0.85        27

    accuracy                           0.92       758
   macro avg       0.97      0.83      0.89       758
weighted avg       0.93      0.92      0.91       758



In [10]:
print(classification_report(data_test["granulareKlassifikation"], rfc_predictions))

              precision    recall  f1-score   support

       ABUSE       1.00      0.80      0.89        86
      INSULT       1.00      0.77      0.87       171
       OTHER       0.88      1.00      0.94       474
   PROFANITY       1.00      0.74      0.85        27

    accuracy                           0.92       758
   macro avg       0.97      0.83      0.89       758
weighted avg       0.93      0.92      0.91       758



In [14]:
print(acc)

Accuracy: 0.91688654353562
