# TFIDF

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np



In [2]:
df = pd.read_csv("./prepared_dataframe.csv")


In [3]:
# words in tweet as one string 
data = df
data["tweets_clean"] = [" ".join(eval(word)) for word in data["Wörter in Tweet"]] 

# drop unicode emojis and LBR
new_tweet_ls = []
for tweet in data.tweets_clean:
    tweet_sp = tweet.split()
    for word in tweet_sp:
        if any(x in word for x in ["000", "LBR", "lbr"]) :
            tweet = tweet.replace(word, "")
        
    new_tweet_ls.append(tweet)

data["tweets_clean"] = new_tweet_ls

## Data Pre

In [4]:
# preprocess the tweets
def process_tweets(tweet : str):
    # all lowercase
    tweet = tweet.lower()
    # remove Sonderzeichen etc (based on Sonderzeichen.txt)
    tweet = re.sub('[^a-zA-ZäöüÄÖÜß]', " ", tweet)
    return tweet


# split data in training and test
#data_train = data.sample(round(0.75 * len(data)), random_state = 1).reset_index()
#data_test = data[~data.index.isin(data_train.index)].reset_index()

data_pre = data
data_pre["tweets_clean"] = [process_tweets(tweet) for tweet in data_pre["tweet"]]

In [6]:
encoder = LabelEncoder()
data_pre['granulareKlassifikation'] = encoder.fit_transform(data_pre['granulareKlassifikation'])
encoder.classes_

array(['ABUSE', 'INSULT', 'OTHER', 'PROFANITY'], dtype=object)

## TFIDF Vectorizer

In [7]:
tfidf_vec = TfidfVectorizer()

In [8]:
df_train, df_test = train_test_split(data_pre,
 test_size = 0.2,
 random_state = 42)

X_train = tfidf_vec.fit_transform(df_train['tweets_clean'])
X_test = tfidf_vec.transform(df_test['tweets_clean'])
y_train = np.ravel(df_train['granulareKlassifikation'])
y_test = np.ravel(df_test['granulareKlassifikation'])


## Random Forest Classifier

In [22]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predictions = rfc.predict(X_test)
rfc_human_readeable = encoder.inverse_transform(rfc_predictions)

## Evaluation

In [20]:
print(classification_report(y_test, rfc_predictions, target_names=encoder.classes_))

              precision    recall  f1-score   support

       ABUSE       1.00      0.06      0.11        84
      INSULT       1.00      0.03      0.06       105
       OTHER       0.67      1.00      0.80       399
   PROFANITY       0.33      0.05      0.09        19

    accuracy                           0.67       607
   macro avg       0.75      0.29      0.27       607
weighted avg       0.76      0.67      0.56       607



In [23]:
acc = 0
for i in range(len(rfc_predictions)):
    if rfc_predictions[i] == y_test[i]:
        acc += 1
accuracy = acc / len(rfc_predictions)
print("Accuracy: " + str(accuracy))

Accuracy: 0.6771004942339374
