In [1]:
import os
import sys
import argparse
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session, which would also
# hide any convergence warnings raised by the estimators fitted below — confirm that is intended.
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [7]:
# Path sanity check before loading the data. Note that '__file__' is a quoted string here,
# not the module variable, so os.path.dirname() yields '' for the second element.
(
    os.getcwd(),
    os.path.dirname('__file__'),
)

('/home/adiazr/CODE/Probabilistic-Machine-Learning_lecture-PROJECTS/projects/05-1PLXXXX_political_color_posts/notebooks',
 '')

In [8]:

# Load the feature table from ../data/, resolved relative to the notebook's working directory.
# The previous os.path.dirname('__file__') took the dirname of a string literal, which is
# always '', so the path was already relative to the working directory; `__file__` itself is
# typically not defined inside a Jupyter kernel, so the path cannot be anchored to this file.
DATA_PATH = os.path.join(os.pardir, 'data', 'final-features-rgb.csv')
df = pd.read_csv(DATA_PATH)

# Model inputs are the columns feature_0 ... feature_11; the target is the party label.
FEATURE_COLUMNS = [f'feature_{i}' for i in range(12)]
X = df[FEATURE_COLUMNS]
y = df['party']

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [12]:
# Show the loaded DataFrame as the cell output
df

Unnamed: 0,party,filename,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,date,feature_12,confidence,relevant
0,afd,2018-09-19_10-15-17_UTC.jpg,228,232,234,19,38,61,43,135,186,239,8,8,2018-09-19,1537315200,0.50,1
1,afd,2018-09-20_09-47-23_UTC.jpg,26,54,59,242,238,242,40,131,186,237,14,16,2018-09-20,1537401600,0.65,1
2,afd,2018-09-21_12-27-07_UTC.jpg,218,22,30,226,231,233,30,140,200,135,172,203,2018-09-21,1537488000,0.48,1
3,afd,2020-01-07_08-27-51_UTC.jpg,57,28,28,243,242,242,227,19,18,61,112,139,2020-01-07,1578355200,0.32,0
4,afd,2020-01-07_13-26-21_UTC.jpg,212,224,232,223,140,140,233,17,15,45,129,180,2020-01-07,1578355200,0.54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28419,spd,2024-12-17_17-20-45_UTC_6.jpg,231,76,68,16,11,14,210,183,190,128,104,98,2024-12-17,1734393600,0.36,0
28420,spd,2024-12-18_10-28-01_UTC_1.jpg,227,1,15,254,252,252,3,2,2,211,130,135,2024-12-18,1734480000,0.48,1
28421,spd,2024-12-18_10-28-01_UTC_2.jpg,4,3,3,227,1,15,253,252,252,195,130,134,2024-12-18,1734480000,0.67,1
28422,spd,2024-12-18_10-28-01_UTC_3.jpg,4,4,4,227,1,15,254,253,253,202,134,138,2024-12-18,1734480000,0.72,1


In [13]:
# extract model metrics
def extract_metrics(report):
    """Parse the per-class rows of a text classification report into a dict.

    Each data row is expected to end with four numeric columns
    (precision, recall, f1-score, support). The columns are read from the
    right-hand side of the row, so the class label may be of any width and
    may contain whitespace (runs of whitespace inside a label are collapsed
    to a single space).

    Parameters
    ----------
    report : str
        Text report in the layout printed by
        ``sklearn.metrics.classification_report``.

    Returns
    -------
    dict
        Maps each class label to a dict with the keys ``'Precision'``,
        ``'Recall'``, ``'F1-Score'`` (floats) and ``'Support'`` (int).
        The header, the accuracy row and the averaged summary rows are
        not included. Rows whose numeric columns cannot be parsed are skipped.
    """
    # Averaged summary rows share the layout of class rows and must be excluded by name.
    # The accuracy row has fewer columns and is rejected by the length check below.
    summary_rows = {'micro avg', 'macro avg', 'weighted avg', 'samples avg'}

    metrics = {}
    for line in report.splitlines():
        parts = line.split()
        # A class row needs at least one label token plus the four numeric columns.
        if len(parts) < 5:
            continue

        # Everything before the last four tokens is the label. Do not rely on leading
        # padding: labels as wide as the label column start at column 0.
        label = ' '.join(parts[:-4])
        if label in summary_rows:
            continue

        try:
            precision, recall, f1_score = (float(value) for value in parts[-4:-1])
            support = int(parts[-1])
        except ValueError:
            # Not a metrics row (unexpected layout) - ignore it.
            continue

        metrics[label] = {
            'Precision': precision,  # also known as positive predictive value
            'Recall': recall,  # also known as sensitivity
            'F1-Score': f1_score,  # harmonic mean of precision and recall
            'Support': support,  # number of true instances for the label
        }
    return metrics

In [14]:
# 1. Random Forest
# Fitted on the raw (unscaled) training features with balanced class weights.
rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Keep both the printable report and the parsed per-class metrics.
report_rf = classification_report(y_test, y_pred_rf)
metrics_rf = extract_metrics(report_rf)
print("=== Random Forest ===", report_rf, sep="\n")

=== Random Forest ===
              precision    recall  f1-score   support

         afd       0.74      0.45      0.56       411
         cdu       0.47      0.61      0.53      1138
         csu       0.54      0.69      0.60      1355
         fdp       0.88      0.79      0.83       841
      gruene       0.86      0.51      0.64       532
       linke       0.52      0.48      0.50       845
         spd       0.58      0.37      0.46       563

    accuracy                           0.59      5685
   macro avg       0.65      0.56      0.59      5685
weighted avg       0.62      0.59      0.59      5685



In [15]:
# 2. Support Vector Machine (SVM)
# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

svm = SVC(
    kernel='rbf',
    class_weight='balanced',
    random_state=42,
).fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)

# Keep both the printable report and the parsed per-class metrics.
report_svm = classification_report(y_test, y_pred_svm)
metrics_svm = extract_metrics(report_svm)
print("=== Support Vector Machine (SVM) ===", report_svm, sep="\n")

=== Support Vector Machine (SVM) ===
              precision    recall  f1-score   support

         afd       0.44      0.54      0.48       411
         cdu       0.44      0.63      0.52      1138
         csu       0.60      0.49      0.54      1355
         fdp       0.85      0.76      0.80       841
      gruene       0.64      0.59      0.62       532
       linke       0.54      0.27      0.36       845
         spd       0.36      0.49      0.41       563

    accuracy                           0.54      5685
   macro avg       0.55      0.54      0.53      5685
weighted avg       0.56      0.54      0.54      5685



In [None]:
# 3. MLP Classifier
# Two hidden layers (128 and 64 units), trained on the standardised features
# produced by the scaler in the SVM cell.
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    alpha=0.0001,
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train)
y_pred_mlp = mlp.predict(X_test_scaled)

# Keep both the printable report and the parsed per-class metrics.
report_mlp = classification_report(y_test, y_pred_mlp)
metrics_mlp = extract_metrics(report_mlp)
print("=== MLP Classifier ===", report_mlp, sep="\n")

=== MLP Classifier ===
              precision    recall  f1-score   support

         afd       0.49      0.37      0.42       411
         cdu       0.47      0.46      0.47      1138
         csu       0.49      0.52      0.51      1355
         fdp       0.72      0.70      0.71       841
     gruenen       0.46      0.53      0.49       532
       linke       0.39      0.35      0.37       845
         spd       0.41      0.44      0.43       563

    accuracy                           0.49      5685
   macro avg       0.49      0.48      0.48      5685
weighted avg       0.49      0.49      0.49      5685



In [None]:
# Collect the per-class F1-scores of the three models, keyed by class label.
# The labels are taken from the Random Forest metrics.
metrics_all = {
    label: {
        'Random Forest': metrics_rf[label]['F1-Score'],
        'SVM': metrics_svm[label]['F1-Score'],
        'MLP': metrics_mlp[label]['F1-Score'],
    }
    for label in metrics_rf
}

# Rows = class labels, columns = models, plus the per-class mean across the models.
metrics_df = pd.DataFrame(metrics_all).T
metrics_df = metrics_df.assign(Average=metrics_df.mean(axis=1))

# Append the column-wise mean over all classes as a final "Average" row.
average_row = metrics_df.mean(numeric_only=True).rename("Average")
metrics_df = pd.concat([metrics_df, average_row.to_frame().T])

#Output
print("=== Modellvergleich ===", metrics_df.round(2), sep="\n")

=== Modellvergleich ===
         Random Forest   SVM   MLP  Average
afd               0.47  0.44  0.42     0.44
cdu               0.54  0.47  0.47     0.49
csu               0.57  0.48  0.51     0.52
fdp               0.80  0.71  0.71     0.74
gruenen           0.61  0.50  0.49     0.53
linke             0.48  0.37  0.37     0.41
spd               0.46  0.44  0.43     0.44
Average           0.56  0.49  0.49     0.51
