In [1]:
# Datahandling
import os
import re
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.sparse
import matplotlib.pyplot as plt

# Maskinlæring
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, classification_report)
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold, mutual_info_classif
import xgboost as xgb

# Transformer-modeller
from transformers import (BertTokenizer, BertModel, 
                          DistilBertTokenizer, DistilBertModel)

# Andre biblioteker
import torch
import gradio as gr
import joblib


In [None]:
# Load the joined Twitter dataset and draw a reproducible working sample.
data = pd.read_csv('TwitterData_Joined.csv')
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the frame length (sampling without replacement).
data = data.sample(n=min(1000, len(data)), random_state=42)

def clean_text(text):
    """Normalise a raw tweet before it is embedded.

    Removes URLs, @mentions, '#' signs (the tag word itself is kept) and every
    character that is neither a word character nor whitespace, then lowercases
    the result. Whitespace left behind by removed tokens is not collapsed.

    Args:
        text: The raw tweet. Non-string values (e.g. NaN or None for a missing
            tweet) are returned unchanged so a later ``dropna`` can still
            remove them, instead of ``re.sub`` raising ``TypeError``.

    Returns:
        The cleaned, lowercased string, or ``text`` itself if it is not a str.
    """
    if not isinstance(text, str):
        return text
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix matches; an unescaped dot would also swallow ordinary
    # words that merely contain "www" (e.g. "awwwesome").
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove the hashtag sign but keep the tag text
    text = re.sub(r'#', '', text)
    # Remove emojis, punctuation and other non-word, non-space characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Data cleaning
# Drop rows with missing tweet text first so that clean_text only ever receives
# strings, then apply the custom cleaning function. Building a new frame
# (instead of inplace=True) keeps the cell safe to re-run.
data = (
    data
    .dropna(subset=['Tweet_text'])
    .assign(Tweet_text=lambda frame: frame['Tweet_text'].apply(clean_text))
)

In [None]:
# Helper that turns raw texts into fixed-size transformer embeddings
def generate_bert_embeddings(texts, tokenizer, model, max_len=128, batch_size=32):
    """Return one [CLS] embedding per text as a 2-D NumPy array.

    The texts are embedded in mini-batches so that memory use stays bounded
    regardless of how many texts are passed in.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable tokenizer. It is invoked with ``max_length``,
            ``truncation``, ``padding`` and ``return_tensors="pt"`` and must
            return a mapping of tensors.
        model: Callable model whose output exposes ``last_hidden_state``.
        max_len: Every sequence is truncated/padded to this many tokens.
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        Array with one row per text, holding the hidden state at position 0
        (the [CLS] token) of each sequence.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(texts)
    if not texts:
        # Nothing to embed; report the model's hidden size when it is exposed.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Move inputs to wherever the model lives, when the model exposes a device.
    device = getattr(model, "device", None)

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        tokens = tokenizer(
            batch, max_length=max_len, truncation=True, padding='max_length', return_tensors="pt"
        )
        if device is not None:
            tokens = {key: tensor.to(device) for key, tensor in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
        # The [CLS] token's embedding is used as the text representation;
        # .cpu() makes the NumPy conversion work for non-CPU tensors too.
        cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(cls_chunks, axis=0)

# Columns used for model training
columns_to_keep = ['Label', 'Tweet_text', 'Followers', 'Following', 'Verified', 'Real_Location']
data = data[columns_to_keep]

# Split features and target
X = data.drop(columns=["Label"])  # Features
y = data["Label"]  # Target

# Initialise tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Generate BERT embeddings for Tweet_text (missing text is embedded as "")
tweet_texts = X['Tweet_text'].fillna("").tolist()
bert_embeddings = generate_bert_embeddings(tweet_texts, tokenizer, bert_model)

# Weight the embeddings
# NOTE(review): the embedding columns are passed through StandardScaler below,
# which rescales every column individually, so this constant factor does not
# change what the classifier sees. Kept because later cells read
# weighted_bert_embeddings.
bert_weight = 10.0
weighted_bert_embeddings = bert_embeddings * bert_weight

# Add the embeddings to the features and remove Tweet_text
X_bert = pd.DataFrame(weighted_bert_embeddings, index=X.index)
X = pd.concat([X.drop(columns=['Tweet_text']), X_bert], axis=1)

# Convert all column names to strings (the embedding columns are integers)
X.columns = X.columns.astype(str)

# Identify categorical and numerical columns.
# include='number' picks up every numeric dtype, so the embedding columns are
# selected together with the metadata columns whatever their float width is;
# no separate step is needed to append them, and no column can be listed twice.
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Preprocessor for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest model with hyperparameter tuning.
# random_state is fixed so the grid search and the reported metrics are
# reproducible from one run to the next.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')

# Fit the preprocessor on the training data only, then tune the model
X_train_transformed = preprocessor.fit_transform(X_train)
grid_search.fit(X_train_transformed, y_train)

# Evaluate on the test set
X_test_transformed = preprocessor.transform(X_test)
y_test_pred = grid_search.best_estimator_.predict(X_test_transformed)
# Column 1 holds the probability of the second class in classes_
y_test_proba = grid_search.best_estimator_.predict_proba(X_test_transformed)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_test_proba):.4f}")

# Save model and preprocessor
# NOTE(review): the DistilBERT cell further down writes to these same file
# names, so running both cells leaves only the later artifacts on disk.
joblib.dump(preprocessor, "preprocessor.pkl")
joblib.dump(grid_search.best_estimator_, "best_model.pkl")
print("Model og preprocessor gemt!")


DistilBERT

In [None]:
# Helper that turns raw texts into fixed-size transformer embeddings
# NOTE(review): this redefines the helper of the same name from the BERT cell
# above; consider keeping a single definition in one cell.
def generate_bert_embeddings(texts, tokenizer, model, max_len=128, batch_size=32):
    """Return one [CLS] embedding per text as a 2-D NumPy array.

    The texts are embedded in mini-batches so that memory use stays bounded
    regardless of how many texts are passed in.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable tokenizer. It is invoked with ``max_length``,
            ``truncation``, ``padding`` and ``return_tensors="pt"`` and must
            return a mapping of tensors.
        model: Callable model whose output exposes ``last_hidden_state``.
        max_len: Every sequence is truncated/padded to this many tokens.
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        Array with one row per text, holding the hidden state at position 0
        (the [CLS] token) of each sequence.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(texts)
    if not texts:
        # Nothing to embed; report the model's hidden size when it is exposed.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Move inputs to wherever the model lives, when the model exposes a device.
    device = getattr(model, "device", None)

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        tokens = tokenizer(
            batch, max_length=max_len, truncation=True, padding='max_length', return_tensors="pt"
        )
        if device is not None:
            tokens = {key: tensor.to(device) for key, tensor in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
        # The [CLS] token's embedding is used as the text representation;
        # .cpu() makes the NumPy conversion work for non-CPU tensors too.
        cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(cls_chunks, axis=0)

# Assumes 'data' has already been loaded
# Columns used for model training
columns_to_keep = ['Label', 'Tweet_text', 'Followers', 'Following', 'Verified', 'Real_Location']
data = data[columns_to_keep]

# Split features and target
X = data.drop(columns=["Label"])  # Features
y = data["Label"]  # Target

# Initialise tokenizer and DistilBERT model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Generate embeddings for `Tweet_text` (missing text is embedded as "")
tweet_texts = X['Tweet_text'].fillna("").tolist()
bert_embeddings = generate_bert_embeddings(tweet_texts, tokenizer, bert_model)

# Weight the embeddings
# NOTE(review): the embedding columns are passed through StandardScaler below,
# which rescales every column individually, so this constant factor does not
# change what the classifier sees. Kept because later cells read
# weighted_bert_embeddings.
bert_weight = 10.0
weighted_bert_embeddings = bert_embeddings * bert_weight

# Add the embeddings to the features and remove `Tweet_text`
X_bert = pd.DataFrame(weighted_bert_embeddings, index=X.index)
X = pd.concat([X.drop(columns=['Tweet_text']), X_bert], axis=1)

# Convert all column names to strings (the embedding columns are integers)
X.columns = X.columns.astype(str)

# Identify categorical and numerical columns.
# include='number' picks up every numeric dtype, so the embedding columns are
# selected together with the metadata columns whatever their float width is;
# no separate step is needed to append them, and no column can be listed twice.
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Preprocessor for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest model with hyperparameter tuning.
# random_state is fixed so the grid search and the reported metrics are
# reproducible from one run to the next.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')

# Fit the preprocessor on the training data only, then tune the model
X_train_transformed = preprocessor.fit_transform(X_train)
grid_search.fit(X_train_transformed, y_train)

# Evaluate on the test set
X_test_transformed = preprocessor.transform(X_test)
y_test_pred = grid_search.best_estimator_.predict(X_test_transformed)
# Column 1 holds the probability of the second class in classes_
y_test_proba = grid_search.best_estimator_.predict_proba(X_test_transformed)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_test_proba):.4f}")

# Save model and preprocessor
# NOTE(review): these are the same file names the BERT cell above writes to,
# so running both cells leaves only the DistilBERT artifacts on disk.
joblib.dump(preprocessor, "preprocessor.pkl")
joblib.dump(grid_search.best_estimator_, "best_model.pkl")
print("Model og preprocessor gemt!")

In [None]:
# Rank the preprocessed features by the importance the tuned forest assigns them
importances = grid_search.best_estimator_.feature_importances_
feature_names = preprocessor.get_feature_names_out()

# One row per transformed feature, most important first
importance_df = pd.DataFrame(dict(Feature=feature_names, Importance=importances))
ranked_importances = importance_df.sort_values(by="Importance", ascending=False)
print(ranked_importances)


In [2]:
# Sanity checks after generating the BERT embeddings.
# This cell reads weighted_bert_embeddings and X, so an embedding cell must
# have been run in the current kernel first.
print("Eksempel på BERT embedding:", weighted_bert_embeddings[0][:10])  # Inspect values
print("Shape of BERT embeddings:", weighted_bert_embeddings.shape)

# Check the feature merge.
# By this point X no longer has a 'Tweet_text' column (it was dropped when the
# embeddings were concatenated), so X.drop(columns=['Tweet_text']) would raise
# KeyError. The pre-merge shape is instead derived by subtracting the number
# of embedding columns from the merged frame.
n_rows, n_cols_merged = X.shape
n_embedding_cols = weighted_bert_embeddings.shape[1]
print("Shape før tilføjelse af BERT:", (n_rows, n_cols_merged - n_embedding_cols))
print("Shape efter tilføjelse af BERT:", X.shape)  # Should have more columns than before


NameError: name 'weighted_bert_embeddings' is not defined