# Import Libraries

In [1]:
%pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Datahandling
import requests
import os
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2

# Data

In [25]:
# Data import 
data = pd.read_csv('detection_data.csv')

In [4]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
   User ID        Username                                              Tweet  \
0   132131           flong  Station activity person against natural majori...   
1   289683  hinesstephanie  Authority research natural life material staff...   
2   779715      roberttran  Manage whose quickly especially foot none to g...   
3   696168          pmason  Just cover eight opportunity strong policy which.   
4   704441          noah87                      Animal sign six data good or.   

   Retweet Count  Mention Count  Follower Count  Verified  Bot Label  \
0             85              1            2353     False          1   
1             55              5            9617      True          0   
2              6              2            4363      True          0   
3             54              5            2242      True          1   
4             26              3            8438     False          1   

       Location           Created At            H

In [6]:
# Display basic information about the dataset
print("\nBasic information about the dataset:")
print(data.info())


Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   User ID         50000 non-null  int64 
 1   Username        50000 non-null  object
 2   Tweet           50000 non-null  object
 3   Retweet Count   50000 non-null  int64 
 4   Mention Count   50000 non-null  int64 
 5   Follower Count  50000 non-null  int64 
 6   Verified        50000 non-null  bool  
 7   Bot Label       50000 non-null  int64 
 8   Location        50000 non-null  object
 9   Created At      50000 non-null  object
 10  Hashtags        50000 non-null  object
dtypes: bool(1), int64(5), object(5)
memory usage: 3.9+ MB
None


In [5]:
data['Hashtags'] = data['Hashtags'].fillna('<missing>')


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
import xgboost as xgb
import pandas as pd
import numpy as np

# Data Preparation
# Antag at "data" er din dataframe
X = data.drop(columns=['Bot Label'])  # Features
y = data['Bot Label']  # Target

# Håndtering af manglende værdier og tidsfunktioner
X['Hashtags'] = X['Hashtags'].fillna('<missing>')
X['Created At'] = pd.to_datetime(X['Created At'])
X['Year'] = X['Created At'].dt.year
X['Month'] = X['Created At'].dt.month
X['Hour'] = X['Created At'].dt.hour

# Drop User ID og Created At (efter at have udledt funktioner)
X = X.drop(columns=['User ID', 'Created At'])

# Identificer kategoriske og numeriske kolonner
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Skaler numeriske kolonner
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Kolonnetransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split datasæt
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Feature selection og modeltræning
def select_features(X_train, y_train, X_val, k=10):
    # Transformér data
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_val_transformed = preprocessor.transform(X_val)
    
    # Brug f_classif i stedet for chi2 for at undgå problemer med negative værdier
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train_transformed, y_train)
    X_val_selected = selector.transform(X_val_transformed)
    return X_train_selected, X_val_selected, selector

# Initialiser modeller inkl. XGBoost
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False)
}

results = {}

for name, model in models.items():
    print(f"Training {name}...")
    X_train_selected, X_val_selected, selector = select_features(X_train, y_train, X_val, k=10)
    
    # Hyperparameter tuning for Random Forest og XGBoost
    if name == "Random Forest":
        param_grid = {
            'n_estimators': [50, 100],
            'max_depth': [None, 10, 20]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train_selected, y_train)
        model = grid_search.best_estimator_
    elif name == "XGBoost":
        param_grid = {
            'n_estimators': [50, 100],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train_selected, y_train)
        model = grid_search.best_estimator_
    else:
        model.fit(X_train_selected, y_train)

    y_val_pred = model.predict(X_val_selected)
    y_val_proba = model.predict_proba(X_val_selected)[:, 1] if hasattr(model, 'predict_proba') else y_val_pred

    results[name] = {
        "Accuracy": accuracy_score(y_val, y_val_pred),
        "Precision": precision_score(y_val, y_val_pred),
        "Recall": recall_score(y_val, y_val_pred),
        "F1 Score": f1_score(y_val, y_val_pred),
        "ROC AUC": roc_auc_score(y_val, y_val_proba)
    }

# Evaluér på test-sæt
best_model_name = max(results, key=lambda k: results[k]['Accuracy'])
best_model = models[best_model_name]

X_test_transformed = preprocessor.transform(X_test)
X_test_selected = selector.transform(X_test_transformed)
y_test_pred = best_model.predict(X_test_selected)
y_test_proba = best_model.predict_proba(X_test_selected)[:, 1] if hasattr(best_model, 'predict_proba') else y_test_pred

# Udskriv resultater
print("\nFinal Evaluation on Test Set:")
print(f"Best Model: {best_model_name}")
print(classification_report(y_test, y_test_pred))
print(f"Test Set ROC AUC: {roc_auc_score(y_test, y_test_proba):.4f}")


Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


Final Evaluation on Test Set:
Best Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.49      1.00      0.66      3702
           1       0.71      0.00      0.00      3798

    accuracy                           0.49      7500
   macro avg       0.60      0.50      0.33      7500
weighted avg       0.61      0.49      0.33      7500

Test Set ROC AUC: 0.5001


Parameters: { "use_label_encoder" } are not used.



In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   User ID         50000 non-null  int64 
 1   Username        50000 non-null  object
 2   Tweet           50000 non-null  object
 3   Retweet Count   50000 non-null  int64 
 4   Mention Count   50000 non-null  int64 
 5   Follower Count  50000 non-null  int64 
 6   Verified        50000 non-null  bool  
 7   Bot Label       50000 non-null  int64 
 8   Location        50000 non-null  object
 9   Created At      50000 non-null  object
 10  Hashtags        50000 non-null  object
dtypes: bool(1), int64(5), object(5)
memory usage: 3.9+ MB


In [26]:
data = data.sample(n=5000, random_state=42)

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
import pandas as pd
import numpy as np

# Data Preparation
X = data.drop(columns=['Bot Label'])  # Features
y = data['Bot Label']  # Target

# Håndtering af manglende værdier og tidsfunktioner
X['Hashtags'] = X['Hashtags'].fillna('<missing>')
X['Created At'] = pd.to_datetime(X['Created At'])
X['Year'] = X['Created At'].dt.year
X['Month'] = X['Created At'].dt.month
X['Hour'] = X['Created At'].dt.hour

# Drop User ID og Created At (efter at have udledt funktioner)
X = X.drop(columns=['User ID', 'Created At'])

# Identificer kategoriske kolonner
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Skaler numeriske kolonner
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Kolonnetransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split datasæt
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialiser modeller
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = {}
trained_models = {}  # Dictionary to store the actual trained models

# Training & Hyperparameter tuning
for name, model in models.items():
    print(f"Training {name}...")

    # Transform features
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_val_transformed = preprocessor.transform(X_val)

    # Hyperparameter tuning for Random Forest and XGBoost
    if name == "Random Forest":
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train_transformed, y_train)
        trained_models[name] = grid_search.best_estimator_
    elif name == "XGBoost":
        param_grid_xgb = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'n_estimators': [50, 100, 200]
        }
        grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=5, scoring='accuracy')
        grid_search_xgb.fit(X_train_transformed, y_train)
        trained_models[name] = grid_search_xgb.best_estimator_
    else:
        model.fit(X_train_transformed, y_train)
        trained_models[name] = model

    # Store model evaluation metrics
    y_val_pred = trained_models[name].predict(X_val_transformed)
    y_val_proba = trained_models[name].predict_proba(X_val_transformed)[:, 1] if hasattr(trained_models[name], 'predict_proba') else y_val_pred

    results[name] = {
        "Accuracy": accuracy_score(y_val, y_val_pred),
        "Precision": precision_score(y_val, y_val_pred),
        "Recall": recall_score(y_val, y_val_pred),
        "F1 Score": f1_score(y_val, y_val_pred),
        "ROC AUC": roc_auc_score(y_val, y_val_proba)
    }

# Evaluér på test-sæt
best_model_name = max(results, key=lambda k: results[k]['Accuracy'])
best_model = trained_models[best_model_name]  # Retrieve the actual best model
X_test_transformed = preprocessor.transform(X_test)
y_test_pred = best_model.predict(X_test_transformed)
y_test_proba = best_model.predict_proba(X_test_transformed)[:, 1] if hasattr(best_model, 'predict_proba') else y_test_pred

print("\nFinal Evaluation on Test Set:")
print(f"Best Model: {best_model_name}")
print(classification_report(y_test, y_test_pred))
print(f"Test Set ROC AUC: {roc_auc_score(y_test, y_test_proba):.4f}")

# Print results for all models
for name, metrics in results.items():
    print(f"\n{name}")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")


Training Logistic Regression...
Training Decision Tree...
Training Random Forest...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


Final Evaluation on Test Set:
Best Model: Random Forest
              precision    recall  f1-score   support

           0       0.62      0.03      0.05       391
           1       0.48      0.98      0.65       359

    accuracy                           0.48       750
   macro avg       0.55      0.50      0.35       750
weighted avg       0.56      0.48      0.33       750

Test Set ROC AUC: 0.5240

Logistic Regression
Accuracy: 0.4907
Precision: 0.4857
Recall: 0.5514
F1 Score: 0.5165
ROC AUC: 0.4913

Decision Tree
Accuracy: 0.4907
Precision: 0.4837
Recall: 0.4811
F1 Score: 0.4824
ROC AUC: 0.4905

Random Forest
Accuracy: 0.4973
Precision: 0.4952
Recall: 0.9838
F1 Score: 0.6588
ROC AUC: 0.5007

XGBoost
Accuracy: 0.4813
Precision: 0.4836
Recall: 0.7568
F1 Score: 0.5901
ROC AUC: 0.4992


Parameters: { "use_label_encoder" } are not used.



# GPU

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
import pandas as pd
import numpy as np

# Data Preparation
X = data.drop(columns=['Bot Label'])  # Features
y = data['Bot Label']  # Target

# Håndtering af manglende værdier og tidsfunktioner
X['Hashtags'] = X['Hashtags'].fillna('<missing>')
X['Created At'] = pd.to_datetime(X['Created At'])
X['Year'] = X['Created At'].dt.year
X['Month'] = X['Created At'].dt.month
X['Hour'] = X['Created At'].dt.hour

# Drop User ID og Created At (efter at have udledt funktioner)
X = X.drop(columns=['User ID', 'Created At'])

# Identificer kategoriske kolonner
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Skaler numeriske kolonner
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Kolonnetransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split datasæt
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialiser XGBoost model
model = xgb.XGBClassifier(
    use_label_encoder=False, 
    eval_metric='logloss', 
    tree_method='gpu_hist',  # This ensures GPU usage
    gpu_id=0,                # Specify the GPU device (if you have more than one, adjust accordingly)
    predictor='gpu_predictor'  # Ensure predictions also use GPU
)

# Hyperparameter tuning for XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]
}

# Transform training data
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Convert sparse matrix to dense format (XGBoost requires dense format for GPU)
X_train_transformed = X_train_transformed.toarray()
X_val_transformed = X_val_transformed.toarray()

# GridSearchCV for hyperparameter tuning
grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train_transformed, y_train)

# Store best model
trained_model = grid_search_xgb.best_estimator_

# Train and evaluate the model
trained_model.fit(X_train_transformed, y_train)

# Evaluate model performance on validation set
y_val_pred = trained_model.predict(X_val_transformed)
y_val_proba = trained_model.predict_proba(X_val_transformed)[:, 1] if hasattr(trained_model, 'predict_proba') else y_val_pred

# Store model evaluation metrics
results = {
    "Accuracy": accuracy_score(y_val, y_val_pred),
    "Precision": precision_score(y_val, y_val_pred),
    "Recall": recall_score(y_val, y_val_pred),
    "F1 Score": f1_score(y_val, y_val_pred),
    "ROC AUC": roc_auc_score(y_val, y_val_proba)
}

# Final evaluation on test set
X_test_transformed = preprocessor.transform(X_test)
X_test_transformed = X_test_transformed.toarray()  # Convert test set to dense

y_test_pred = trained_model.predict(X_test_transformed)
y_test_proba = trained_model.predict_proba(X_test_transformed)[:, 1] if hasattr(trained_model, 'predict_proba') else y_test_pred

print("\nFinal Evaluation on Test Set:")
print(f"Best Model: XGBoost")
print(classification_report(y_test, y_test_pred))
print(f"Test Set ROC AUC: {roc_auc_score(y_test, y_test_proba):.4f}")

# Print results
for metric, score in results.items():
    print(f"{metric}: {score:.4f}")


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x16863b380>>
Traceback (most recent call last):
  File "/Users/hedegaard/Documents/M3-TWITTERPROJECT/DDBMS-Submission/.conda/lib/python3.12/site-packages/xgboost/core.py", line 582, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


: 