# Import Libraries

In [1]:
!pip install -r requirements.txt -q

In [2]:
# Datahandling
import requests
import os
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2

# Data

In [21]:
# Data import 
data = pd.read_csv('TwitterData_Joined.csv')

In [22]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
data.head(15)

First few rows of the dataset:


Unnamed: 0,Twitter_User_Name,Twitter_Account,Twitter_User_Description,Tweet_id,Tweet_created_at,Tweet_text,Label,Word_Count,Url_Count,Retweet,...,Adverb_Count,Positive_Word_Ratio,Negative_Word_Ratio,Neutral_Word_Ratio,Following,Followers,Verified,Link,Location,Real_Location
0,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,8.02758e+17,27-11-2016 06:15,Imperial Theatrical Coat for Court Lady https:...,0,8,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
1,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,8.74692e+17,13-06-2017 18:15,Half-length Figure of St Paul in an Oval. http...,0,10,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
2,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,6.9839e+17,13-02-2016 06:15,Great Exhibition Jurors&amp;#39; Medal https:/...,0,6,2,0,...,0,0.125,0.0,0.875,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
3,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,6.97665e+17,11-02-2016 06:15,Pair of candelabra https://t.co/KYopSWDSw2 htt...,0,5,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
4,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,6.21745e+17,16-07-2015 18:15,Banner (Nobori)\n http://t.co/yz34Xgo9a5 http:...,0,4,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
5,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,8.78391e+17,23-06-2017 23:15,A Small Piece of Turf https://t.co/Yucl869j6X ...,0,7,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
6,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,8.85201e+17,12-07-2017 18:15,Finger ring https://t.co/qAJhdi4VCG https://t....,0,4,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
7,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,6.61879e+17,04-11-2015 12:15,Ornaments https://t.co/WnUMY17C9w https://t.co...,0,3,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
8,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,5.57315e+17,19-01-2015 23:15,Lady Maria Conyngham (died 1843) http://t.co/I...,0,7,2,0,...,0,0.0,0.076923,0.923077,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0
9,Museum Bot,MuseumBot,I am a bot that tweets a random high-res Open ...,5.35572e+17,20-11-2014 23:15,Frame http://t.co/9hsuSqDCah http://t.co/JYWtW...,0,3,2,0,...,0,0.0,0.0,1.0,0,7816,0,https://twitter.com/MuseumBot?s=20,,-1.0


In [23]:
import pandas as pd
import re

# Antager at du allerede har importeret datasættet som 'data'
# Hvis datasættet er importeret som 'data', kan du fortsætte direkte fra her

# Funktion til at finde URL'er i tekst
# Opdateret regex for at finde mere generelle URL'er
def find_urls(text):
    # Regular expression til at finde URL'er, som kan inkludere både http(s) og uden http
    url_pattern = r'(\b(?:https?://|www\.)[a-zA-Z0-9-_.]+\.[a-zA-Z]{2,}\b(?:[^\s]*))'
    return re.findall(url_pattern, text)

# Find alle URL'er i Tweet-kolonnen
urls = data['Tweet_text'].apply(find_urls)

# Saml alle URL'er i en liste
all_urls = [url for sublist in urls for url in sublist]

# Fjern duplikater
unique_urls = list(set(all_urls))

# Udskriv listen med unikke URL'er
print(unique_urls)



['https://t.co/Seh6k5oUf0', 'http://t.co/vtOdpk0Wxh\\n#Thegigglinggene', 'http://t.co/wW7E3xrohJ', 'https://t.co/dLBdmfKMmp', 'https://t.co/UqhbsxwheZ', 'http://t.co/At1mfERRPo', 'http://t.co/xfrNZZKfiO', 'https://t.co/mVfDCdYOyR', 'https://t.co/AFuE0D86Pu', 'https://t.co/0FfIpeRF9Z', 'https://t.co/7QTDKEjVMx', 'http://t.co/meyOJtWQ', 'https://t.co/Of4ZyyEpHx', 'https://t.co/sVQW3RhZsf', 'https://t.co/Sm81x8WNKp', 'https://t.co/hNUUOOItxR', 'https://t.co/jKlBwCVET0', 'http://t.co/htnuwKJdP9', 'http://bit.ly/3KFbmK', 'https://t.co/iuITrvfc4a', 'http://t.co/es0zpaaHlQ', 'https://t.co/txTAevaams', 'https://t.co/7uy2sSqAUq', 'https://t.co/d0QOMYb3xT', 'https://t.co/8xkMvn4YWP', 'https://t.co/QowLEhXRCd', 'https://t.co/2NKiYtsgGp', 'https://t.co/moYDrPdBkj', 'https://t.co/V2BxXcqDhi', 'https://t.co/ZguYFdetwY', 'https://t.co/b0X7C3bJt5', 'http://t.co/7kdsV5egIy', 'https://t.co/waeNWlnwQY', 'https://t.co/knL5M94DWM', 'http://t.co/C7XGOd0gcI', 'https://t.co/FEQFnAQsQW', 'http://t.co/WAIARTKIi

In [25]:
data.Label.value_counts()

Label
1    144224
0    135467
Name: count, dtype: int64

In [24]:
# Display basic information about the dataset
print("\nBasic information about the dataset:")
print(data.info())


Basic information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279691 entries, 0 to 279690
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Twitter_User_Name         279691 non-null  object 
 1   Twitter_Account           279691 non-null  object 
 2   Twitter_User_Description  276027 non-null  object 
 3   Tweet_id                  279691 non-null  float64
 4   Tweet_created_at          279691 non-null  object 
 5   Tweet_text                279691 non-null  object 
 6   Label                     279691 non-null  int64  
 7   Word_Count                279691 non-null  int64  
 8   Url_Count                 279691 non-null  int64  
 9   Retweet                   279691 non-null  int64  
 10  Original_User             58391 non-null   object 
 11  Mentions_Count            279691 non-null  int64  
 12  Hashtags_Count            279691 non-null  int64  
 13  QuesMa

In [27]:
data = pd.read_csv('TwitterData_Joined.csv')
data = data.sample(n=10000, random_state=42)

In [28]:
# Kolonner, der droppes baseret på irrelevans eller sparsomme data
columns_to_drop = [
    "Twitter_User_Name",  # Brugernavn har sjældent prædiktiv værdi
    "Twitter_Account",    # Unik identifikator, der ikke bruges direkte
    "Original_User",      # Mange manglende værdier
    "Location",           # Ofte støjende og sparsomme data
    "Link",               # Ofte irrelevant tekstdata
    "Real_Location",      # Numerisk, men sandsynligvis ikke prædiktiv
    "Tweet_id",           # Unik identifikator
    "Tweet_created_at"    # Tid er allerede bearbejdet (år, måned, time)
]

# Fjern de uønskede kolonner
data = data.drop(columns=columns_to_drop)


In [None]:
# Biblioteker
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
import xgboost as xgb
import pandas as pd
import numpy as np

# Splitting features og target
X = data.drop(columns=["Label"])  # Features
y = data["Label"]  # Target

# Håndtering af manglende værdier
# "Tweet_text" bør ikke udfyldes som <missing>, da det altid har værdi
X.fillna({"Twitter_User_Description": "<missing>"}, inplace=True)  # Beskrivelsen kan være relevant
X.fillna(0, inplace=True)  # Udfyld resten af de numeriske værdier med 0

# Identificer kategoriske kolonner
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Skaler numeriske kolonner
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Kolonnetransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split datasæt
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialiser modeller
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced'),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss')
}

results = {}
trained_models = {}  # Dictionary to store the actual trained models

# Training & Hyperparameter tuning
for name, model in models.items():
    print(f"Training {name}...")

    # Transform features
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_val_transformed = preprocessor.transform(X_val)

    # Hyperparameter tuning for Random Forest and XGBoost
    if name == "Random Forest":
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train_transformed, y_train)
        trained_models[name] = grid_search.best_estimator_
    elif name == "XGBoost":
        param_grid_xgb = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'n_estimators': [50, 100, 200],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
        grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=5, scoring='accuracy')
        grid_search_xgb.fit(X_train_transformed, y_train)
        trained_models[name] = grid_search_xgb.best_estimator_
    else:
        model.fit(X_train_transformed, y_train)
        trained_models[name] = model

    # Store model evaluation metrics
    y_val_pred = trained_models[name].predict(X_val_transformed)
    y_val_proba = trained_models[name].predict_proba(X_val_transformed)[:, 1] if hasattr(trained_models[name], 'predict_proba') else y_val_pred

    results[name] = {
        "Accuracy": accuracy_score(y_val, y_val_pred),
        "Precision": precision_score(y_val, y_val_pred),
        "Recall": recall_score(y_val, y_val_pred),
        "F1 Score": f1_score(y_val, y_val_pred),
        "ROC AUC": roc_auc_score(y_val, y_val_proba)
    }

# Evaluér på test-sæt
best_model_name = max(results, key=lambda k: results[k]['Accuracy'])
best_model = trained_models[best_model_name]  # Retrieve the actual best model
X_test_transformed = preprocessor.transform(X_test)
y_test_pred = best_model.predict(X_test_transformed)
y_test_proba = best_model.predict_proba(X_test_transformed)[:, 1] if hasattr(best_model, 'predict_proba') else y_test_pred

print("\nFinal Evaluation on Test Set:")
print(f"Best Model: {best_model_name}")
print(classification_report(y_test, y_test_pred))
print(f"Test Set ROC AUC: {roc_auc_score(y_test, y_test_proba):.4f}")

# Print results for all models
for name, metrics in results.items():
    print(f"\n{name}")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")



Training Logistic Regression...
Training Decision Tree...
Training Random Forest...


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 27874 to 135776
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Twitter_User_Description  1974 non-null   object 
 1   Tweet_text                2000 non-null   object 
 2   Label                     2000 non-null   int64  
 3   Word_Count                2000 non-null   int64  
 4   Url_Count                 2000 non-null   int64  
 5   Retweet                   2000 non-null   int64  
 6   Mentions_Count            2000 non-null   int64  
 7   Hashtags_Count            2000 non-null   int64  
 8   QuesMark_Count            2000 non-null   int64  
 9   Exclamations_Count        2000 non-null   int64  
 10  SpecialCharacters_Count   2000 non-null   int64  
 11  Nouns_Count               2000 non-null   int64  
 12  Pronouns_Count            2000 non-null   int64  
 13  Verb_Count                2000 non-null   int64  
 14  Adverb_

In [None]:
data = data.sample(n=2000, random_state=42)

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
)
import pandas as pd
import numpy as np

# Data Preparation
X = data.drop(columns=['Bot Label'])  # Features
y = data['Bot Label']  # Target

# Håndtering af manglende værdier og tidsfunktioner
X['Hashtags'] = X['Hashtags'].fillna('<missing>')
X['Created At'] = pd.to_datetime(X['Created At'])
X['Year'] = X['Created At'].dt.year
X['Month'] = X['Created At'].dt.month
X['Hour'] = X['Created At'].dt.hour

# Drop User ID og Created At (efter at have udledt funktioner)
X = X.drop(columns=['User ID', 'Created At'])

# Identificer kategoriske kolonner
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Skaler numeriske kolonner
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Kolonnetransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split datasæt
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialiser modeller
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced'),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = {}
trained_models = {}  # Dictionary to store the actual trained models

# Training & Hyperparameter tuning
for name, model in models.items():
    print(f"Training {name}...")

    # Transform features
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_val_transformed = preprocessor.transform(X_val)

    # Hyperparameter tuning for Random Forest and XGBoost
    if name == "Random Forest":
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train_transformed, y_train)
        trained_models[name] = grid_search.best_estimator_
    elif name == "XGBoost":
        param_grid_xgb = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'n_estimators': [50, 100, 200],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
        grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=5, scoring='accuracy')
        grid_search_xgb.fit(X_train_transformed, y_train)
        trained_models[name] = grid_search_xgb.best_estimator_
    else:
        model.fit(X_train_transformed, y_train)
        trained_models[name] = model

    # Store model evaluation metrics
    y_val_pred = trained_models[name].predict(X_val_transformed)
    y_val_proba = trained_models[name].predict_proba(X_val_transformed)[:, 1] if hasattr(trained_models[name], 'predict_proba') else y_val_pred

    results[name] = {
        "Accuracy": accuracy_score(y_val, y_val_pred),
        "Precision": precision_score(y_val, y_val_pred),
        "Recall": recall_score(y_val, y_val_pred),
        "F1 Score": f1_score(y_val, y_val_pred),
        "ROC AUC": roc_auc_score(y_val, y_val_proba)
    }

# Evaluér på test-sæt
best_model_name = max(results, key=lambda k: results[k]['Accuracy'])
best_model = trained_models[best_model_name]  # Retrieve the actual best model
X_test_transformed = preprocessor.transform(X_test)
y_test_pred = best_model.predict(X_test_transformed)
y_test_proba = best_model.predict_proba(X_test_transformed)[:, 1] if hasattr(best_model, 'predict_proba') else y_test_pred

print("\nFinal Evaluation on Test Set:")
print(f"Best Model: {best_model_name}")
print(classification_report(y_test, y_test_pred))
print(f"Test Set ROC AUC: {roc_auc_score(y_test, y_test_proba):.4f}")

# Print results for all models
for name, metrics in results.items():
    print(f"\n{name}")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")


KeyError: "['Bot Label'] not found in axis"

# GPU