## Import Libraries

In [1]:
!pip install -r requirements.txt -q

In [None]:
# Datahandling
import requests
import os
import numpy as np
import pandas as pd
import seaborn as sns

# Data Scraper
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import json
import time
import nest_asyncio
import asyncio
import json
from twikit.guest import GuestClient
import logging
from datetime import datetime
from IPython.display import clear_output

#Setfit:
from setfit import SetFitModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

### Data Scraping Tweets from X/Twitter Based on Profiles (Pre-labeled as Bot or Human from the Twibot-22 Dataset,) 
#### Fetching data using Guest token 🛩️

In [4]:
# initializes asyncio to run in a jupyter or interactive session
nest_asyncio.apply()

# Configure logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)

# semaphore to limit concurrent tasks
semaphore = asyncio.Semaphore(32)

async def fetch_user_tweets(client, user_id, label):
    """
    Fetch tweets for a specific user.
    """
    try:
        tweets = []
        results = await client.get_user_tweets(
            user_id=user_id,
            tweet_type='Tweets',
            count=20  # Number of tweets to fetch per user
        )
        tweets.extend(results)

        # Extract relevant data
        tweet_data = [
            {
                'User_id': tweet.user.id,
                'Tweet_id': tweet.id,
                'Tweet_text': tweet.text,
                'Language': tweet.lang,
                'Label': label
            }
            for tweet in tweets
        ]

        return tweet_data
    except Exception as e:
        return []

async def main(random_state):
    try:
        # Initialize a new client for each iteration
        client = GuestClient()
        await client.activate()
        logger.info(f"Client initialized for random_state={random_state}.")

        # load dataset and take a sample of 50 user IDs
        data = pd.read_csv('userid_labels-(twi-bot_22).csv')
        
        # Filter the data into 'Bot' and 'Human' groups
        bot_data = data[data['label'] == 'bot']
        human_data = data[data['label'] == 'human']

        # Sample 25 rows from each group (50% Bot and 50% Human)
        bot_sample = bot_data.sample(n=25, random_state=random_state)
        human_sample = human_data.sample(n=25, random_state=random_state)

        # Combine the samples
        sample_data = pd.concat([bot_sample, human_sample])
        
        # Ensure that the 'id' column does not have 'u' prefix
        sample_data['id'] = sample_data['id'].str.lstrip('u')
        
        all_tweets = []

        # fetch tweets for each user in the sample
        for _, row in sample_data.iterrows():
            user_id = row['id']
            label = row['label']
            logger.info(f"fetching tweets for user-id: {user_id} (Label: {label})") # Fills your output with info
            # Limit concurrent tasks with the semaphore
            async with semaphore:
                tweets = await fetch_user_tweets(client, user_id, label)
                all_tweets.extend(tweets)

        return pd.DataFrame(all_tweets)

    except Exception as e:
        logger.error(f"General error in random_state={random_state}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on error

async def run_parallel():
    """
    Run main() in parallel with different random_state values.
    """
    random_states = range(0, 64)
    tasks = [main(random_state) for random_state in random_states]
    
    # Run the tasks in parallel, but limited to a maximum of 16 concurrent
    results = await asyncio.gather(*tasks)
    
    # Combine all results into one DataFrame
    combined_df = pd.concat(results, ignore_index=True)
    
    # remove duplicates based on 'Tweet_id'
    combined_df = combined_df.drop_duplicates(subset=['Tweet_id'])
    
    # English language only
    combined_df = combined_df[combined_df['Language'] == 'en']
    
    # Save combined file
    clear_output(wait=True)
    output_file = 'scraped_validation_human_bot_twitter_dataset.csv'
    combined_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Total tweets fetched: {len(combined_df)}")
    print(f"All tweets saved in {output_file}")

    return combined_df

# Run in parallel
if __name__ == "__main__":
    asyncio.run(run_parallel())


Total tweets fetched: 26030
All tweets saved in scraped_validation_human_bot_twitter_dataset.csv


In [None]:
# Calculate unique User_id for 'bot' and 'human'
unique_bots = test_df[test_df['Label'] == 'bot']['User_id'].nunique()
unique_humans = test_df[test_df['Label'] == 'human']['User_id'].nunique()

# Calculate total unique User_id
total_unique_users = test_df['User_id'].nunique()

# Calculate the total number of rows
total_rows = test_df.shape[0]

# Print the results
print(f"Number of unique users with label 'bot': {unique_bots}")
print(f"Number of unique users with label 'human': {unique_humans}")
print(f"Total number of unique users: {total_unique_users}")
print(f"Total number of rows: {total_rows}")


Number of unique users with label 'bot': 118
Number of unique users with label 'human': 573
Total number of unique users: 691
Total number of rows: 49983


### Import fetched human/bot dataset

In [None]:
test_df = pd.read_csv("scraped_validation_human_bot_twitter_dataset.csv")
test_df['Label'] = test_df['Label'].map({'bot': 0, 'human': 1})

### SetFit - Second Dataset Validation 🔬

In [None]:
logging.getLogger("setfit").setLevel(logging.WARNING)

# load saved setfit model
loaded_model = SetFitModel.from_pretrained("setfit_model")
print("Model indlæst.")

# prepare data
X_test = test_df['Tweet_text'].tolist()  # Tekstdata
y_true = test_df['Label'].astype(int).tolist()  # True labels (konverteret til int)

# create predictions
print("Laver predictions...")
y_pred = loaded_model.predict(X_test)

# calculate metrics
print("Beregner metrics...")
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')

# show results..
print("\nValideringsresultater:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print("\nDetaljeret rapport:")
print(classification_report(y_true, y_pred, target_names=["Bot", "Human"]))


2024-12-03 20:33:22,174 - INFO: Use pytorch device_name: mps
2024-12-03 20:33:22,175 - INFO: Load pretrained SentenceTransformer: setfit_model


Model indlæst.
Laver predictions...


Batches:   0%|          | 0/1432 [00:00<?, ?it/s]

Beregner metrics...

Valideringsresultater:
Accuracy: 0.73
Precision: 0.85
Recall: 0.83
F1-score: 0.84

Detaljeret rapport:
              precision    recall  f1-score   support

         Bot       0.21      0.24      0.22      7216
       Human       0.85      0.83      0.84     38587

    accuracy                           0.73     45803
   macro avg       0.53      0.53      0.53     45803
weighted avg       0.75      0.73      0.74     45803

