# SetFit Model Validation Notebook 🧪🤖

In this notebook, we will explore the validation process of our model designed to classify tweets as either bot-generated or human-generated. 🐦✨

### Why Validation? 🔍
Validation of our SetFit model is crucial to assess its performance against real-world data. By comparing our model's predictions with the TwiBot-22 benchmark dataset, we ensure that our insights are accurate and relevant for informed business decision-making. 📊💼

### Dataset Overview 📚
We utilized the TwiBot-22 dataset, which contains labeled profiles of X (formerly Twitter) users, to validate our model's accuracy. This dataset is recognized for its high labeling accuracy and provides a solid foundation for our analysis.

 - Feng, S., Tan, Z., Wan, H., Wang, N., Chen, Z., Zhang, B., Zheng, Q., Zhang, W., Lei, Z., Yang, S., Feng, X., Zhang, Q., Wang, H., Liu, Y., Bai, Y., Wang, H., Cai, Z., Wang, Y., Zheng, L., Ma, Z., Li, J., & Luo, M. (2023). TwiBot-22: Towards graph-based Twitter bot detection. arXiv. https://doi.org/10.48550/arXiv.2206.04564

### Methodology 🛠️
Using the Twikit Python library, we scraped recent tweets from X for user profiles that are labeled as bot or human in the TwiBot-22 dataset, fetching them concurrently with `asyncio` to speed up retrieval. Before validation we down-sample to an equal number of bot and human tweets, so that the reported metrics are not skewed by class imbalance and we can draw meaningful conclusions about our model's performance. 🚀


### Import Libraries 🔌

In [23]:
!pip install -r requirements.txt -q

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
# Datahandling
import requests
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import contractions

# Data Scraper
import json
import time
import nest_asyncio
import asyncio
from twikit.guest import GuestClient
import logging
from datetime import datetime
from IPython.display import clear_output

#Setfit:
from setfit import SetFitModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

### Scraping Tweets from X/Twitter Based on Profiles (Pre-labeled as Bot or Human in the TwiBot-22 Dataset)
#### Fetching data using Guest token 🛩️

In [None]:
# Patch asyncio so that asyncio.run() can be used from a Jupyter or other
# interactive session (must happen before the scraper is started below).
nest_asyncio.apply()

# Configure logging: INFO and above for this notebook's own logger ...
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ... but only WARNING and above for the "httpx" logger, to keep the cell
# output readable (presumably httpx is the HTTP client used by twikit - confirm).
logging.getLogger("httpx").setLevel(logging.WARNING)

# Module-level semaphore shared by every main() coroutine: at most 32 of them
# may be inside their `async with semaphore:` section (one tweet fetch) at once.
semaphore = asyncio.Semaphore(32)

async def fetch_user_tweets(client, user_id, label):
    """
    Fetch one page of tweets for a single user and tag each with its label.

    Args:
        client: Client object exposing an awaitable
            ``get_user_tweets(user_id=..., tweet_type=..., count=...)``.
        user_id: ID of the profile whose tweets are requested.
        label: Ground-truth label of the profile; copied onto every row.

    Returns:
        A list with one dict per tweet (keys ``User_id``, ``Tweet_id``,
        ``Tweet_text``, ``Language``, ``Label``). An empty list is returned
        when the request or the field extraction fails, so that a single
        bad profile never aborts the whole scrape.
    """
    try:
        results = await client.get_user_tweets(
            user_id=user_id,
            tweet_type='Tweets',
            count=20  # Number of tweets to fetch per user
        )

        # Keep only the fields needed for validation
        return [
            {
                'User_id': tweet.user.id,
                'Tweet_id': tweet.id,
                'Tweet_text': tweet.text,
                'Language': tweet.lang,
                'Label': label
            }
            for tweet in results
        ]
    except Exception as exc:
        # Best effort: skip this profile, but leave a trace of *why* it was
        # skipped instead of silently discarding the error.
        logging.getLogger(__name__).warning(
            "Skipping user %s (label=%s): %s: %s",
            user_id, label, type(exc).__name__, exc,
        )
        return []

async def main(random_state):
    """
    Scrape tweets for one random, class-balanced sample of labeled profiles.

    Args:
        random_state: Seed passed to DataFrame.sample(); different seeds
            select different profiles.

    Returns:
        A DataFrame built from the rows returned by fetch_user_tweets() for
        the 50 sampled profiles, or an empty DataFrame if anything fails.
    """
    try:
        # Each call works with its own freshly activated guest client
        client = GuestClient()
        await client.activate()
        logger.info(f"Client initialized for random_state={random_state}.")

        profiles = pd.read_csv('userid_labels-(twi-bot_22).csv')
        is_bot = profiles['label'] == 'bot'
        is_human = profiles['label'] == 'human'

        # 25 bots + 25 humans, drawn with the same seed
        sample_data = pd.concat([
            profiles[is_bot].sample(n=25, random_state=random_state),
            profiles[is_human].sample(n=25, random_state=random_state),
        ])

        # IDs in the label file carry a leading 'u'; strip it
        sample_data['id'] = sample_data['id'].str.lstrip('u')

        collected = []
        for user_id, label in zip(sample_data['id'], sample_data['label']):
            logger.info(f"fetching tweets for user-id: {user_id} (Label: {label})")
            # The shared semaphore caps how many fetches run at the same time
            async with semaphore:
                collected.extend(await fetch_user_tweets(client, user_id, label))

        return pd.DataFrame(collected)

    except Exception as e:
        logger.error(f"General error in random_state={random_state}: {e}")
        # Callers concatenate the results, so hand back an empty frame
        return pd.DataFrame()

async def run_parallel():
    """
    Run main() concurrently for 64 random_state values and save the result.

    The per-seed DataFrames are concatenated, de-duplicated on 'Tweet_id',
    restricted to English tweets and written to
    'scraped_validation_human_bot_twitter_dataset.csv'.

    Returns:
        The combined, filtered DataFrame. If no tweets were fetched at all,
        an empty DataFrame is returned and no file is written.
    """
    random_states = range(0, 64)
    tasks = [main(random_state) for random_state in random_states]

    # All coroutines are scheduled at once; gather() itself applies no limit.
    # The number of simultaneous fetches is bounded inside main() by the
    # module-level semaphore.
    results = await asyncio.gather(*tasks)

    # Combine all results into one DataFrame
    test_df = pd.concat(results, ignore_index=True)

    # main() returns a column-less frame on failure. If every seed failed
    # there is no 'Tweet_id' column and the filters below would raise
    # KeyError, so stop here and leave any previously saved file untouched.
    if test_df.empty:
        clear_output(wait=True)
        print("Total tweets fetched: 0")
        print("No tweets were fetched; nothing was saved.")
        return test_df

    # remove duplicates based on 'Tweet_id' (seeds can sample the same user)
    test_df = test_df.drop_duplicates(subset=['Tweet_id'])

    # English language only
    test_df = test_df[test_df['Language'] == 'en']

    # Save combined file
    clear_output(wait=True)
    output_file = 'scraped_validation_human_bot_twitter_dataset.csv'
    test_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Total tweets fetched: {len(test_df)}")
    print(f"All tweets saved in {output_file}")

    return test_df

# Run in parallel
# Starts the scraper when this cell/script is executed directly. Calling
# asyncio.run() from a notebook relies on the nest_asyncio.apply() patch made
# in the setup code. The returned DataFrame is discarded here; run_parallel()
# also writes it to CSV.
if __name__ == "__main__":
    asyncio.run(run_parallel())
    

### Import the fetched human/bot dataset and take a look at the results 🌎

In [10]:
# Load the scraped tweets from the CSV file written by run_parallel(), so the
# analysis below can be re-run without scraping again.
test_df = pd.read_csv("scraped_validation_human_bot_twitter_dataset.csv")

In [11]:
# Distinct accounts per class (count user IDs, not tweet rows)
unique_bots = test_df.loc[test_df['Label'] == 'bot', 'User_id'].nunique()
unique_humans = test_df.loc[test_df['Label'] == 'human', 'User_id'].nunique()

# Distinct accounts across both classes
total_unique_users = test_df['User_id'].nunique()

# Number of rows in the dataset
total_rows = len(test_df)

# Report the figures
print(f"Number of unique users with label 'bot': {unique_bots}")
print(f"Number of unique users with label 'human': {unique_humans}")
print(f"Total number of unique users: {total_unique_users}")
print(f"Total number of rows: {total_rows}")

Number of unique users with label 'bot': 51
Number of unique users with label 'human': 312
Total number of unique users: 363
Total number of rows: 26009


### Quick data cleaning for SetFit 🧹

In [12]:
# Keep every bot tweet and count them
bot_subset = test_df.loc[test_df['Label'] == "bot"]
bot_count = len(bot_subset)

# Draw the same number of human tweets (fixed seed for reproducibility)
human_subset = test_df.loc[test_df['Label'] == "human"].sample(n=bot_count, random_state=42)

# Combine both classes, shuffle the rows and renumber the index
balanced_test_df = pd.concat([bot_subset, human_subset])
balanced_test_df = balanced_test_df.sample(frac=1, random_state=42)
balanced_test_df = balanced_test_df.reset_index(drop=True)

# Check the new class distribution
print(balanced_test_df['Label'].value_counts())

test_df = balanced_test_df

# Encode the label values as integers for the SetFit model: (bot = 0, human = 1)
test_df['Label'] = test_df['Label'].map({'bot': 0, 'human': 1})

# define functions for cleaning the data
def normalize_text(text):
    """Return *text* with contractions expanded, e.g. "can't" -> "cannot"."""
    return contractions.fix(text)

def clean_text(text):
    """
    Clean a raw tweet for classification.

    Steps, in order: strip URLs, strip @mentions, drop '#' signs (keeping
    the tag text), remove every character that is neither a word character
    nor whitespace, lowercase, and finally expand contractions via
    normalize_text().

    Args:
        text: The raw tweet text (must be a str).

    Returns:
        The cleaned, lower-cased text.
    """
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix matches; unescaped, it matched any character and could
    # swallow ordinary words such as "awwwww".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (but keep the text)
    text = re.sub(r'#', '', text)
    # Remove emojis and special characters (anything that is not \w or \s)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Normalize the text (expand contractions).
    # NOTE(review): apostrophes have already been stripped above, so only
    # apostrophe-free forms can still be expanded here. The order is kept
    # as-is; presumably it must match the preprocessing used when the model
    # was trained - confirm before changing.
    text = normalize_text(text)
    return text

# Data Cleaning
# Drop rows without text BEFORE cleaning: clean_text() runs regexes on its
# argument and raises TypeError on a missing value (NaN), so dropping such
# rows only afterwards would come too late.
test_df.dropna(subset=['Tweet_text'], inplace=True)
test_df['Tweet_text'] = test_df['Tweet_text'].apply(clean_text)  # custom cleaning function

Label
human    3376
bot      3376
Name: count, dtype: int64


In [13]:
test_df.head()

Unnamed: 0,User_id,Tweet_id,Tweet_text,Language,Label
0,25605494,1595145980357226496,i am in my fourth trumprelated court hearing t...,en,1
1,1445492971613212676,1562051819521671168,jeff odonnell joins the conversation and talks...,en,0
2,1337302714322100226,1488619586299781124,a millionaire is not determined by what is in ...,en,0
3,49780957,1643701833288876035,uconn has won as many national championships i...,en,1
4,1339296904811843586,1818789790726168620,you are not human,en,0


In [14]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6752 entries, 0 to 6751
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   User_id     6752 non-null   int64 
 1   Tweet_id    6752 non-null   int64 
 2   Tweet_text  6752 non-null   object
 3   Language    6752 non-null   object
 4   Label       6752 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 263.9+ KB


### SetFit - Second Dataset Validation 🔬

In [15]:
# Only show warnings and errors from the "setfit" logger
logging.getLogger("setfit").setLevel(logging.WARNING)

# Load the saved SetFit model from the local "setfit_model" directory
loaded_model = SetFitModel.from_pretrained("setfit_model")
print("Model indlæst.")

# Inputs and ground truth as plain Python lists
X_test = test_df['Tweet_text'].to_list()  # text data
y_true = test_df['Label'].astype(int).to_list()  # true labels (converted to int)

# Predict a label for every tweet
print("Laver predictions...")
y_pred = loaded_model.predict(X_test)

# Compute the metrics. With average='binary' and the default pos_label=1,
# precision/recall/F1 describe class 1, i.e. "human" in the label mapping
# (bot = 0, human = 1).
print("Beregner metrics...")
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

# Print the headline numbers followed by the per-class report
print("\nValidation results:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print("\nDetaljeret rapport:")
print(classification_report(y_true, y_pred, target_names=["Bot", "Human"]))


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: setfit_model


Model indlæst.
Laver predictions...


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Beregner metrics...

Validation results:
Accuracy: 0.53
Precision: 0.52
Recall: 0.80
F1-score: 0.63

Detaljeret rapport:
              precision    recall  f1-score   support

         Bot       0.57      0.27      0.37      3376
       Human       0.52      0.80      0.63      3376

    accuracy                           0.53      6752
   macro avg       0.55      0.53      0.50      6752
weighted avg       0.55      0.53      0.50      6752



In [16]:
# Only show warnings and errors from the "setfit" logger
logging.getLogger("setfit").setLevel(logging.WARNING)

# Load the saved SetFit model from the local "setfit_model-V3" directory
loaded_model = SetFitModel.from_pretrained("setfit_model-V3")
print("Model indlæst.")

# Inputs and ground truth as plain Python lists
X_test = test_df['Tweet_text'].to_list()  # text data
y_true = test_df['Label'].astype(int).to_list()  # true labels (converted to int)

# Predict a label for every tweet
print("Laver predictions...")
y_pred = loaded_model.predict(X_test)

# Compute the metrics. With average='binary' and the default pos_label=1,
# precision/recall/F1 describe class 1, i.e. "human" in the label mapping
# (bot = 0, human = 1).
print("Beregner metrics...")
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)

# Print the headline numbers followed by the per-class report
print("\nValidation results:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print("\nDetaljeret rapport:")
print(classification_report(y_true, y_pred, target_names=["Bot", "Human"]))


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: setfit_model-V3


Model indlæst.
Laver predictions...


Batches:   0%|          | 0/211 [00:00<?, ?it/s]

Beregner metrics...

Validation results:
Accuracy: 0.55
Precision: 0.54
Recall: 0.75
F1-score: 0.63

Detaljeret rapport:
              precision    recall  f1-score   support

         Bot       0.59      0.35      0.44      3376
       Human       0.54      0.75      0.63      3376

    accuracy                           0.55      6752
   macro avg       0.56      0.55      0.53      6752
weighted avg       0.56      0.55      0.53      6752



In [None]:
# Global semaphore: caps how many get_user_tweets() requests are in flight at
# once across all tasks. asyncio.Semaphore is not re-entrant, so code that
# already holds one of its permits must not await fetch_user_tweets().
semaphore = asyncio.Semaphore(16)

async def fetch_user_tweets(client, user_id, label):
    """
    Fetch one page of tweets for a single user and tag each with its label.

    Args:
        client: Client object exposing an awaitable
            ``get_user_tweets(user_id=..., tweet_type=..., count=...)``.
        user_id: ID of the profile whose tweets are requested.
        label: Ground-truth label of the profile; copied onto every row.

    Returns:
        A list with one dict per tweet (keys ``User_id``, ``Tweet_id``,
        ``Tweet_text``, ``Language``, ``Label``). An empty list is returned
        when the request or the field extraction fails, so that a single
        bad profile never aborts the whole scrape.
    """
    try:
        # Limit concurrent requests with the global semaphore
        async with semaphore:
            results = await client.get_user_tweets(
                user_id=user_id,
                tweet_type='Tweets',
                count=20  # Number of tweets to fetch per user
            )

            # Keep only the fields needed for validation
            return [
                {
                    'User_id': tweet.user.id,
                    'Tweet_id': tweet.id,
                    'Tweet_text': tweet.text,
                    'Language': tweet.lang,
                    'Label': label
                }
                for tweet in results
            ]
    except Exception as exc:
        # Best effort: skip this profile, but leave a trace of *why* it was
        # skipped instead of silently discarding the error.
        logging.getLogger(__name__).warning(
            "Skipping user %s (label=%s): %s: %s",
            user_id, label, type(exc).__name__, exc,
        )
        return []

async def main(random_state):
    """
    Scrape tweets for the profile sample selected by *random_state*.

    Args:
        random_state: Seed passed to DataFrame.sample(); different seeds
            select different profiles.

    Returns:
        A DataFrame built from the rows returned by fetch_user_tweets() for
        the 50 sampled profiles, or an empty DataFrame if anything fails.
    """
    try:
        # Each call works with its own freshly activated guest client
        client = GuestClient()
        await client.activate()
        logger.info(f"Client initialized for random_state={random_state}.")

        profiles = pd.read_csv('userid_labels-(twi-bot_22).csv')
        bots = profiles[profiles['label'] == 'bot']
        humans = profiles[profiles['label'] == 'human']

        # 25 bots + 25 humans, drawn with the same seed
        sample_data = pd.concat([
            bots.sample(n=25, random_state=random_state),
            humans.sample(n=25, random_state=random_state),
        ])

        # IDs in the label file carry a leading 'u'; strip it
        sample_data['id'] = sample_data['id'].str.lstrip('u')

        # Profiles are processed one after another within a single main()
        collected = []
        id_label_pairs = sample_data[['id', 'label']].itertuples(index=False, name=None)
        for user_id, label in id_label_pairs:
            collected.extend(await fetch_user_tweets(client, user_id, label))

        return pd.DataFrame(collected)

    except Exception as e:
        logger.error(f"General error in random_state={random_state}: {e}")
        # Callers concatenate the results, so hand back an empty frame
        return pd.DataFrame()

async def run_parallel():
    """
    Run main() for 128 random_state values, at most 16 at a time, and save
    the combined result.

    The per-seed DataFrames are concatenated, de-duplicated on 'Tweet_id',
    restricted to English tweets and written to
    'TEST_scraped_validation_human_bot_twitter_dataset.csv'.

    Returns:
        The combined, filtered DataFrame. If no tweets were fetched at all,
        an empty DataFrame is returned and no file is written.
    """
    random_states = range(0, 128)

    # Dedicated limiter for whole main() runs. It must NOT be the module-level
    # `semaphore`: fetch_user_tweets() acquires that one again further down
    # the call chain, and asyncio.Semaphore is not re-entrant. Once 16
    # wrappers held every permit, none of their nested fetches could ever
    # acquire one and the run hung forever.
    main_limiter = asyncio.Semaphore(16)

    async def limited_main(random_state):
        async with main_limiter:
            return await main(random_state)

    # Create tasks with the limited_main wrapper
    tasks = [limited_main(random_state) for random_state in random_states]

    # Run the tasks in parallel
    results = await asyncio.gather(*tasks)

    # Combine all results into one DataFrame
    combined_df = pd.concat(results, ignore_index=True)

    # main() returns a column-less frame on failure. If every seed failed
    # there is no 'Tweet_id' column and the filters below would raise
    # KeyError, so stop here without writing a file.
    if combined_df.empty:
        clear_output(wait=True)
        logger.warning("No tweets were fetched; nothing was saved.")
        return combined_df

    # Remove duplicates based on 'Tweet_id' (seeds can sample the same user)
    combined_df = combined_df.drop_duplicates(subset=['Tweet_id'])

    # English language only
    combined_df = combined_df[combined_df['Language'] == 'en']

    # Save combined file
    clear_output(wait=True)
    output_file = 'TEST_scraped_validation_human_bot_twitter_dataset.csv'
    combined_df.to_csv(output_file, index=False, encoding='utf-8')
    logger.info(f"Total tweets fetched: {len(combined_df)}")
    logger.info(f"All tweets saved in {output_file}")

    return combined_df

# Run in parallel
# Starts the scraper when this cell/script is executed directly. Calling
# asyncio.run() from a notebook relies on the nest_asyncio.apply() patch made
# earlier in this file. The returned DataFrame is discarded here;
# run_parallel() also writes it to CSV.
if __name__ == "__main__":
    asyncio.run(run_parallel())