In [13]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split


In [14]:
# Pick the CUDA device when torch reports one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Checkpoint name for the sequence-classification model; two output labels are requested.
MODEL_NAME = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# Move the freshly loaded model onto the selected device.
model = model.to(device)


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def tokenize_function(examples):
    """Tokenize the ``acctdesc`` entry of ``examples``.

    Relies on the notebook-level ``tokenizer``, which must be defined before
    this function is first called. Padding to ``max_length`` and truncation
    are both requested with ``max_length=128``.

    Args:
        examples: Mapping with an ``"acctdesc"`` entry holding the text(s)
            to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    descriptions = examples["acctdesc"]
    # 128 tokens: safe for bios and RAM.
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(descriptions, **encode_options)

def compute_metrics(pred):
    """Score a set of predictions with accuracy, F1, precision and recall.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose argmax over the last axis is
            taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    precision, recall, f1 = prf[0], prf[1], prf[2]

    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [16]:
# Location of the labeled dataset, relative to the notebook's working directory.
file_path = '../data/labeled_sunset.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the recorded output shows pandas warning on this call —
# presumably a mixed-type DtypeWarning, which this option addresses; confirm.
df = pd.read_csv(file_path, low_memory=False)

# To view the first few rows of the dataframe
print(df.head())

  df = pd.read_csv(file_path)


   Unnamed: 0              userid       username  \
0           3            22240612  AlArabiya_Eng   
1           4             6135622     dw_espanol   
2           5  848416437030985728   ChangshaCity   
3           8  984429894829592576   pulsoguayaco   
4           9  807095565028917248  linjianyangbe   

                                            acctdesc  \
0  The Arab world’s leading source of global news...   
1  Desde Alemania para América Latina. Todo lo qu...   
2  Changsha, the capital of central China’s Hunan...   
3  🌐✈Blog de aviación, viajes y economía para via...   
4  Nature heals. Birding in China: best photos+vi...   

                     location  following  followers  totaltweets  \
0                       Dubai         46     921780       324925   
1             Berlin, Germany        160    1266110       157669   
2  People's Republic of China        261      47826         3634   
3                        🇪🇨🇺🇸         75        326   

In [17]:
# Keep the first row for each 'id', then reduce to the two columns used for
# training and drop rows where either one is missing.
df = df.drop_duplicates(subset=['id'], keep='first').copy()
df_clean = df[["acctdesc", "label"]].dropna()

# Map string labels to numeric. A label that is not a key of label_map comes
# back from .map() as NaN.
label_map = {"human": 0, "bot": 1}
df_clean["label"] = df_clean["label"].map(label_map)

# Check result (dropna=False so unmapped labels show up as a NaN row instead
# of silently vanishing from the counts)
print(df_clean["label"].value_counts(dropna=False))

# Drop rows with missing descriptions and rows whose label could not be
# mapped; without the latter the int cast below raises on NaN.
df_clean = df_clean.dropna(subset=["acctdesc", "label"]).copy()

# Convert labels to int (.map() produces floats whenever a NaN was present)
df_clean['label'] = df_clean['label'].astype(int)

# Check balance
print(df_clean['label'].value_counts())


label
0    92940
1     5336
Name: count, dtype: int64
label
0    92940
1     5336
Name: count, dtype: int64


In [18]:
# Split first, stratified on the label, so the held-out 20% is never touched
# by the oversampling below.
train_df, test_df = train_test_split(
    df_clean, test_size=0.2, stratify=df_clean["label"], random_state=42
)

# Oversample only training bots: draw 30000 bot rows with replacement.
bots_train = train_df[train_df["label"] == 1]
humans_train = train_df[train_df["label"] == 0]

bots_oversampled = bots_train.sample(n=30000, replace=True, random_state=42)

# Release the frames that are no longer needed. Because df_clean is deleted
# here, this cell cannot be re-run without first re-running the cleaning cell.
del train_df
del df_clean

# Combine and shuffle
train_balanced = pd.concat([humans_train, bots_oversampled]).sample(frac=1.0, random_state=42).reset_index(drop=True)

# Convert to Hugging Face Dataset. preserve_index=False keeps the pandas index
# out of the datasets: test_df still carries its shuffled original index, which
# would otherwise be stored as an extra '__index_level_0__' column in the test
# set only, leaving train and test with different schemas.
train_dataset = Dataset.from_pandas(train_balanced, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
del train_balanced
del test_df

In [19]:
# Load the tokenizer from the same checkpoint as the model (MODEL_NAME) rather
# than from a separately hard-coded "bert-base-uncased", so the tokenizer that
# is used here — and handed to the Trainer — always matches the model.
# NOTE(review): a BERT tokenizer also emits token_type_ids, which the
# DistilBERT model is not expected to accept as an input — confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize both splits in batches with tokenize_function.
train_tokenized = train_dataset.map(tokenize_function, batched=True)
val_tokenized = test_dataset.map(tokenize_function, batched=True)

# Persist the tokenized splits to disk.
train_tokenized.save_to_disk("tokenized_data_train_balanced/train")
val_tokenized.save_to_disk("tokenized_data_train_balanced/test")

Map: 100%|██████████| 104351/104351 [00:14<00:00, 7418.52 examples/s]
Map: 100%|██████████| 19656/19656 [00:02<00:00, 7530.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 104351/104351 [00:00<00:00, 622522.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 19656/19656 [00:00<00:00, 504542.41 examples/s]


In [20]:
# Evaluate, checkpoint and log once per epoch; after training, reload the
# checkpoint with the best "f1" as reported by compute_metrics.
training_args = TrainingArguments(
    output_dir="./trained-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    report_to="none"  # avoids TensorBoard warnings in Colab
)

# NOTE(review): eval_dataset is the held-out test split, so the "best"
# checkpoint is selected on the same data that is later reported — consider
# carving out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`;
    # the truncated warning in the recorded output points at this call.
    # NOTE(review): needs a transformers release that accepts processing_class — confirm.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()



  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3944,0.219112,0.918782,0.832621,0.85697,0.809617
2,0.1603,0.180762,0.953961,0.910429,0.884669,0.937734
3,0.0711,0.173906,0.96482,0.931289,0.908274,0.955501


TrainOutput(global_step=18441, training_loss=0.2086014249863797, metrics={'train_runtime': 7973.4151, 'train_samples_per_second': 37.005, 'train_steps_per_second': 2.313, 'total_flos': 9771325194461184.0, 'train_loss': 0.2086014249863797, 'epoch': 3.0})

In [21]:
# Write the trainer's current model to the versioned output directory.
model_output_dir = "models/trained_userdesc_v1"
trainer.save_model(model_output_dir)
