In [9]:
import torch

# Get the number of GPUs
num_gpus = torch.cuda.device_count()

print(f"Number of GPUs Available: {num_gpus}")

# List all GPU names if GPUs are available
if num_gpus > 0:
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPUs available in the system.")


Number of GPUs Available: 1
GPU 0: NVIDIA GeForce RTX 2080 Ti


In [11]:
%pip install pandas

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd

df = pd.read_csv(r"E:\bot_detection_data.csv")
df.head()

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention


In [15]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [17]:
import re
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [19]:
df["Tweet"] = df["Tweet"].apply(clean_text)

In [21]:
bots = df[df["Bot Label"] == 1]
humans = df[df["Bot Label"] == 0]

# Ensure both have equal count (12,500 each)
bots_sampled = bots.sample(n=5000, random_state=42)
humans_sampled = humans.sample(n=5000, random_state=42)

# Combine into a balanced dataset
df_balanced = pd.concat([bots_sampled, humans_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)

# Check new dataset size
df_balanced.to_csv(r"E:\balanced_bot_dataset.csv", index=False)

In [23]:
len(df_balanced)

10000

In [25]:
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [26]:
import torch.nn as nn
import torch.optim as optim
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader

pipe = pipeline("fill-mask", model="distilbert/distilbert-base-uncased")

Device set to use cuda:0


In [29]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
distilbert_model = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased").to(device)

In [31]:
distilbert_model.eval()

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0

In [33]:
import numpy as np
from tqdm import tqdm

def get_bert_embeddings_batch(text_list, batch_size=32):
    embeddings = []
    
    for i in tqdm(range(0, len(text_list), batch_size), desc="Processing Batches on GPU"):
        batch = text_list[i : i + batch_size]  # Get batch of tweets
        
        # ✅ Tokenize & Move Inputs to GPU
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
        
        with torch.no_grad():  # No gradients needed for inference (faster)
            outputs = distilbert_model(**inputs)

        # ✅ Extract CLS token embedding, Move to CPU only after computation
        batch_embeddings = outputs[0][:, 0, :].squeeze().cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    return np.array(embeddings)

In [35]:
df_balanced["BERT_Embedding"] = list(get_bert_embeddings_batch(df_balanced["Tweet"].tolist()))

# ✅ Save to CSV
df_balanced.to_csv(r"E:\bert_embeddings_dataset.csv", index=False)

print("✅ BERT embeddings computed and saved successfully!")

Processing Batches on GPU: 100%|█████████████████████████████████████████████████████| 313/313 [00:05<00:00, 62.15it/s]


✅ BERT embeddings computed and saved successfully!


In [37]:
df_balanced.head()

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags,BERT_Embedding
0,652679,kayla00,cause wall treat dog for rock through nor follow,95,2,197,True,0,North Leonfurt,2022-04-19 09:42:22,church fire have site,"[-7.431979, -7.3733125, -7.379997, -7.3138256,..."
1,169154,lucaslauren,social hard something enough very unit pass ei...,67,0,9596,False,1,West Amandafurt,2020-04-24 06:20:45,century interview stay,"[-6.756239, -6.716732, -6.6539307, -6.67578, -..."
2,295421,tylercooke,remember finish policy write trade other plan,52,0,6962,False,1,North William,2020-02-15 20:10:11,,"[-6.580943, -6.554112, -6.509034, -6.512717, -..."
3,321577,kingstephanie,citizen stock figure surface probably their ma...,90,2,6333,True,1,Jasonhaven,2020-05-16 15:03:58,throw ok Congress treatment,"[-7.1368346, -7.0517673, -6.992747, -7.014969,..."
4,297166,hstark,sport north better black cold modern coach som...,99,0,6718,True,1,Lake Matthew,2020-02-14 01:00:34,debate program past bag,"[-6.5802073, -6.550487, -6.4497786, -6.4803257..."


In [39]:
df_balanced["Verified"] = df_balanced["Verified"].astype(int)

In [41]:
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ✅ Select structured features
structured_features = ["Retweet Count", "Mention Count", "Follower Count", "Verified"]
X_structured = df_balanced[structured_features].fillna(0).values  # Convert to NumPy array

# ✅ Normalize Structured Features (StandardScaler runs on CPU)
scaler = StandardScaler()
X_structured = scaler.fit_transform(X_structured)

# ✅ Convert BERT embeddings to NumPy array
X_bert = np.vstack(df_balanced["BERT_Embedding"].values)

# ✅ Combine BERT Embeddings + Structured Features
X_combined = np.hstack((X_bert, X_structured))

# ✅ Target Variable (Bot Label)
y = df_balanced["Bot Label"].values

# ✅ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# ✅ Convert to Torch Tensors & Move to GPU (This avoids CPU processing delays)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, device="cuda")
X_test_tensor = torch.tensor(X_test, dtype=torch.float32, device="cuda")
y_train_tensor = torch.tensor(y_train, dtype=torch.float32, device="cuda").unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32, device="cuda").unsqueeze(1)

# ✅ Check if tensors are on GPU
print(f"X_train_tensor is on: {X_train_tensor.device}")  # Should print: cuda
print(f"y_train_tensor is on: {y_train_tensor.device}")  # Should print: cuda


X_train_tensor is on: cuda:0
y_train_tensor is on: cuda:0


In [None]:
print(X_structured)

In [43]:
class TwitterDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create DataLoaders
train_dataset = TwitterDataset(X_train_tensor, y_train_tensor)
test_dataset = TwitterDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
class MLPClassifier(nn.Module):
    def __init__(self, input_size):
        super(MLPClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.sigmoid(self.fc3(x))
        return x

# Initialize Model
input_size = X_train.shape[1]
model = MLPClassifier(input_size).to(device)

# Define Loss and Optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train Model
num_epochs = 90
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")

# Evaluate Model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        predicted = (outputs > 0.5).float()
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Epoch [1/90], Loss: 49.2506
Epoch [2/90], Loss: 49.8375
Epoch [3/90], Loss: 49.8375
Epoch [4/90], Loss: 49.8375
Epoch [5/90], Loss: 49.8375
Epoch [6/90], Loss: 49.8375
Epoch [7/90], Loss: 49.8375
Epoch [8/90], Loss: 49.8375
Epoch [9/90], Loss: 49.8375
Epoch [10/90], Loss: 49.8375
Epoch [11/90], Loss: 49.8375
Epoch [12/90], Loss: 49.8375
Epoch [13/90], Loss: 49.8375
Epoch [14/90], Loss: 49.8375
Epoch [15/90], Loss: 49.8375
Epoch [16/90], Loss: 49.8375
Epoch [17/90], Loss: 49.8375
Epoch [18/90], Loss: 49.8375
Epoch [19/90], Loss: 49.8375
Epoch [20/90], Loss: 49.8375
Epoch [21/90], Loss: 49.8375
Epoch [22/90], Loss: 49.8375
Epoch [23/90], Loss: 49.8375
Epoch [24/90], Loss: 49.8375
Epoch [25/90], Loss: 49.8375
Epoch [26/90], Loss: 49.8375
Epoch [27/90], Loss: 49.8375
Epoch [28/90], Loss: 49.8375
Epoch [29/90], Loss: 49.8375
Epoch [30/90], Loss: 49.8375
Epoch [31/90], Loss: 49.8375
Epoch [32/90], Loss: 49.8375
Epoch [33/90], Loss: 49.8375
Epoch [34/90], Loss: 49.8375
Epoch [35/90], Loss: 49