In [1]:
import pandas as pd
df=pd.read_csv('/kaggle/input/social/Social.csv')

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from transformers import BertTokenizer, BertModel
from PIL import Image
import requests
from io import BytesIO
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [3]:
# Transformations for Image Preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [5]:
# Data Preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from transformers import BertTokenizer, BertModel
from PIL import Image
import requests
from io import BytesIO
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Autoencoder for Image Feature Extraction
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=3, stride=2, padding=1),  # Input from VGG16
            nn.ReLU(),
            nn.Conv2d(256, 128, kernel_size=3, stride=2, padding=1),  # Downsampling
            nn.ReLU(),
            nn.Conv2d(128, 64, kernel_size=3, stride=2, padding=1),  # Further downsampling
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 128, kernel_size=3, stride=2),  # Upsampling
            nn.ReLU(),
            nn.ConvTranspose2d(128, 256, kernel_size=3, stride=2),  # Upsampling
            nn.ReLU(),
            nn.ConvTranspose2d(256, 3, kernel_size=3, stride=2, padding=1),  # Final output
        )

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return latent, reconstructed

# Image Encoder that uses VGG16 and Autoencoder
class ImageEncoder(nn.Module):
    def __init__(self):
        super(ImageEncoder, self).__init__()
        vgg16 = models.vgg16(pretrained=True)
        self.vgg16_features = vgg16.features
        self.autoencoder = Autoencoder()
    
    def forward(self, x):
        with torch.no_grad():
            x = self.vgg16_features(x)  # Get feature maps from VGG16
        latent, _ = self.autoencoder(x)  # Pass through autoencoder
        return latent.view(latent.size(0), -1)  # Flatten for further processing

# Text Encoder (e.g., BERT for Tweets)
class TextEncoder(nn.Module):
    def __init__(self):
        super(TextEncoder, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.pooler_output  # CLS token embedding

# Transformer Encoder for Combined Features
class TransformerEncoder(nn.Module):
    def __init__(self, input_size, num_heads=4, ff_size=512, num_layers=2):
        super(TransformerEncoder, self).__init__() 
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_size, nhead=num_heads, dim_feedforward=ff_size
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        x = self.transformer_encoder(x)
        return x

# Bot Classification Model
class BotDetectionModel(nn.Module):
    def __init__(self):
        super(BotDetectionModel, self).__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        self.transformer_encoder = TransformerEncoder(input_size=256 * 7 * 7 + 768)  # Image features + Text embeddings
        self.dense_layer = nn.Sequential(
            nn.Linear(256 * 7 * 7 + 768 + 2, 128),  # Adding followers and following counts
            nn.ReLU(),
            nn.Linear(128, 1),  # Binary classification (Bot or Not)
            nn.Sigmoid()
        )
    
    def forward(self, images, input_ids, attention_mask, followers_count, following_count):
        # Process image inputs through VGG16 + Autoencoder
        image_features = self.image_encoder(images)
        
        # Process text inputs through BERT
        text_embeddings = self.text_encoder(input_ids, attention_mask)
        
        # Concatenate image and text embeddings
        combined_features = torch.cat((image_features, text_embeddings), dim=1)
        
        # Forward through Transformer Encoder
        transformer_output = self.transformer_encoder(combined_features.unsqueeze(1))  # Add sequence dimension
        
        # Add follower and following counts
        combined_with_counts = torch.cat((transformer_output.squeeze(1), followers_count, following_count), dim=1)
        
        # Dense layer for final classification
        output = self.dense_layer(combined_with_counts)
        return output

# Custom Dataset Class for Bot Detection
class BotDetectionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform
    
    def load_image_from_url(self, url):
        try:
            response = requests.get(url)
            img = Image.open(BytesIO(response.content)).convert('RGB')
            if self.transform:
                img = self.transform(img)
            return img
        except:
            return torch.zeros(3, 224, 224)  # Handle invalid or missing images
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Load and preprocess images
        profile_image = self.load_image_from_url(row['profile_image_url'])
        banner_image = self.load_image_from_url(row['profile_banner_url'])
        post_images = torch.stack([self.load_image_from_url(url) for url in eval(row['posts_url'])], dim=0).mean(0)  # Average across post images
        
        images = torch.stack([profile_image, banner_image, post_images], dim=0).mean(0)  # Average profile, banner, and post images
        
        # Tokenize the tweet text
        tweet_text = row['Tweet']
        encoding = self.tokenizer(tweet_text, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        
        # Followers and following counts
        followers_count = torch.tensor([row['followers_count']], dtype=torch.float32)
        following_count = torch.tensor([row['friends_count']], dtype=torch.float32)

        # Label (0: Human, 1: Bot)
        label = torch.tensor(1 if row['result'] == 'bot' else 0, dtype=torch.float32)
        
        return images, input_ids, attention_mask, followers_count, following_count, label

In [9]:
# Transformations for Image Preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [10]:
# Assume 'df' is the pandas DataFrame containing your dataset
df = pd.read_csv('/kaggle/input/social/Social.csv')

In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from transformers import BertTokenizer, BertModel
from PIL import Image
import requests
from io import BytesIO
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# VGG16 with Autoencoder for Image Feature Extraction
class ImageEncoder(nn.Module):
    def __init__(self):
        super(ImageEncoder, self).__init__()
        vgg16 = models.vgg16(pretrained=True)
        self.vgg16_features = vgg16.features
        
        # Encoder part of Autoencoder
        self.encoder = nn.Sequential(
            nn.Linear(512 * 7 * 7, 1024),  # Assuming VGG output size (512, 7, 7)
            nn.ReLU(),
            nn.Linear(1024, 256)  # Latent space representation
        )
        
        # Decoder part of Autoencoder
        self.decoder = nn.Sequential(
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512 * 7 * 7),
            nn.Sigmoid()  # Reconstruct the input (if needed during training)
        )
    
    def forward(self, x):
        # Extract features from VGG16
        with torch.no_grad():
            x = self.vgg16_features(x)
        x = torch.flatten(x, start_dim=1)
        
        # Encoder to get latent representation
        latent_rep = self.encoder(x)
        
        # Decoder for training (optional, only needed if you want to reconstruct input during training)
        reconstructed = self.decoder(latent_rep)
        
        return latent_rep, reconstructed

# Text Encoder (e.g., BERT for Tweets)
class TextEncoder(nn.Module):
    def __init__(self):
        super(TextEncoder, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.pooler_output  # CLS token embedding

# Transformer Encoder for Combined Features
class TransformerEncoder(nn.Module):
    def __init__(self, input_size, num_heads=4, ff_size=512, num_layers=2):
        super(TransformerEncoder, self).__init__() 
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_size, nhead=num_heads, dim_feedforward=ff_size
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        x = self.transformer_encoder(x)
        return x

# Bot Classification Model
class BotDetectionModel(nn.Module):
    def __init__(self):
        super(BotDetectionModel, self).__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        self.transformer_encoder = TransformerEncoder(input_size=256 + 768)  # Image features (256) + Text embeddings (768)
        self.dense_layer = nn.Sequential(
            nn.Linear(256 + 768 + 2, 128),  # Adding followers and following counts
            nn.ReLU(),
            nn.Linear(128, 1),  # Binary classification (Bot or Not)
            nn.Sigmoid()
        )
    
    def forward(self, images, input_ids, attention_mask, followers_count, following_count):
        # Process image inputs through VGG16 + Autoencoder (latent representation only)
        image_features, _ = self.image_encoder(images)
        
        # Process text inputs through BERT
        text_embeddings = self.text_encoder(input_ids, attention_mask)
        
        # Concatenate image and text embeddings
        combined_features = torch.cat((image_features, text_embeddings), dim=1)
        
        # Forward through Transformer Encoder
        transformer_output = self.transformer_encoder(combined_features.unsqueeze(1))  # Add sequence dimension
        
        # Add follower and following counts
        combined_with_counts = torch.cat((transformer_output.squeeze(1), followers_count, following_count), dim=1)
        
        # Dense layer for final classification
        output = self.dense_layer(combined_with_counts)
        return output

# Custom Dataset Class for Bot Detection
class BotDetectionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform
    
    def load_image_from_url(self, url):
        try:
            response = requests.get(url)
            img = Image.open(BytesIO(response.content)).convert('RGB')
            if self.transform:
                img = self.transform(img)
            return img
        except:
            return torch.zeros(3, 224, 224)  # Handle invalid or missing images
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Load and preprocess images
        profile_image = self.load_image_from_url(row['profile_image_url'])
        banner_image = self.load_image_from_url(row['profile_banner_url'])
        post_images = torch.stack([self.load_image_from_url(url) for url in eval(row['posts_url'])], dim=0).mean(0)  # Average across post images
        
        images = torch.stack([profile_image, banner_image, post_images], dim=0).mean(0)  # Average profile, banner, and post images
        
        # Tokenize the tweet text
        tweet_text = row['Tweet']
        encoding = self.tokenizer(tweet_text, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        
        # Followers and following counts
        followers_count = torch.tensor([row['followers_count']], dtype=torch.float32)
        following_count = torch.tensor([row['friends_count']], dtype=torch.float32)

        # Label (0: Human, 1: Bot)
        label = torch.tensor(1 if row['result'] == 'bot' else 0, dtype=torch.float32)
        
        return images, input_ids, attention_mask, followers_count, following_count, label

# Transformations for Image Preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Data Preparation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Assume 'df' is the pandas DataFrame contai
# Creating Dataset and DataLoader
dataset = BotDetectionDataset(df, tokenizer, transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Training the Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BotDetectionModel().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [31]:
# Training Loop
for epoch in range(9):
    model.train()
    running_loss = 0.0
    for images, input_ids, attention_mask, followers_count, following_count, labels in dataloader:
        images = images.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        followers_count = followers_count.to(device)
        following_count = following_count.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(images, input_ids, attention_mask, followers_count, following_count)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/10], Loss: {running_loss/len(dataloader)}")

Epoch [1/10], Loss: 1.0211989841892481
Epoch [2/10], Loss: 0.8612674545519841
Epoch [3/10], Loss: 1.3297130643904949
Epoch [4/10], Loss: 1.158986344436033
Epoch [5/10], Loss: 0.5306251735530608
Epoch [6/10], Loss: 0.4608929750000876
Epoch [7/10], Loss: 0.3460945374338153
Epoch [8/10], Loss: 1.06889152570109
Epoch [9/10], Loss: 0.9070639966773585


In [32]:
# Inference on Specific Account
model.eval()
with torch.no_grad():
    # Sample account info (as you would provide during inference)
    sample_row = df.iloc[0]
    images, input_ids, attention_mask, followers_count, following_count, _ = dataset[0]
    
    # Add batch dimension and move to device
    images = images.unsqueeze(0).to(device)
    input_ids = input_ids.unsqueeze(0).to(device)
    attention_mask = attention_mask.unsqueeze(0).to(device)
    followers_count = followers_count.unsqueeze(0).to(device)
    following_count = following_count.unsqueeze(0).to(device)
    
    # Model prediction
    prediction = model(images, input_ids, attention_mask, followers_count, following_count)
    print(f"Bot Probability: {prediction.item()}")

Bot Probability: 0.0064884936437010765


3.983754140790552e-05


In [26]:
model.eval()
with torch.no_grad():
    # Sample account info (as you would provide during inference)
    sample_row = df.iloc[500]
    print(sample_row)
    images, input_ids, attention_mask, followers_count, following_count, _ = dataset[500]
    
    # Add batch dimension and move to device
    images = images.unsqueeze(0).to(device)
    input_ids = input_ids.unsqueeze(0).to(device)
    attention_mask = attention_mask.unsqueeze(0).to(device)
    followers_count = followers_count.unsqueeze(0).to(device)
    following_count = following_count.unsqueeze(0).to(device)
    
    # Model prediction
    prediction = model(images, input_ids, attention_mask, followers_count, following_count)
    print(f"Bot Probability: {prediction.item()}")

id                                                            468385317
created_at_year                                                    2019
screen_name                                                MariaNicola2
profile_image_url     http://pbs.twimg.com/profile_images/7956854877...
profile_banner_url    https://pbs.twimg.com/profile_banners/46838531...
followers_count                                                     287
friends_count                                                       165
result                                                              bot
posts_url             ['https://pbs.twimg.com/media/Ecjk1g8WAAIhxrV?...
Tweet                 Only enough yet popular determine internationa...
Name: 500, dtype: object
Bot Probability: 0.9998499155044556


In [16]:
# Save the model
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'bot_detection_model.pth')


In [26]:
# Load the saved state into the model and optimizer
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Set the model to evaluation mode
model.eval()

BotDetectionModel(
  (image_encoder): ImageEncoder(
    (vgg16_features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch


In [33]:
def evaluate_model(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    all_labels = []
    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        for images, input_ids, attention_mask, followers_count, following_count, labels in dataloader:
            images = images.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            followers_count = followers_count.to(device)
            following_count = following_count.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images, input_ids, attention_mask, followers_count, following_count)
            probabilities = outputs.squeeze().cpu().numpy()
            predictions = (outputs.squeeze() > 0.5).int().cpu().numpy()
            labels = labels.cpu().numpy()

            # Collect all labels and predictions
            all_labels.extend(labels)
            all_predictions.extend(predictions)
            all_probabilities.extend(probabilities)

    # Convert lists to numpy arrays
    all_labels = np.array(all_labels)
    all_predictions = np.array(all_predictions)
    all_probabilities = np.array(all_probabilities)

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    roc_auc = roc_auc_score(all_labels, all_probabilities)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"AUC-ROC: {roc_auc}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }


In [34]:
import numpy as np
df1=pd.read_csv('/kaggle/input/testing/Test.csv')

dataset1 = BotDetectionDataset(df1, tokenizer, transform=transform)

dataloader = DataLoader(dataset1, batch_size=4, shuffle=True)

In [35]:
# Assume you've already loaded your model and data

# Call the evaluation function
metrics = evaluate_model(model, dataloader, device)

# Access individual metrics
print("Final Model Metrics:")
print(metrics)


Accuracy: 0.9019607843137255
Precision: 0.75
Recall: 0.9230769230769231
F1 Score: 0.8275862068965517
AUC-ROC: 0.9251012145748988
Final Model Metrics:
{'accuracy': 0.9019607843137255, 'precision': 0.75, 'recall': 0.9230769230769231, 'f1': 0.8275862068965517, 'roc_auc': 0.9251012145748988}
