In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
import os
import matplotlib.pyplot as plt
import seaborn as sns
#TODO: CHANGE THE FOLDER and SAVE PATHS
# Process-wide environment settings for the CUDA allocator, TensorFlow logging
# verbosity and GPU visibility.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use only GPU 0

# Report which device will be used, then pick it.
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available, using CPU instead.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

# Load Data
# Paths are assembled with os.path.join so the notebook is not tied to the
# Windows path separator (on Windows the resulting strings are unchanged).
# NOTE(review): the train folder is resolved from the parent directory ("..")
# while the test folder is resolved from the current directory. One of the two
# is probably wrong for any given working directory - confirm and align them.
folder_path_train = os.path.join("..", "data", "cleaned", "train")
folder_path_test = os.path.join("data", "cleaned", "test")
names = ["Facebook", "Reddit", "Twitter", "Youtube"]

# One DataFrame per platform, keyed by the display name; the files on disk are
# named "<lowercase platform>_train.csv" / "<lowercase platform>_test.csv".
dfs_train = {n: pd.read_csv(os.path.join(folder_path_train, f"{n.lower()}_train.csv")) for n in names}
dfs_test = {n: pd.read_csv(os.path.join(folder_path_test, f"{n.lower()}_test.csv")) for n in names}

ImportError: cannot import name 'get_full_repo_name' from 'huggingface_hub' (c:\Users\haniw\anaconda3\envs\ImageProcessing\Lib\site-packages\huggingface_hub\__init__.py)

In [None]:
# Train models on one platform and test on others
def _train_model(model, train_loader, optimizer, scheduler, num_epochs=3, accumulation_steps=4):
    """Fine-tune ``model`` in place using gradient accumulation.

    Args:
        model: Sequence-classification model already moved to ``device``.
        train_loader: DataLoader yielding dict batches that contain a
            ``'label'`` entry plus the model's input tensors.
        optimizer: Optimizer over ``model.parameters()``.
        scheduler: Learning-rate scheduler; stepped once per optimizer step.
        num_epochs: Number of passes over ``train_loader``.
        accumulation_steps: Number of batches whose gradients are summed
            before each optimizer step.
    """
    num_batches = len(train_loader)

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0.0

        for i, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}")):
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'label'}
            labels = batch['label'].to(device)
            outputs = model(**inputs, labels=labels)

            # Track the unscaled loss so the reported average is the real
            # per-batch loss; only the value used for backprop is scaled.
            total_train_loss += outputs.loss.item()
            (outputs.loss / accumulation_steps).backward()

            # Step on every full accumulation group AND on the last batch of
            # the epoch, so gradients of a trailing partial group are applied
            # instead of leaking into the next epoch or being discarded.
            is_last_batch = (i + 1) == num_batches
            if (i + 1) % accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                torch.cuda.empty_cache()  # Clear GPU cache periodically

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = total_train_loss / max(num_batches, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print(f'Train Loss: {avg_train_loss:.4f}')


def _evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect labels and predictions.

    Returns:
        A ``(true_labels, predictions)`` tuple of lists; each prediction is
        the argmax over the model's logits for one example.
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'label'}
            logits = model(**inputs).logits
            predictions.extend(torch.argmax(logits, dim=1).detach().cpu().numpy())
            true_labels.extend(batch['label'].detach().cpu().numpy())

    return true_labels, predictions


def _plot_confusion_matrix(true_labels, predictions, class_names, platform_name):
    """Draw a confusion-matrix heatmap for one test platform."""
    # Pin the label set so the matrix always has one row/column per class name,
    # even when a class never appears in the labels or the predictions.
    cm = confusion_matrix(true_labels, predictions, labels=list(range(len(class_names))))
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
    plt.title(f"Confusion Matrix - {platform_name}")
    plt.ylabel("Actual")
    plt.xlabel("Predicted")
    plt.show()


def train_and_evaluate(train_name, test_names, model_name, model_class, tokenizer_class, save_path,
                       num_epochs=3, batch_size=4, accumulation_steps=4, max_length=128,
                       learning_rate=2e-5):
    """Fine-tune a classifier on one platform and evaluate it on others.

    Reads the training frame from the module-level ``dfs_train`` and the test
    frames from ``dfs_test``; both are expected to provide ``'text'`` and
    ``'label'`` columns. The fine-tuned model and its tokenizer are written to
    ``save_path``, and a confusion matrix is plotted for every test platform.

    Args:
        train_name: Key into ``dfs_train`` selecting the training platform.
        test_names: Iterable of keys into ``dfs_test`` to evaluate on.
        model_name: Pretrained checkpoint name passed to ``from_pretrained``.
        model_class: Model class exposing ``from_pretrained``.
        tokenizer_class: Tokenizer class exposing ``from_pretrained``.
        save_path: Directory handed to ``save_pretrained`` for model and tokenizer.
        num_epochs: Number of training epochs.
        batch_size: Batch size for both the training and the test loaders.
        accumulation_steps: Batches accumulated per optimizer step.
        max_length: Token length every example is padded/truncated to.
        learning_rate: Learning rate given to the optimizer.

    Returns:
        Dict mapping each test platform name to the dict produced by
        ``classification_report(..., output_dict=True)``.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    # Tokenization
    tokenizer = tokenizer_class.from_pretrained(model_name)

    def tokenize_function(examples):
        return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)

    def build_loader(df, shuffle):
        # .assign returns a new frame, so the shared dfs_train / dfs_test
        # entries are not modified by the label cast.
        dataset = Dataset.from_pandas(df.assign(label=df['label'].astype(int)))
        tokenized = dataset.map(tokenize_function, batched=True)
        tokenized = tokenized.remove_columns(['text'])
        tokenized.set_format('torch')
        return DataLoader(tokenized, batch_size=batch_size, shuffle=shuffle)

    # Prepare training data
    train_loader = build_loader(dfs_train[train_name], shuffle=True)

    # Load and Train Model
    model = model_class.from_pretrained(model_name, num_labels=2)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # The scheduler advances once per optimizer step, not once per batch, so
    # its horizon is the number of accumulation groups (rounded up) per epoch.
    steps_per_epoch = (len(train_loader) + accumulation_steps - 1) // accumulation_steps
    total_steps = steps_per_epoch * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    _train_model(model, train_loader, optimizer, scheduler,
                 num_epochs=num_epochs, accumulation_steps=accumulation_steps)

    # Save the model and tokenizer
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Evaluate on test datasets
    results = {}
    for test_name in test_names:
        test_loader = build_loader(dfs_test[test_name], shuffle=False)
        true_labels, predictions = _evaluate_model(model, test_loader)
        results[test_name] = classification_report(true_labels, predictions, output_dict=True)
        _plot_confusion_matrix(true_labels, predictions, ['Class 0', 'Class 1'], test_name)

    return results



In [None]:

# Fine-tune BERT on the Facebook data and evaluate it on the other platforms.
# NOTE(review): every training cell in this notebook passes the same save_path,
# so each run replaces the checkpoint written by the previous one.
bert_results_facebook = train_and_evaluate(
    train_name="Facebook",
    test_names=["Reddit", "Twitter", "Youtube"],
    model_name="bert-base-uncased",
    model_class=BertForSequenceClassification,
    tokenizer_class=BertTokenizer,
    save_path='ml/bert_model',
)

In [None]:
# Fine-tune BERT on the Reddit data and evaluate it on the other platforms.
# NOTE(review): save_path is shared with the other training cells, so this run
# overwrites whatever checkpoint is already stored there.
bert_results_reddit = train_and_evaluate(
    train_name="Reddit",
    test_names=["Facebook", "Twitter", "Youtube"],
    model_name="bert-base-uncased",
    model_class=BertForSequenceClassification,
    tokenizer_class=BertTokenizer,
    save_path='ml/bert_model',
)


In [None]:
# Fine-tune BERT on the Twitter data and evaluate it on the other platforms.
# NOTE(review): save_path is shared with the other training cells, so this run
# overwrites whatever checkpoint is already stored there.
bert_results_twitter = train_and_evaluate(
    train_name="Twitter",
    test_names=["Reddit", "Facebook", "Youtube"],
    model_name="bert-base-uncased",
    model_class=BertForSequenceClassification,
    tokenizer_class=BertTokenizer,
    save_path='ml/bert_model',
)


In [None]:
# Fine-tune BERT on the YouTube data and evaluate it on the other platforms.
# NOTE(review): save_path is shared with the other training cells, so this run
# overwrites whatever checkpoint is already stored there.
bert_results_youtube = train_and_evaluate(
    train_name="Youtube",
    test_names=["Reddit", "Facebook", "Twitter"],
    model_name="bert-base-uncased",
    model_class=BertForSequenceClassification,
    tokenizer_class=BertTokenizer,
    save_path='ml/bert_model',
)


In [None]:
# Function to plot performance metrics across platforms
def plot_performance(results, model_name):
    """Draw one bar chart per metric comparing the evaluated platforms.

    Args:
        results: Dict mapping a platform name to a report dict shaped like the
            output of ``classification_report(..., output_dict=True)``: a
            top-level ``'accuracy'`` number plus a ``'weighted avg'`` dict
            holding ``'precision'``, ``'recall'`` and ``'f1-score'``.
        model_name: Label used in each chart title.
    """
    platforms = list(results.keys())
    metrics = ['accuracy', 'precision', 'recall', 'f1-score']

    for metric in metrics:
        if metric == 'accuracy':
            # Accuracy is a top-level scalar in the report; it is not one of
            # the keys of the 'weighted avg' entry.
            values = [results[platform]['accuracy'] for platform in platforms]
        else:
            values = [results[platform]['weighted avg'][metric] for platform in platforms]

        plt.figure(figsize=(10, 6))
        plt.bar(platforms, values, color='skyblue')
        plt.xlabel('Platform')
        plt.ylabel(metric.capitalize())
        plt.title(f'{metric.capitalize()} across platforms for {model_name}')
        plt.ylim(0, 1)
        # Print each bar's value just above it.
        for i, v in enumerate(values):
            plt.text(i, v + 0.02, f"{v:.2f}", ha='center')
        plt.show()

# Charts for the BERT model that was fine-tuned on Facebook
print("BERT Results facebook:")
plot_performance(results=bert_results_facebook, model_name="BERT")



In [None]:
# Charts for the BERT model that was fine-tuned on Reddit
print("BERT Results reddit:")
plot_performance(results=bert_results_reddit, model_name="BERT")

# Charts for the BERT model that was fine-tuned on YouTube
print("BERT Results youtube:")
plot_performance(results=bert_results_youtube, model_name="BERT")

# Charts for the BERT model that was fine-tuned on Twitter
print("BERT Results twitter:")
plot_performance(results=bert_results_twitter, model_name="BERT")