**TRAIN OBJECTIFICATION CLASSIFIER**

In [None]:
pip install scikeras


**1. NGRAM**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the labelled dataset from CSV.
# Later cells read the 'text', 'neutral', and 'negative' columns, so the file
# must provide all three.
data = pd.read_csv("/INSERT-DATA-PATH.csv")


In [None]:
# Treat missing labels as 0, then cast both label columns to integers.
data[['neutral', 'negative']] = data[['neutral', 'negative']].fillna(0).astype(int)

In [None]:
# Sanity check: show the distinct values present in each label column.
print(data['neutral'].unique())
print(data['negative'].unique())


In [None]:
# TF-IDF over unigrams, bigrams and trigrams, limited to at most 5000 features
# and materialised as a dense array.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X = vectorizer.fit_transform(data['text']).toarray()

# One label vector per binary task.
y_neutral, y_negative = (data[column].values for column in ('neutral', 'negative'))

# float32 tensors: the features feed the network, the labels feed BCELoss.
X_tensor, y_neutral_tensor, y_negative_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X, y_neutral, y_negative)
)

# Define PyTorch neural network class
class TextClassifier(nn.Module):
    """Feed-forward binary classifier with two hidden layers.

    Layer widths are ``input_dim -> hidden_dim -> hidden_dim // 2 -> 1``.
    Each hidden layer is followed by ReLU and dropout; the output layer is
    followed by a sigmoid.
    """

    def __init__(self, input_dim, hidden_dim, dropout_rate):
        super().__init__()
        narrow_dim = hidden_dim // 2
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, narrow_dim)
        self.fc3 = nn.Linear(narrow_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid activations; the last dimension has size 1."""
        hidden = x
        # Both hidden layers share the same ReLU -> dropout treatment.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.sigmoid(self.fc3(hidden))

# Stratified 5-fold splitter (shuffled, fixed seed so the folds are reproducible)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Filled in by train_and_evaluate: label name -> hyperparameters used for that model
best_params = {}

# Function to train and evaluate model
def train_and_evaluate(y_tensor, label_name):
    """Cross-validate a TextClassifier on the TF-IDF features for one label.

    A fresh model is trained on each training fold produced by the
    module-level ``kf`` splitter and used to score only the rows of the
    matching held-out fold. The printed metrics are computed on those
    out-of-fold predictions, so no row is scored by a model that was trained
    on it.

    Reads the module-level ``X``, ``X_tensor`` and ``kf``; records the
    hyperparameters used in the module-level ``best_params`` under
    ``label_name``.

    NOTE: the TF-IDF vectorizer is fitted on every row before this function
    runs, so vocabulary and IDF weights still see the held-out text.

    Args:
        y_tensor: 1-D float tensor of 0/1 labels aligned with ``X_tensor``.
        label_name: Name used in the printed report and as the
            ``best_params`` key.

    Returns:
        None. Results are printed.
    """
    input_dim = X.shape[1]
    hidden_dim = 128
    dropout_rate = 0.3
    batch_size = 32
    epochs = 10

    # Integer labels for the stratified splitter and the sklearn metrics.
    y_true = y_tensor.numpy().astype(int)
    # Out-of-fold probabilities, filled one validation fold at a time.
    y_pred_proba = np.zeros(len(y_true), dtype=np.float32)
    criterion = nn.BCELoss()

    for train_idx, val_idx in kf.split(X, y_true):
        train_sel = torch.as_tensor(train_idx, dtype=torch.long)
        val_sel = torch.as_tensor(val_idx, dtype=torch.long)

        dataset = TensorDataset(X_tensor[train_sel], y_tensor[train_sel].unsqueeze(1))
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Fresh weights and optimizer state for every fold.
        model = TextClassifier(input_dim, hidden_dim, dropout_rate)
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        for _ in range(epochs):
            model.train()
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                # Outputs and targets are both (batch, 1), so no squeeze is
                # needed and a final batch of one example keeps its shape.
                loss = criterion(model(batch_X), batch_y)
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            y_pred_proba[val_idx] = model(X_tensor[val_sel]).reshape(-1).numpy()

    y_pred = (y_pred_proba > 0.5).astype(int)

    # Metrics on the out-of-fold predictions
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    # Record the (fixed) hyperparameters used for this label
    best_params[label_name] = {
        "hidden_dim": hidden_dim,
        "dropout_rate": dropout_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    # Display results
    print(f"{label_name} Classification Results:")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("Confusion Matrix:")
    print(cm)


# One independent model per label: 'neutral' first, then 'negative'.
for target_tensor, title in (
    (y_neutral_tensor, "Neutral"),
    (y_negative_tensor, "Negative"),
):
    train_and_evaluate(target_tensor, title)

# Report the hyperparameters recorded for each label.
print("Best Parameters for Each Model:")
for label in best_params:
    print(f"{label}: {best_params[label]}")




**2. ROBERTA**

In [None]:
# Load the dataset for the RoBERTa section from an Excel file.
# This rebinds `data`, replacing the frame loaded in the N-gram section.
data = pd.read_excel("/INSERT-DATA-PATH")
# Treat missing labels as 0, then cast both label columns to integers.
data[['neutral', 'negative']] = data[['neutral', 'negative']].fillna(0).astype(int)
# Sanity check: show the distinct values present in each label column.
print(data['neutral'].unique())
print(data['negative'].unique())

[1 0]
[1 0]


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import warnings
import json
from tqdm import tqdm

# Try to silence the "weights not initialized" notice shown when RobertaModel is loaded.
# NOTE(review): this filter only affects messages issued through the `warnings`
# module; confirm that transformers emits this notice that way.
warnings.filterwarnings("ignore", message="Some weights of RobertaModel were not initialized")


# Data Loading and Preparation

# Convert labels to float for BCELoss
y_neutral = data['neutral'].values.astype(float)
y_negative = data['negative'].values.astype(float)

# roberta-base tokenizer; TextDataset.__getitem__ reads it as a module-level name.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

class TextDataset(Dataset):
    """Dataset pairing raw texts with labels, tokenised on access.

    Each item is a dict with ``input_ids``, ``attention_mask``, a float32
    ``labels`` tensor, and the original ``text``. Tokenisation uses the
    module-level ``tokenizer`` with truncation and padding to 128 tokens.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokenizer_kwargs = {
            "truncation": True,
            "padding": 'max_length',
            "max_length": 128,
            "return_tensors": "pt",
        }
        encoded = tokenizer(text, **tokenizer_kwargs)
        item = {
            field: encoded[field].squeeze()
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        item["text"] = text
        return item

# One 80/20 split shared by both tasks. The partition depends only on the row
# count, test_size and random_state, so splitting the texts and both label
# arrays in a single call gives the same rows as two identical-seed calls.
(
    train_texts_neutral, test_texts_neutral,
    train_neutral, test_neutral,
    train_negative, test_negative,
) = train_test_split(
    data["text"], y_neutral, y_negative, test_size=0.2, random_state=42
)
# Both tasks use the same texts; only the labels differ.
train_texts_negative, test_texts_negative = train_texts_neutral, test_texts_neutral

train_dataset_neutral, test_dataset_neutral = (
    TextDataset(texts.tolist(), labels)
    for texts, labels in (
        (train_texts_neutral, train_neutral),
        (test_texts_neutral, test_neutral),
    )
)
train_dataset_negative, test_dataset_negative = (
    TextDataset(texts.tolist(), labels)
    for texts, labels in (
        (train_texts_negative, train_negative),
        (test_texts_negative, test_negative),
    )
)


# Define the Model

class RobertaClassifier(nn.Module):
    """Binary classifier: roberta-base encoder, dropout, one linear unit, sigmoid.

    Args:
        dropout_rate: Dropout probability applied to the encoder's pooled
            output before the linear layer.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(self.roberta.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per example as a 1-D tensor of shape (batch,)."""
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        x = self.dropout(outputs.pooler_output)
        x = self.fc(x)
        # Drop only the trailing unit dimension. A bare squeeze() would also
        # remove the batch dimension for a single-example batch, leaving a
        # 0-dim tensor that no longer matches the (1,) label tensor.
        return self.sigmoid(x).squeeze(-1)


# Hyperparameter Grid

# Grid searched by train_final_model: every combination of the three lists is
# trained from scratch (2 x 2 x 2 = 8 runs per classifier).
param_grid = {
    "dropout_rate": [0.3, 0.4],
    "learning_rate": [2e-5, 3e-5],
    "epochs": [2, 3]
}


# Training Function with Debugging

def train_final_model(train_dataset, test_dataset, model_name, param_grid, global_pbar):
    """Grid-search a RobertaClassifier and save the best checkpoint.

    For every (dropout_rate, learning_rate, epochs) combination in
    ``param_grid`` a fresh model is trained. After each epoch the model is
    scored on ``test_dataset``, and the weights with the lowest mean test
    loss seen so far are kept. The winning weights are written to
    ``{save_dir}/{model_name}_best.pth`` and up to three of its misclassified
    texts to ``{save_dir}/{model_name}_misclassified.json``. ``save_dir`` is
    read from module scope.

    NOTE: the checkpoint is selected on ``test_dataset``, so metrics later
    computed on that same set are optimistically biased.

    Args:
        train_dataset: Dataset yielding dicts with "input_ids",
            "attention_mask", "labels" and "text".
        test_dataset: Dataset of the same form, used for per-epoch scoring.
        model_name: Used in log lines and output file names.
        param_grid: Dict with "dropout_rate", "learning_rate" and "epochs"
            lists.
        global_pbar: Progress bar; ``update(1)`` is called once per training
            batch and ``n`` is read to throttle loss logging.

    Returns:
        Dict with the "dropout_rate" and "learning_rate" of the best run and,
        under "epochs", the number of epochs that had been trained when the
        saved checkpoint was taken.

    Raises:
        RuntimeError: If no epoch produced a finite test loss, so there is no
            checkpoint to save.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"\n🚀 Starting training for {model_name} on {device}")

    best_model = None
    best_loss = float("inf")
    best_params = None
    misclassified_samples = []

    criterion = nn.BCELoss()
    # Evaluation batches are not shuffled, so one loader serves every run.
    test_loader = DataLoader(test_dataset, batch_size=16)

    for dropout_rate in param_grid["dropout_rate"]:
        for learning_rate in param_grid["learning_rate"]:
            for epochs in param_grid["epochs"]:
                print(f"\n🔹 Training {model_name} with dropout={dropout_rate}, lr={learning_rate}, epochs={epochs}")
                model = RobertaClassifier(dropout_rate=dropout_rate).to(device)
                optimizer = optim.Adam(model.parameters(), lr=learning_rate)
                train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

                # Debug: Check if DataLoader is working
                first_batch = next(iter(train_loader))
                print(f"🔍 Sample Batch: Input IDs Shape: {first_batch['input_ids'].shape}, Labels: {first_batch['labels'][:5]}")

                for epoch in range(epochs):
                    model.train()
                    total_loss = 0.0
                    for batch in train_loader:
                        input_ids = batch["input_ids"].to(device)
                        attention_mask = batch["attention_mask"].to(device)
                        labels_batch = batch["labels"].to(device)

                        optimizer.zero_grad()
                        # reshape(-1) keeps the output 1-D even for a final
                        # batch containing a single example.
                        outputs = model(input_ids, attention_mask).reshape(-1)

                        loss = criterion(outputs, labels_batch)
                        loss.backward()
                        optimizer.step()

                        total_loss += loss.item()
                        global_pbar.update(1)

                        # Debug: Print loss
                        if global_pbar.n % 100 == 0:
                            print(f"⚡ Loss: {loss.item():.4f}")

                    avg_train_loss = total_loss / len(train_loader)

                    # Evaluate on test set
                    model.eval()
                    correct, total = 0, 0
                    test_loss_sum = 0.0
                    misclassified = []
                    with torch.no_grad():
                        for batch in test_loader:
                            input_ids = batch["input_ids"].to(device)
                            attention_mask = batch["attention_mask"].to(device)
                            labels_batch = batch["labels"].to(device)

                            outputs = model(input_ids, attention_mask).reshape(-1)
                            preds = (outputs > 0.5).float()

                            # Weight each batch mean by its size so the final
                            # figure is a per-example mean.
                            batch_len = labels_batch.size(0)
                            test_loss_sum += criterion(outputs, labels_batch).item() * batch_len

                            total += batch_len
                            correct += (preds == labels_batch).sum().item()

                            # Log misclassified samples
                            wrong_flags = (preds != labels_batch).cpu().tolist()
                            misclassified.extend(
                                text for text, is_wrong in zip(batch["text"], wrong_flags) if is_wrong
                            )

                    accuracy = correct / total
                    # Real held-out loss, not the training loss sum divided
                    # by the number of test batches.
                    avg_test_loss = test_loss_sum / total
                    print(
                        f"📊 Epoch {epoch + 1}/{epochs}: train_loss={avg_train_loss:.4f}, "
                        f"test_loss={avg_test_loss:.4f}, test_acc={accuracy:.4f}"
                    )

                    # Save best model
                    if avg_test_loss < best_loss:
                        best_loss = avg_test_loss
                        # state_dict() returns references to the live
                        # parameters; snapshot them so later epochs cannot
                        # overwrite the recorded best weights.
                        best_model = {
                            key: value.detach().cpu().clone()
                            for key, value in model.state_dict().items()
                        }
                        best_params = {
                            "dropout_rate": dropout_rate,
                            "learning_rate": learning_rate,
                            # Epochs actually trained at this checkpoint.
                            "epochs": epoch + 1
                        }
                        misclassified_samples = misclassified[:3]

    if best_model is None:
        raise RuntimeError(
            f"No checkpoint was selected for {model_name}: the parameter grid is empty "
            "or every test loss was non-finite."
        )

    # Save best model
    model_save_path = f"{save_dir}/{model_name}_best.pth"
    torch.save(best_model, model_save_path)

    # Save misclassified samples
    misclassified_save_path = f"{save_dir}/{model_name}_misclassified.json"
    with open(misclassified_save_path, "w") as f:
        json.dump(misclassified_samples, f, indent=4)

    return best_params


# Training


# Output directory for checkpoints and misclassified-sample files;
# train_final_model reads this module-level name.
save_dir = "/INSERT-PATH"
os.makedirs(save_dir, exist_ok=True)

# Progress-bar total. train_final_model advances the bar once per training
# batch, uses batches of 16 and keeps the final partial batch, so each dataset
# contributes ceil(len / 16) batches per epoch; -(-n // 16) is integer ceiling
# division. Every (dropout, lr) pair is trained once per entry of the epochs
# list, for sum(epochs) epochs in total.
batches_per_epoch = sum(
    -(-len(dataset) // 16)
    for dataset in (train_dataset_neutral, train_dataset_negative)
)
total_batches = (
    batches_per_epoch
    * sum(param_grid["epochs"]) * len(param_grid["dropout_rate"]) * len(param_grid["learning_rate"])
)

with tqdm(total=total_batches, desc="Overall Training Progress", unit="batch") as global_pbar:
    best_params_neutral = train_final_model(train_dataset_neutral, test_dataset_neutral, "Neutral_Classifier", param_grid, global_pbar)
    best_params_negative = train_final_model(train_dataset_negative, test_dataset_negative, "Negative_Classifier", param_grid, global_pbar)



In [None]:
import torch
import json
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score
from transformers import RobertaTokenizer

# Load the roberta-base tokenizer; TextDataset.__getitem__ below reads it as a
# module-level name.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Load dataset for testing
class TextDataset(Dataset):
    """Texts and labels served as tokenised examples for evaluation.

    Indexing tokenises one text with the module-level ``tokenizer``
    (truncated and padded to 128 tokens) and returns a dict holding
    ``input_ids``, ``attention_mask``, a float32 ``labels`` tensor, and the
    untouched ``text``.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = self.texts[idx]
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        tokens = tokenizer(
            raw_text,
            return_tensors="pt",
            max_length=128,
            padding='max_length',
            truncation=True,
        )
        ids = tokens["input_ids"].squeeze()
        mask = tokens["attention_mask"].squeeze()
        return {"input_ids": ids, "attention_mask": mask, "labels": target, "text": raw_text}

# Rebuild the test datasets with the TextDataset class defined in this cell.
# The test_texts_* / test_* split variables are not created here; the cell that
# performs the train/test split must have been run first.
test_dataset_neutral = TextDataset(test_texts_neutral.tolist(), test_neutral)
test_dataset_negative = TextDataset(test_texts_negative.tolist(), test_negative)

def evaluate_model(model_path, test_dataset, model_name, max_misclassified_shown=5):
    """Load a saved RobertaClassifier and report its metrics on a test set.

    Prints accuracy, precision, recall, F1, ROC AUC and the confusion matrix
    at a 0.5 decision threshold, followed by the number of misclassified
    examples and the first few of them.

    Args:
        model_path: Path to a state dict saved with ``torch.save``.
        test_dataset: Dataset yielding dicts with "input_ids",
            "attention_mask", "labels" and "text".
        model_name: Label used in the printed report.
        max_misclassified_shown: How many misclassified examples to print.

    Returns:
        None. Results are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load best model
    model = RobertaClassifier()
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    test_loader = DataLoader(test_dataset, batch_size=16)
    all_preds, all_labels, all_probs, misclassified = [], [], [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels_batch = batch["labels"].reshape(-1).cpu().numpy()

            outputs = model(input_ids, attention_mask)
            # reshape(-1) keeps the arrays 1-D when the final batch holds a
            # single example; a 0-dim array cannot be iterated or extended.
            probs = outputs.reshape(-1).cpu().numpy()  # Raw probabilities before thresholding
            preds = (probs > 0.5).astype(int)  # Convert to binary predictions

            all_preds.extend(preds)
            all_probs.extend(probs)
            all_labels.extend(labels_batch)

            # Store misclassified samples
            for text, label, pred, prob in zip(batch["text"], labels_batch, preds, probs):
                if pred != label:
                    misclassified.append({
                        "text": text,
                        "true_label": int(label),
                        "predicted_label": int(pred),
                        "predicted_prob": float(prob)  # Include probability score
                    })

    # Convert lists to numpy arrays; labels become ints to match the predictions
    all_preds = np.array(all_preds)
    all_probs = np.array(all_probs).flatten()
    all_labels = np.array(all_labels).astype(int)

    # Compute Metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary")
    conf_matrix = confusion_matrix(all_labels, all_preds)
    # ROC AUC is undefined when only one class is present and sklearn raises in
    # that case; report NaN instead of losing the other metrics.
    if np.unique(all_labels).size < 2:
        roc_auc = float("nan")
    else:
        roc_auc = roc_auc_score(all_labels, all_probs)

    # Print Metrics
    print(f"Final Classification Metrics for {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")

    # Report the misclassified examples that were collected above
    print(f"Misclassified samples: {len(misclassified)}")
    for sample in misclassified[:max_misclassified_shown]:
        print(sample)



# Evaluate each saved classifier on its held-out test set.
# Both calls use the same "/MODEL-PATH" placeholder; point each one at the
# checkpoint that was saved for that classifier.
evaluate_model("/MODEL-PATH", test_dataset_neutral, "Neutral_Classifier")
evaluate_model("/MODEL-PATH", test_dataset_negative, "Negative_Classifier")
