In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset, checkpoints and
# result files used below all live under this mount point.
drive_root = '/content/drive'
drive.mount(drive_root)

# XLNet fine-tuning for suicide detection (binary text classification)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import AdamW, get_linear_schedule_with_warmup, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, roc_auc_score

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomamunka/Suicide_Detection.csv')

# Convert 'class' to numerical format (suicide -> 1, non-suicide -> 0).
# Series.map() turns every value that is missing from the dict into NaN, which
# would later surface as float/NaN training labels. Fail here with a clear
# message instead of deep inside the training loop.
class_ids = df['class'].map({'suicide': 1, 'non-suicide': 0})
unmapped = class_ids.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'class'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'class' column: {unexpected}")
df['class'] = class_ids.astype('int64')

# Split dataset into texts and labels
texts, labels = df['text'].tolist(), df['class'].tolist()

# Two-stage split into training, validation, and test sets: hold out the test
# set first, then carve the validation set out of what remains.
trainval_texts, test_texts, trainval_labels, test_labels = train_test_split(
    texts, labels, test_size=0.15, random_state=42
)

# 0.1764 ~ 0.15 / (1 - 0.15), i.e. validation is ~15% of the full dataset too
train_texts, val_texts, train_labels, val_labels = train_test_split(
    trainval_texts, trainval_labels, test_size=0.1764, random_state=42
)

# XLNet setup: config, classification model and tokenizer are all loaded from
# the same pretrained checkpoint; the config is created with dropout=0.3.
pretrained_name = "xlnet-base-cased"
config = XLNetConfig.from_pretrained(pretrained_name, dropout=0.3)
model = XLNetForSequenceClassification.from_pretrained(pretrained_name, config=config)
tokenizer = XLNetTokenizer.from_pretrained(pretrained_name)

# Tokenize a list of texts and return the result as torch tensors
def encode_texts(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens and padded by the tokenizer.

    Args:
        texts: list of raw input strings.

    Returns:
        A ``(input_ids, attention_mask)`` tuple of ``torch.Tensor`` objects
        built from the tokenizer output.
    """
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=512)
    return tuple(torch.tensor(encoded[field]) for field in ('input_ids', 'attention_mask'))

# Tokenize each split (train, validation, test - in that order) into an
# (input_ids, attention_mask) tensor pair
(train_seq, train_mask), (val_seq, val_mask), (test_seq, test_mask) = (
    encode_texts(split_texts) for split_texts in (train_texts, val_texts, test_texts)
)

# Matching label tensors for each split
train_y, val_y, test_y = (
    torch.tensor(split_labels) for split_labels in (train_labels, val_labels, test_labels)
)

# Dataloader preparation
batch_size = 8

# Bundle each split so every item is an (input_ids, attention_mask, label) triple
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)
test_data = TensorDataset(test_seq, test_mask, test_y)

# Training uses a random sampler; validation and test use sequential samplers
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# NOTE(review): lr=0.0008 looks very high for fine-tuning a pretrained
# transformer (values around 1e-5..5e-5 are typical) - confirm it is intended.
optimizer = AdamW(model.parameters(), lr=0.0008)

# Upper bound on the number of epochs; early stopping may end training sooner
epochs = 20

# Early-stopping state: stop after `patience` consecutive epochs without a new
# best validation loss; `best_model` holds the weights of the best epoch so far
patience = 5
best_val_loss = float('inf')
no_improvement = 0
best_model = None

# Per-epoch history
training_losses, validation_losses, validation_metrics = [], [], []

# Linear learning-rate schedule without warmup, stepped once per training batch
total_training_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)
# Fine-tuning loop: one training pass and one validation pass per epoch, with
# early stopping on the average validation loss.
for epoch in range(epochs):
    # ---- Training phase ----
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader):
        # Unpack into a loop-local label name so the module-level `labels`
        # list is not overwritten by a batch tensor.
        sent_id, mask, batch_labels = [r.to(device) for r in batch]

        model.zero_grad()

        outputs = model(sent_id, attention_mask=mask, labels=batch_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        scheduler.step()  # the schedule is defined per batch, not per epoch

    avg_loss = total_loss / len(train_dataloader)
    training_losses.append(avg_loss)
    print(f'Epoch {epoch+1}, Training Loss: {avg_loss}')

    # ---- Validation phase ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            sent_id, mask, batch_labels = [t.to(device) for t in batch]
            outputs = model(sent_id, attention_mask=mask, labels=batch_labels)
            total_val_loss += outputs[0].item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    validation_losses.append(avg_val_loss)
    print(f'Epoch {epoch+1}, Validation Loss: {avg_val_loss}')

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improvement = 0
        # state_dict() returns references to the live parameter tensors, so
        # storing it directly would let later epochs silently overwrite the
        # "best" weights. Take an independent CPU copy of every tensor instead.
        best_model = {
            name: tensor.detach().to('cpu', copy=True)
            for name, tensor in model.state_dict().items()
        }
        torch.save(best_model, '/content/drive/MyDrive/Colab Notebooks/Diplomamunka/Models/my_best_model_xlnet.pt')  # Save it
    else:
        no_improvement += 1
        if no_improvement >= patience:
            print(f'Stopping early after {epoch+1} epochs due to no improvement in validation loss.')
            break

# Load the best model
if best_model is None:
    # No epoch ever improved on the initial loss of +inf (e.g. every validation
    # loss was NaN, or epochs == 0), so there is no snapshot to restore.
    raise RuntimeError('No best model was recorded: validation loss never improved during training.')
model.load_state_dict(best_model)

# Run the model over a dataloader and score its class predictions
def evaluate_model(dataloader):
    """Compute classification metrics for the module-level ``model``.

    Each batch is moved to ``device``, the model is run without gradient
    tracking, and the predicted class is the argmax over the model's first
    output.

    Args:
        dataloader: iterable yielding ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, mcc)``; precision, recall
        and F1 are computed with ``average='binary'``.
    """
    model.eval()

    predicted_classes = []
    gold_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention, targets = (tensor.to(device) for tensor in batch)

            batch_logits = model(input_ids, attention_mask=attention)[0]
            batch_logits = batch_logits.detach().cpu().numpy()

            predicted_classes.extend(np.argmax(batch_logits, axis=1))
            gold_labels.extend(targets.to('cpu').numpy())

    accuracy = accuracy_score(gold_labels, predicted_classes)
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold_labels, predicted_classes, average='binary'
    )
    mcc = matthews_corrcoef(gold_labels, predicted_classes)

    return accuracy, precision, recall, f1, mcc

# Compute metrics for the train, validation, and test sets
train_metrics = evaluate_model(train_dataloader)
val_metrics = evaluate_model(val_dataloader)
test_metrics = evaluate_model(test_dataloader)

# One summary line per split, each value formatted to four decimals
for report_template, split_metrics in (
    ("Train Metrics: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}, MCC: {:.4f}", train_metrics),
    ("Validation Metrics: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}, MCC: {:.4f}", val_metrics),
    ("Test Metrics: Accuracy: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}, MCC: {:.4f}", test_metrics),
):
    print(report_template.format(*split_metrics))

# ------------------------------
def get_predictions(dataloader):
    """Collect the model's raw outputs and the true labels for a dataloader.

    Each batch is moved to ``device`` and passed through the module-level
    ``model`` without gradient tracking.

    Args:
        dataloader: iterable yielding ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        ``(predictions, true_labels)``: two lists with one entry per sample.
        Each prediction is a row of the model's first output as a NumPy array;
        each label is a NumPy scalar.
    """
    model.eval()
    all_logits, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention, targets = (tensor.to(device) for tensor in batch)

            batch_logits = model(input_ids, attention_mask=attention)[0]

            # Move results to host memory, one list entry per sample
            all_logits.extend(batch_logits.detach().cpu().numpy())
            all_labels.extend(targets.to('cpu').numpy())

    return all_logits, all_labels

# Get predictions (raw per-class logits) for train, validation, and test sets
train_predictions, train_true_labels = get_predictions(train_dataloader)
val_predictions, val_true_labels = get_predictions(val_dataloader)
test_predictions, test_true_labels = get_predictions(test_dataloader)


def _positive_class_scores(logits):
    """Reduce (n_samples, 2) logits to a 1-D score for the positive class.

    For a binary target roc_auc_score() needs one score per sample; passing the
    two-column logits raises "y should be a 1d array". The margin
    logit[1] - logit[0] is a monotonic function of the softmax probability of
    class 1, so it produces the same ranking and therefore the same ROC-AUC.
    """
    logits = np.asarray(logits)
    return logits[:, 1] - logits[:, 0]


# Calculate ROC-AUC scores for train, validation, and test sets
train_roc_auc = roc_auc_score(train_true_labels, _positive_class_scores(train_predictions))
val_roc_auc = roc_auc_score(val_true_labels, _positive_class_scores(val_predictions))
test_roc_auc = roc_auc_score(test_true_labels, _positive_class_scores(test_predictions))

# Confusion matrices for train, validation, and test sets; the predicted class
# is the index of the largest logit in each row
train_confusion_matrix = confusion_matrix(train_true_labels, np.asarray(train_predictions).argmax(axis=1))
val_confusion_matrix = confusion_matrix(val_true_labels, np.asarray(val_predictions).argmax(axis=1))
test_confusion_matrix = confusion_matrix(test_true_labels, np.asarray(test_predictions).argmax(axis=1))

# Write each confusion matrix to its own CSV file (without the index column)
for matrix, csv_path in (
    (train_confusion_matrix, '/content/drive/MyDrive/Colab Notebooks/Diplomamunka/Results/train_confusion_matrix_xlnet.csv'),
    (val_confusion_matrix, '/content/drive/MyDrive/Colab Notebooks/Diplomamunka/Results/val_confusion_matrix_xlnet.csv'),
    (test_confusion_matrix, '/content/drive/MyDrive/Colab Notebooks/Diplomamunka/Results/test_confusion_matrix_xlnet.csv'),
):
    pd.DataFrame(matrix).to_csv(csv_path, index=False)

# Collect the three ROC-AUC scores in one table and save it as CSV
roc_auc_df = pd.DataFrame({
    'Dataset': ['Train', 'Validation', 'Test'],
    'ROC-AUC': [train_roc_auc, val_roc_auc, test_roc_auc],
})
roc_auc_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Diplomamunka/Results/roc_auc_scores_xlnet.csv', index=False)

import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, drawn on one set of axes
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(training_losses, label='Training Loss')
ax.plot(validation_losses, label='Validation Loss')
ax.set_title('Training & Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()