## Importing Libraries
To install all the required libraries, run the following pip command. The same dependencies are also listed in the requirement.txt file.

In [1]:
# pip install pandas scikit-learn tensorflow transformers torch matplotlib seaborn numpy

In [None]:
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.models import load_model
# from sklearn.metrics import precision_score, recall_score, f1_score
# import pandas as pd
# import numpy as np
# import os
# Standard library
import os

# Data handling and preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# TensorFlow and Keras for building models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# PyTorch and Transformers for BERT
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


## Exploration and Preprocessing

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# Load the raw problem dataset that the exploration cells work on.
dataset_path = './Problem_Dataset.csv'
data = pd.read_csv(dataset_path)

# Bar chart with the number of rows for each value of the 'Type' column.
plt.figure(figsize=(8, 4))
sns.countplot(data=data, x='Type').set_title('Distribution of Type')
plt.show()


#### Each type (B3, B4, A3, B1, B2, A1, A2) has roughly the same count. This indicates that the dataset is balanced in terms of the number of samples for each type; there is no class imbalance.

In [None]:
# Text length analysis: number of characters in each observation.
# `.str.len()` is vectorised and yields NaN for a missing observation, whereas
# `apply(len)` raises TypeError as soon as one 'Obs' value is NaN.
data['text_length'] = data['Obs'].str.len()
plt.figure(figsize=(8, 4))
# Histogram of the lengths with a kernel density estimate drawn on top.
sns.histplot(data['text_length'], bins=20, kde=True)
plt.title('Distribution of Text Length in Obs')
plt.show()

#### This is a histogram with a kernel density estimate (KDE) line. It shows a bell-shaped curve with a peak around 120, which tells us that most of the samples are about 120 characters long. This information can be used to define the sequence length.

#### The dataset is well balanced, as it has an equal number of positive instances.

In [None]:
# Parameters shared by the preprocessing and model-building functions below.
MAX_VOCAB_SIZE = 10000  # passed as `num_words` to the Keras Tokenizer in preprocess_data
MAX_SEQUENCE_LENGTH = 100  # passed as `maxlen` to pad_sequences in preprocess_data
EMBEDDING_DIM = 100  # used as `output_dim` of the Embedding layer in the model builders

# Preprocessing the data
def preprocess_data(df):
    """
    Tokenize the 'Obs' text column, pad the sequences and split the result
    into training and validation sets.

    Parameters:
    df : DataFrame holding the text column 'Obs' and the binary label columns A1 to B4.

    Returns:
    X_train : Padded integer sequences used for training.
    X_val : Padded integer sequences used for validation.
    y_train : Training labels (columns A1 to B4).
    y_val : Validation labels (columns A1 to B4).
    tokenizer : The tokenizer fitted on df['Obs'], reusable to transform new text.
    """
    observations = df['Obs']

    # The tokenizer keeps at most MAX_VOCAB_SIZE words (global setting) when
    # converting text to integer ids; it is fitted on the whole 'Obs' column.
    tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
    tokenizer.fit_on_texts(observations)

    # Replace every word by its integer id, then pad so that every row has
    # exactly MAX_SEQUENCE_LENGTH entries.
    X = pad_sequences(
        tokenizer.texts_to_sequences(observations),
        maxlen=MAX_SEQUENCE_LENGTH,
    )

    # One binary column per label.
    label_columns = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'B4']
    labels = df[label_columns]

    # 80/20 split with a fixed seed so the split is reproducible.
    X_train, X_val, y_train, y_val = train_test_split(
        X, labels, test_size=0.2, random_state=42
    )
    return X_train, X_val, y_train, y_val, tokenizer

## Building a LSTM based Model

In [None]:
def build_model(input_length, vocab_size, num_labels):
    """
    Build and compile a stacked-LSTM classifier.

    Parameters:
    input_length : Length of each input sequence (number of token ids per sample).
    vocab_size : Size of the vocabulary, used as the embedding's input dimension.
    num_labels (int): Number of sigmoid output units (one per label).

    Return:
    model : A compiled Sequential model, ready for training.
    """
    layers = [
        # Map every token id to a dense vector of size EMBEDDING_DIM.
        Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=input_length),
        # First LSTM (128 units) returns the full sequence so that the next
        # LSTM layer receives one vector per time step.
        LSTM(128, return_sequences=True),
        # Drop 20% of the activations to reduce overfitting.
        Dropout(0.2),
        # Second LSTM (64 units); return_sequences defaults to False, so only
        # the last output of the sequence is passed on.
        LSTM(64),
        Dropout(0.2),
        # One independent sigmoid probability per label.
        Dense(num_labels, activation='sigmoid'),
    ]
    model = Sequential(layers)
    # Binary cross-entropy loss with the Adam optimizer.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Binary Classification Model

In [None]:
# Train models for each binary label
def train_binary_models(X_train, y_train, X_val, y_val, input_length, vocab_size,model_dir):
    """
    Train one binary classifier per label column and save each of them to disk.

    Parameters:
    X_train (np.ndarray): Padded training sequences.
    y_train (pd.DataFrame): Training labels, one binary column per label.
    X_val (np.ndarray): Padded validation sequences.
    y_val (pd.DataFrame): Validation labels with the same columns as y_train.
    input_length (int): Length of the input sequences.
    vocab_size (int): Size of the vocabulary.
    model_dir (str): Directory in which the trained models are saved.

    Returns:
    models (dict): Maps each label name to its trained model.
    """
    os.makedirs(model_dir, exist_ok=True)

    trained = {}
    for label_name in y_train.columns:
        print(f"Training model for {label_name}...")
        # A single sigmoid output unit: this model only decides one label.
        classifier = build_model(input_length, vocab_size, 1)

        # At most 5 epochs; stop early once the validation loss has not
        # improved for 2 consecutive epochs.
        early_stop = EarlyStopping(patience=2)
        classifier.fit(
            X_train,
            y_train[label_name],
            epochs=5,
            validation_data=(X_val, y_val[label_name]),
            callbacks=[early_stop],
            batch_size=32,
        )

        # Persist the model as <label>_model.h5 and keep it in memory as well.
        save_path = os.path.join(model_dir, f"{label_name}_model.h5")
        classifier.save(save_path)
        trained[label_name] = classifier
    return trained

# Function to load models from the directory
def load_models(model_dir, labels):
    """
    Load the saved per-label binary classifiers from disk.

    Parameters:
    model_dir (str): Directory that contains the saved models.
    labels (list of str): Label names; each model is read from "<label>_model.h5".

    Returns:
    models (dict): Maps each label name to its loaded model.
    """
    return {
        label: load_model(os.path.join(model_dir, f"{label}_model.h5"))
        for label in labels
    }

## Ensemble Model

In [None]:
def predict_ensemble(models, X, threshold=0.5):
    """
    Generate binary predictions by running one binary model per label.

    Parameters:
    models (dict): Maps label names to trained models; column i of the result
        belongs to the i-th entry of this dictionary.
    X (np.ndarray): Input data on which predictions are made.
    threshold (float): Probabilities greater than or equal to this value are
        turned into 1, everything else into 0. Defaults to 0.5.

    Returns:
    final_predictions (np.ndarray): 2D integer array of shape
        (number of samples, number of models) holding 0/1 predictions.
    """
    probabilities = np.zeros((X.shape[0], len(models)))
    for column, model in enumerate(models.values()):
        # squeeze() turns the (n_samples, 1) model output into a 1D vector
        # so that it fits into a single column.
        probabilities[:, column] = np.squeeze(model.predict(X))

    # Binarize the probabilities with the requested threshold.
    return (probabilities >= threshold).astype(int)

# Calculate metrics for each label
def calculate_metrics(final_predictions, y_true, labels):
    """
    Compute precision, recall and F1-score separately for every label.

    Parameters:
    final_predictions (np.ndarray): Predicted binary labels; column i belongs to labels[i].
    y_true (pd.DataFrame): True labels, one column per label name.
    labels (list of str): Label names, in the column order of final_predictions.

    Returns:
    metrics (dict): Maps each label to a dictionary with the keys
        'precision', 'recall' and 'f1_score'.
    """
    def _scores_for(column, label):
        # Compare one prediction column against the matching truth column.
        predicted = final_predictions[:, column]
        actual = y_true[label].values
        return {
            'precision': precision_score(actual, predicted),
            'recall': recall_score(actual, predicted),
            'f1_score': f1_score(actual, predicted),
        }

    return {label: _scores_for(column, label) for column, label in enumerate(labels)}


def save_metrics(metrics, file_path):
    """
    Write a nested metrics dictionary to a CSV file in long format.

    Parameters:
    - metrics (dict): Maps each label to a dictionary of metric names and values.

    - file_path (str): Path of the CSV file to write.
      Example: "output/metrics.csv"

    Returns:
    - None: The metrics are written to `file_path` with the columns
      "Label", "Metric" and "Value"; nothing is returned.
    """
    # One row per (label, metric) pair.
    rows = [
        [label, metric_name, value]
        for label, scores in metrics.items()
        for metric_name, value in scores.items()
    ]
    frame = pd.DataFrame(rows, columns=["Label", "Metric", "Value"])
    frame.to_csv(file_path, index=False)

# Save predictions to a CSV file
def save_predictions(predictions, original_data, labels, file_path):
    """
    Save the ensemble predictions next to the rows they were made for.

    Parameters:
    predictions (np.ndarray): Binary predictions, one row per sample.
    original_data (pd.DataFrame): The rows the predictions belong to, in the
        same order as `predictions`; must contain the columns 'sID' and 'Obs'.
    labels (list of str): Label names used as column names for the predictions.
    file_path (str): Path of the CSV file to write.

    Returns:
    None
    """
    pred_df = pd.DataFrame(predictions, columns=labels)
    # pd.concat(axis=1) aligns on the index. `pred_df` is indexed 0..n-1, so the
    # source rows are re-indexed the same way; otherwise rows taken from a
    # shuffled split are paired with the wrong predictions and padded with NaN.
    source_columns = original_data[['sID', 'Obs']].reset_index(drop=True)
    results_df = pd.concat([source_columns, pred_df], axis=1)
    results_df.to_csv(file_path, index=False)


# Main function to load models, make predictions, and evaluate
def main(model_dir, X_val, y_val, labels, metrics_output_file, predictions_output_file, original_data=None):
    """
    Load the per-label models, predict on the validation data, evaluate and save the results.

    Parameters:
    model_dir (str): Directory in which the trained models are stored.
    X_val (np.ndarray): Validation data on which predictions are made.
    y_val (pd.DataFrame): True validation labels. Its index identifies the
        rows of the original dataset that make up the validation set.
    labels (list of str): Label names.
    metrics_output_file (str): Path of the CSV file for the evaluation metrics.
    predictions_output_file (str): Path of the CSV file for the predictions.
    original_data (pd.DataFrame, optional): The dataset `y_val` was split from
        (must share its index and contain 'sID' and 'Obs'). When omitted, the
        module-level `df` is used.

    Returns:
    final_predictions (np.ndarray): Binary predictions generated by the ensemble.
    metrics (dict): Precision, recall and F1-score for each label.
    """
    if original_data is None:
        # Backward-compatible fallback to the dataset loaded at module level.
        original_data = df

    # Load the models
    models = load_models(model_dir, labels)

    # Predict using ensemble
    final_predictions = predict_ensemble(models, X_val)

    # Calculate precision, recall, and F1-score
    metrics = calculate_metrics(final_predictions, y_val, labels)

    # Save metrics to a file
    save_metrics(metrics, metrics_output_file)

    # The predictions only cover the validation rows, so pick exactly those
    # rows (in validation order) instead of pairing them with the full dataset.
    validation_rows = original_data.loc[y_val.index].reset_index(drop=True)
    save_predictions(final_predictions, validation_rows, labels, predictions_output_file)

    return final_predictions, metrics

# Names of the seven binary target columns.
labels = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'B4']

# Load the dataset and convert it into padded sequences and label frames.
dataset_path = './Problem_Dataset.csv'
df = pd.read_csv(dataset_path)
X_train, X_val, y_train, y_val, tokenizer = preprocess_data(df)

# Locations of the saved models and of the evaluation outputs.
model_dir = './Ensemble/Models'
metrics_output_file = './Ensemble/metrics.csv'
predictions_output_file = './Ensemble/ensemble_predictions.csv'

# Train one binary classifier per label. The vocabulary size is the number of
# words seen by the tokenizer plus one.
binary_models = train_binary_models(
    X_train,
    y_train,
    X_val,
    y_val,
    X_train.shape[1],
    len(tokenizer.word_index) + 1,
    model_dir,
)

# Reload the saved models, evaluate them on the validation split and write
# the metrics and predictions to disk.
final_predictions, ensemble_metrics = main(
    model_dir, X_val, y_val, labels, metrics_output_file, predictions_output_file
)

# Print the per-label scores.
for label, metric in ensemble_metrics.items():
    print(f"{label} metrics:")
    print(f"  Precision: {metric['precision']:.4f}")
    print(f"  Recall:    {metric['recall']:.4f}")
    print(f"  F1-Score:  {metric['f1_score']:.4f}\n")

## Multi-Label Classification Model

In [None]:
def build_multi_label_model(input_length, vocab_size, num_labels):
    """
    Build and compile the stacked-LSTM model that predicts all labels at once.

    Parameters:
    input_length : Length of each input sequence.
    vocab_size : Size of the vocabulary, used as the embedding's input dimension.
    num_labels (int): Number of sigmoid output units, one per label.

    Returns:
    model : A compiled Sequential model.
    """
    embedding = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=input_length)
    # One sigmoid unit per label (7 for the labels A1 to B4).
    output = Dense(num_labels, activation='sigmoid')

    model = Sequential([
        embedding,
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        output,
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_multi_label_model(X_train, y_train, X_val, y_val, input_length, vocab_size, MultiModel_dir):
    """
    Train a single multi-label classifier and save it to disk.

    Parameters:
    X_train (numpy.ndarray): Training input sequences.
    y_train: Training labels; one row per sample and one column per label.
    X_val (numpy.ndarray): Validation input sequences, evaluated after every epoch.
    y_val: Validation labels matching X_val.
    input_length (int): Length of the input sequences.
    vocab_size (int): Size of the vocabulary.
    MultiModel_dir (str): Directory in which the trained model is saved.

    Returns:
    model : The trained Sequential model.
    """
    os.makedirs(MultiModel_dir, exist_ok=True)

    # One output unit per label column.
    model = build_multi_label_model(input_length, vocab_size, y_train.shape[1])

    print("Training multi-label model...")
    # At most 15 epochs; stop once the validation loss has not improved for
    # 2 consecutive epochs.
    early_stop = EarlyStopping(patience=2)
    model.fit(
        X_train,
        y_train,
        epochs=15,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
        batch_size=32,
        verbose=1,
    )

    # Persist the trained model inside the requested directory.
    save_path = os.path.join(MultiModel_dir, "multi_label_model.h5")
    model.save(save_path)
    return model

def predict_multi_label_model(model, X, threshold=0.4):
    """
    Generate binary predictions from a multi-label classification model.

    Parameters:
    model : The trained model used for making predictions.
    X (numpy.ndarray): The input data for which predictions are made.
    threshold (float): Probabilities greater than or equal to this value are
        turned into 1, everything else into 0. Defaults to 0.4, the cut-off
        that was previously hard-coded here.

    Returns:
    final_predictions (numpy.ndarray): Integer array of 0/1 predictions with
        the same shape as the model output.
    """
    # Predict probabilities for each label
    probabilities = model.predict(X)
    # Convert probabilities to binary predictions
    return (probabilities >= threshold).astype(int)

def calculate_multi_label_metrics(final_predictions, y_true):
    """
    Compute precision, recall and F1-score for every label of a multi-label task.

    Parameters:
    final_predictions (numpy.ndarray): Binary predictions; column i belongs to
        the i-th column of y_true.

    y_true (pandas.DataFrame): True labels, one column per label.

    Returns:
    metrics (dict): Maps each label to a dictionary with the keys
        'precision', 'recall' and 'f1_score'.
    """
    per_label = {}
    for position, column_name in enumerate(y_true.columns):
        predicted = final_predictions[:, position]
        expected = y_true[column_name].values
        per_label[column_name] = dict(
            precision=precision_score(expected, predicted),
            recall=recall_score(expected, predicted),
            f1_score=f1_score(expected, predicted),
        )
    return per_label

def save_multi_label_metrics(metrics, file_path):
    """
    Write the multi-label metrics dictionary to a CSV file in long format.

    Parameters:
    - metrics (dict): Maps each label to a dictionary of metric names and values.

    - file_path (str): Path of the CSV file to write.
      Example: "output/metrics.csv"

    Returns:
    - None: The metrics are written to `file_path` with the columns
      "Label", "Metric" and "Value"; nothing is returned.
    """
    # Flatten the nested dictionary into [label, metric, value] records.
    records = []
    for label, scores in metrics.items():
        records.extend([label, name, value] for name, value in scores.items())

    table = pd.DataFrame(records, columns=["Label", "Metric", "Value"])
    table.to_csv(file_path, index=False)

def save_multi_label_predictions(predictions, original_data, labels, file_path):
    """
    Save the multi-label predictions next to the rows they were made for.

    Parameters:
    predictions (numpy.ndarray): 2D array of binary predictions, one row per sample.
    original_data (pandas.DataFrame): The rows the predictions belong to, in the
        same order as `predictions`; must contain the columns 'sID' and 'Obs'.
    labels (list): Label names used as column names for the predictions.
    file_path (str): Path of the CSV file to write.

    Returns:
    - None: The combined table is written to `file_path`; nothing is returned.
    """
    pred_df = pd.DataFrame(predictions, columns=labels)
    # pd.concat(axis=1) aligns on the index. `pred_df` is indexed 0..n-1, so the
    # source rows are re-indexed the same way; otherwise rows taken from a
    # shuffled split are paired with the wrong predictions and padded with NaN.
    source_columns = original_data[['sID', 'Obs']].reset_index(drop=True)
    results_df = pd.concat([source_columns, pred_df], axis=1)
    results_df.to_csv(file_path, index=False)


In [None]:
def Multimain(model_dir, X_train, y_train, X_val, y_val, labels, metrics_output_file, predictions_output_file,
              original_data=None, vocab_size=None):
    """
    Train a multi-label model, evaluate its performance, and save the results.

    Parameters:
    model_dir (str): Directory where the trained model will be saved.
    X_train (numpy.ndarray): The training input data, a 2D array of sequences.
    y_train (pandas.DataFrame): The training labels, one binary column per label.
    X_val (numpy.ndarray): The validation input data.
    y_val (pandas.DataFrame): The validation labels. Its index identifies the
        rows of the original dataset that make up the validation set.
    labels (list): Label names corresponding to the columns of the predictions.
    metrics_output_file (str): Path of the CSV file for the calculated metrics.
    predictions_output_file (str): Path of the CSV file for the predictions.
    original_data (pandas.DataFrame, optional): The dataset `y_val` was split
        from (must share its index and contain 'sID' and 'Obs'). When omitted,
        the module-level `df` is used.
    vocab_size (int, optional): Vocabulary size for the embedding layer. When
        omitted, it is derived from the module-level `tokenizer`.

    Returns:
    final_predictions (numpy.ndarray): A 2D array of binary predictions (0 or 1) for each label.
    metrics (dict): A dictionary containing precision, recall, and F1 score for each label.
    """
    # Backward-compatible fallbacks to the objects created at module level.
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1
    if original_data is None:
        original_data = df

    # Train the model. The model is saved in `model_dir`; previously this
    # parameter was ignored in favour of the global `MultiModel_dir`.
    model = train_multi_label_model(X_train, y_train, X_val, y_val, X_train.shape[1], vocab_size, model_dir)

    # Predict using the trained model
    final_predictions = predict_multi_label_model(model, X_val)

    # Calculate precision, recall, and F1-score
    metrics = calculate_multi_label_metrics(final_predictions, y_val)

    # Save metrics to a file
    save_multi_label_metrics(metrics, metrics_output_file)

    # The predictions only cover the validation rows, so pick exactly those
    # rows (in validation order) instead of pairing them with the full dataset.
    validation_rows = original_data.loc[y_val.index].reset_index(drop=True)
    save_multi_label_predictions(final_predictions, validation_rows, labels, predictions_output_file)

    return final_predictions, metrics

# Names of the seven binary target columns.
labels = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'B4']

# Load the dataset and convert it into padded sequences and label frames.
dataset_path = './Problem_Dataset.csv'
df = pd.read_csv(dataset_path)
X_train, X_val, y_train, y_val, tokenizer = preprocess_data(df)

# Locations of the saved model and of the evaluation outputs.
MultiModel_dir = './MultiModel/Models'
metrics_output_file = './MultiModel/metrics.csv'
predictions_output_file = './MultiModel/multi_predictions.csv'

# Train the multi-label model, evaluate it on the validation split and write
# the metrics and predictions to disk.
final_predictions, multi_label_metrics = Multimain(
    MultiModel_dir,
    X_train,
    y_train,
    X_val,
    y_val,
    labels,
    metrics_output_file,
    predictions_output_file,
)

# Print the per-label scores.
for label, metric in multi_label_metrics.items():
    print(f"{label} metrics:")
    print(f"  Precision: {metric['precision']:.4f}")
    print(f"  Recall:    {metric['recall']:.4f}")
    print(f"  F1-Score:  {metric['f1_score']:.4f}\n")



In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

def plot_metrics_line_chart(file_path):
    """
    Plot the metrics stored in a long-format CSV file, one line per label.

    Parameters:
    file_path (str): Path of a CSV file with the columns 'Label', 'Metric' and 'Value'.

    Returns:
    None: The chart is displayed with plt.show().
    """
    metrics_frame = pd.read_csv(file_path)

    # Figure size and seaborn style.
    plt.figure(figsize=(12, 8))
    sns.set(style="whitegrid")

    # One line per label, in order of first appearance in the file.
    for label_name in metrics_frame['Label'].unique():
        rows = metrics_frame[metrics_frame['Label'] == label_name]
        plt.plot(rows['Metric'], rows['Value'], marker='o', label=label_name)

    # Title, axis labels and legend.
    plt.title('Metrics Comparison Across Labels', fontsize=16)
    plt.xlabel('Metric', fontsize=14)
    plt.ylabel('Value', fontsize=14)
    plt.legend(title='Label')

    plt.show()


# Example usage: plot the metrics CSV written by the ensemble evaluation cell.
file_path = "Ensemble/metrics.csv"  # Update this path to your actual CSV file path
plot_metrics_line_chart(file_path)


## BERT Transformer

In [None]:
# import torch
# from torch.utils.data import Dataset, DataLoader, random_split
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# import pandas as pd
# from sklearn.model_selection import train_test_split

# Load the dataset
file_path = './Problem_Dataset.csv'
df = pd.read_csv(file_path)

# Preprocessing: the raw observation texts and the seven label columns as arrays.
# NOTE: `labels` is rebound here from the list of label names used by the
# earlier cells to a 2D array of label values.
sentences = df['Obs'].values
labels = df[['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'B4']].values

# Split data into training and validation sets (80/20 split with a fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)


#### Here, a custom dataset is to be made which inherits from torch.utils.data.Dataset, which is an abstract class that PyTorch uses to load data. By creating a custom dataset, we can ensure that data is structured and processed in a way that the model can work with.

In [None]:
# Define a custom dataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label vector."""

    def __init__(self, texts, labels, tokenizer, max_len):
        # Keep references to the raw texts, their labels, the tokenizer and the
        # fixed sequence length used for padding/truncation.
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per text.
        return len(self.texts)

    def __getitem__(self, idx):
        # Look up the text and label stored at this position.
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenizer settings: add the special tokens, pad/truncate to max_len,
        # return an attention mask and PyTorch tensors.
        options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,  # Not needed for classification
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoding = self.tokenizer.encode_plus(text, **options)

        # Flatten the tokenizer tensors to 1D and convert the label to a
        # float tensor.
        input_ids = torch.flatten(encoding['input_ids'])
        attention_mask = torch.flatten(encoding['attention_mask'])
        return {
            'text': text,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.float),
        }

In [None]:
# Initialize the BERT tokenizer using the 'bert-base-uncased' model
# The 'bert-base-uncased' tokenizer converts the input text into tokens compatible with the BERT model.
# NOTE: this rebinds the module-level name `tokenizer`, which the earlier cells
# use for the Keras tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128  # every text is padded/truncated to this many tokens by CustomDataset
batch_size = 16

# Create data loaders for training and validation
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_len)

# Training batches are shuffled; validation order is kept so that predictions
# stay aligned with `val_texts`.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Load the BERT model for sequence classification (one output logit per label)
# NOTE(review): the labels are float multi-hot vectors; transformers is expected
# to pick a multi-label loss for them, but `problem_type` is not set explicitly -
# confirm against the installed transformers version.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)

# Define the device: Use GPU if available, otherwise use CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the appropriate device
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Training loop
epochs = 5

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move the batch tensors to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss together with the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagate and update the weights.
        loss.backward()
        optimizer.step()

    # Mean training loss over all batches of this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}')

# Validation
model.eval()
val_predictions, val_true_labels = [], []

# No gradients are needed while evaluating.
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Collect the raw logits and the true labels of every batch.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        val_predictions.append(logits)
        val_true_labels.append(labels)

# Convert predictions to binary labels (sigmoid probability above 0.5 -> 1)
predicted_labels = (torch.sigmoid(torch.cat(val_predictions)) > 0.5).int()
true_labels = torch.cat(val_true_labels).int()

# Calculate accuracy, precision, recall, f1-score on the validation set
# (precision, recall and F1 use average='weighted' across the labels)
accuracy = accuracy_score(true_labels.cpu(), predicted_labels.cpu())
precision, recall, f1, _ = precision_recall_fscore_support(true_labels.cpu(), predicted_labels.cpu(), average='weighted')

print(f'Validation Accuracy: {accuracy}')
print(f'Validation Precision: {precision}')
print(f'Validation Recall: {recall}')
print(f'Validation F1-Score: {f1}')

In [None]:
# Per-label precision, recall and F1 on the validation set, in long format.
metrics_data = []
label_names = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'B4']

for i, label_name in enumerate(label_names):
    # Score column i of the predictions against column i of the true labels.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels.cpu()[:, i],
        predicted_labels.cpu()[:, i],
        average='binary',
    )
    metrics_data.extend([
        [label_name, 'precision', precision],
        [label_name, 'recall', recall],
        [label_name, 'f1_score', f1],
    ])

# Write the collected metrics to a CSV file.
metrics_df = pd.DataFrame(metrics_data, columns=['Label', 'Metric', 'Value'])
metrics_df.to_csv('label_metrics.csv', index=False)

# Write the binary predictions to a CSV file, with the original observation
# text as the first column.
predictions_df = pd.DataFrame(predicted_labels.cpu().numpy(), columns=label_names)
predictions_df.insert(0, 'Obs', val_texts)
predictions_df.to_csv('predictions.csv', index=False)

In [None]:
# Step 1: Save only the model's state dictionary (the learned weights).
model_save_path = 'BERT/model_state_dict.pth'
# torch.save does not create missing directories, so make sure 'BERT/' exists.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(model.state_dict(), model_save_path)
print(f'Model state dictionary saved to {model_save_path}')

# Step 2 (Optional): Save the entire model
# NOTE: this pickles the whole model object; only load such a file from a
# trusted source.
full_model_save_path = 'BERT/full_model.pth'
os.makedirs(os.path.dirname(full_model_save_path), exist_ok=True)
torch.save(model, full_model_save_path)
print(f'Entire model saved to {full_model_save_path}')