In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.multiprocessing as mp
from sklearn.metrics import classification_report, accuracy_score, precision_recall_curve, roc_curve, auc, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AdamW, get_scheduler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

In [None]:
def load_data(dataset_path, device):
    """Read a bug-report CSV and build tokenized train/test datasets.

    Rows with any missing value are dropped, the textual ``Severity`` column
    is collapsed to a binary label (trivial/minor/normal -> 0,
    major/critical/blocker -> 1) and the ``Components`` value is prepended to
    ``Summary`` to form the model input text. The data is split 80/20 with a
    fixed seed and tokenized with the ``roberta-base`` tokenizer.

    Args:
        dataset_path: Path of the CSV file. It must provide the columns
            ``Severity``, ``Components`` and ``Summary``.
        device: Torch device the class-weight tensor is moved to.

    Returns:
        Tuple ``(train_dataset, test_dataset, class_weights)``. Both datasets
        are ``TensorDataset`` objects yielding
        ``(input_ids, attention_mask, label)``; ``class_weights`` is the
        tensor ``[1.0, n_non_severe / n_severe]`` computed on the training
        split.

    Raises:
        ZeroDivisionError: If the training split contains no severe (label 1)
            rows.
    """
    severity_to_label = {'trivial': 0, 'minor': 0, 'normal': 0, 'major': 1, 'critical': 1, 'blocker': 1}

    dataset = pd.read_csv(dataset_path).dropna()

    # Series.map turns every severity that is not a key of the mapping into
    # NaN. Such rows must be removed *after* mapping, otherwise the labels
    # become floats containing NaN and cannot be used as class indices.
    mapped = dataset['Severity'].map(severity_to_label)
    known = mapped.notna()
    dataset = dataset.loc[known]

    X = dataset['Components'].astype(str) + ' \n' + dataset['Summary']
    y = mapped[known].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Weight the severe class by the non-severe/severe ratio of the training
    # split. The tensor goes to the device chosen by the caller rather than
    # to whatever the default CUDA device happens to be.
    n_non_severe = int((y_train == 0).sum())
    n_severe = int((y_train == 1).sum())
    class_weights = torch.tensor([1.0, float(n_non_severe) / float(n_severe)]).to(device)

    # Only the tokenizer is needed here; the model itself is created by the caller.
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    tokenizer_kwargs = dict(truncation=True, padding=True, max_length=512, return_tensors='pt', return_attention_mask=True)
    train_encodings = tokenizer(list(X_train), **tokenizer_kwargs)
    test_encodings = tokenizer(list(X_test), **tokenizer_kwargs)

    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train.to_numpy(), dtype=torch.long),
    )
    test_dataset = torch.utils.data.TensorDataset(
        test_encodings['input_ids'],
        test_encodings['attention_mask'],
        torch.tensor(y_test.to_numpy(), dtype=torch.long),
    )

    return train_dataset, test_dataset, class_weights

In [None]:
def visualize_results(true_labels, predictions, training_data_save_path, scores=None):
    """Print classification metrics and show diagnostic plots.

    Args:
        true_labels: Ground-truth binary labels (0 = non-severe, 1 = severe).
        predictions: Predicted binary labels, aligned with ``true_labels``.
        training_data_save_path: Not used by this function; kept so existing
            callers keep working.
        scores: Optional continuous scores for the positive class (for
            example predicted probabilities). When given they are used for
            the precision-recall and ROC curves; when omitted the hard
            ``predictions`` are used as before, which yields curves with only
            a single operating point.
    """
    class_ids = [0, 1]
    class_names = ['non-severe', 'severe']

    # Evaluate the model. Passing ``labels`` explicitly keeps the report
    # working when one of the two classes is absent from the evaluated data;
    # without it the number of observed classes would not match
    # ``target_names`` and scikit-learn would raise.
    accuracy = accuracy_score(true_labels, predictions)
    classification_rep = classification_report(true_labels, predictions, labels=class_ids, target_names=class_names)
    f1 = f1_score(true_labels, predictions, average='weighted')

    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(classification_rep)

    # Confusion matrix, always 2x2 thanks to the explicit label list.
    conf_matrix = confusion_matrix(true_labels, predictions, labels=class_ids)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

    # Class imbalance visualization
    sns.countplot(x=true_labels)
    plt.title('Class Distribution')
    plt.show()

    # Threshold-based curves are only informative with continuous scores.
    curve_scores = predictions if scores is None else scores

    # Precision-recall curve
    precision, recall, _ = precision_recall_curve(true_labels, curve_scores)
    plt.plot(recall, precision, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

    # ROC curve
    fpr, tpr, _ = roc_curve(true_labels, curve_scores)
    plt.plot(fpr, tpr, marker='.')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

In [None]:
def evaluate_and_plot(model, test_dataloader, device, model_save_path, training_data_save_path):
    """Run the model over the test set, persist the results and plot them.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        test_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        device: Torch device the input tensors are moved to before the
            forward pass.
        model_save_path: Path of the saved model. The evaluation results are
            pickled next to it, in a file whose name is the model file name
            prefixed with ``evaluation_results_``.
        training_data_save_path: Forwarded unchanged to ``visualize_results``.
    """
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(test_dataloader, desc='Evaluating', leave=False):
            outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
            predictions.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            # Labels are only collected, so they never need to visit the device.
            true_labels.extend(labels.cpu().numpy())

    # Save the true labels and predictions for later analysis if needed
    evaluation_results = {
        'true_labels': true_labels,
        'predictions': predictions
    }

    # Prefix the file name only. Prefixing the whole path would corrupt the
    # directory part whenever ``model_save_path`` is not a bare file name.
    # For a bare file name the result is the same as before.
    save_dir, save_name = os.path.split(model_save_path)
    results_path = os.path.join(save_dir, f"evaluation_results_{save_name}")
    with open(results_path, 'wb') as eval_file:
        pickle.dump(evaluation_results, eval_file)

    # Visualizations
    visualize_results(true_labels, predictions, training_data_save_path)

In [None]:
def train_model_parallel(model, train_dataset, test_dataset, device, optimizer, model_save_path, training_data_save_path, class_weights, epoch):
    """Fine-tune ``model``, save a checkpoint and the loss history, then evaluate.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        train_dataset: Dataset yielding ``(input_ids, attention_mask, label)``.
        test_dataset: Dataset with the same layout, used for evaluation.
        device: Torch device to train on.
        optimizer: Optimizer already bound to ``model``'s parameters.
        model_save_path: Destination of the pickled checkpoint dictionary.
        training_data_save_path: Destination of the pickled step/loss history.
        class_weights: Per-class weights for the cross-entropy loss, or
            ``None`` for an unweighted loss.
        epoch: Number of training epochs.
    """
    model.to(device)
    train_batch_size = 32
    test_batch_size = 64

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=train_batch_size)
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=test_batch_size)

    epochs = epoch
    total_steps = len(train_dataloader) * epochs

    # Linear schedule with the first 10% of the steps used for warm-up.
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )

    # The weights must live on the same device as the logits.
    loss_weights = class_weights.to(device) if class_weights is not None else None
    loss_fn = torch.nn.CrossEntropyLoss(weight=loss_weights)

    model.train()

    steps = []
    losses = []

    for epoch_idx in range(epochs):
        print(f"Epoch {epoch_idx + 1}/{epochs}")
        progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f'Epoch {epoch_idx + 1}/{epochs}', leave=False)
        for step, batch in progress_bar:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Compute the loss from the logits with the class-weighted
            # criterion. Passing ``labels`` to the model would use its
            # built-in loss, which ignores ``class_weights``.
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            steps.append(len(steps) + 1)
            losses.append(loss.item())

    # Save the trained model state dictionary along with additional
    # information. The set of keys is unchanged so existing readers of the
    # checkpoint keep working.
    model_state = {
        'model': model.state_dict(),
        'test_dataloader': test_dataloader,
        'train_dataloader': train_dataloader,
        'device': device,
        'steps': steps,
        'losses': losses,
        'class_weights': class_weights
    }

    with open(model_save_path, 'wb') as model_file:
        pickle.dump(model_state, model_file)

    training_data = {
        'steps': steps,
        'losses': losses
    }

    with open(training_data_save_path, 'wb') as training_data_file:
        pickle.dump(training_data, training_data_file)

    # Evaluation and plotting
    evaluate_and_plot(model, test_dataloader, device, model_save_path, training_data_save_path)

In [None]:
def main_parallel(dataset_path, model_save_path, training_data_save_path, gpu_id, epoch):
    """Train and evaluate a fresh ``roberta-base`` classifier on one dataset.

    Args:
        dataset_path: CSV file passed to ``load_data``.
        model_save_path: Where the pickled checkpoint is written.
        training_data_save_path: Where the pickled loss history is written.
        gpu_id: Index of the CUDA device to use; ignored when CUDA is not
            available, in which case the CPU is used.
        epoch: Number of training epochs.
    """
    device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    train_dataset, test_dataset, class_weights = load_data(dataset_path, device)

    # Start every dataset from the pretrained weights.
    model = RobertaForSequenceClassification.from_pretrained('roberta-base')

    # Use the PyTorch AdamW: the implementation shipped with transformers is
    # deprecated in favour of it. eps=1e-6 keeps the default of the optimizer
    # that was used here before.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=1e-3)
    train_model_parallel(model, train_dataset, test_dataset, device, optimizer, model_save_path, training_data_save_path, class_weights, epoch)

# def main_parallel(dataset_path, model_save_path, training_data_save_path, gpu_id, epoch):
#     device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
#     train_dataset, test_dataset, class_weights = load_data(dataset_path, device)
    
#     model = RobertaForSequenceClassification.from_pretrained('roberta-base')
#     optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-3)
    
#     best_epoch = train_model_parallel(model, train_dataset, test_dataset, device, optimizer, model_save_path, training_data_save_path, class_weights, epoch)
#     print(f"Best epoch for dataset {dataset_path}: {best_epoch}")


if __name__ == "__main__":
    # Directory holding the per-project CSV files; adjust to the local setup.
    DATA_DIR = "E:/Project Dataset/Final_dataset"

    # One entry per dataset: CSV location, number of epochs, and the short
    # name used in the output file names.
    dataset_paths = [
        {'path': f"{DATA_DIR}/bugzilla_dataset.csv", 'epoch': 15, 'name': 'bugzilla'},
        {'path': f"{DATA_DIR}/Core_dataset.csv", 'epoch': 8, 'name': 'Core'},
        {'path': f"{DATA_DIR}/Firefox_dataset.csv", 'epoch': 8, 'name': 'Firefox'},
        {'path': f"{DATA_DIR}/Thunderbird_dataset.csv", 'epoch': 10, 'name': 'Thunderbird'},
    ]

    # Preferred GPU ids, restricted to the devices that actually exist so a
    # single-GPU machine does not try to select cuda:1. Falls back to id 0,
    # which main_parallel ignores when CUDA is unavailable.
    requested_gpus = [0, 1]
    gpus = [gpu for gpu in requested_gpus if gpu < torch.cuda.device_count()] or [0]

    # Datasets are processed one after another, cycling through the GPU ids.
    for i, dataset_path in enumerate(dataset_paths):
        model_save_path = f"roberta_bug_severity_model_{dataset_path['name']}.pkl"
        training_data_save_path = f"training_data_{dataset_path['name']}.pkl"
        gpu_id = gpus[i % len(gpus)]
        main_parallel(dataset_path['path'], model_save_path, training_data_save_path, gpu_id, dataset_path['epoch'])
