# **MODEL TRAINING**

In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install numpy matplotlib pandas scikit-learn seaborn tqdm torch transformers



In [4]:
import numpy as np
import matplotlib.pyplot as plt

import os
import gc
import pandas as pd
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns


import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AdamW, BertForSequenceClassification, BertTokenizer,  DistilBertTokenizer, DistilBertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from sklearn.metrics import mean_absolute_error, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from torch.nn.functional import softmax

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score

  from .autonotebook import tqdm as notebook_tqdm


# *functions for calculating metrics*

In [5]:
def metrics(model, dataloader, device=None):
    """Print weighted F1, accuracy, precision and recall of ``model`` on ``dataloader``.

    Args:
        model: Classification model; it is called as
            ``model(input_ids, attention_mask=...)`` and its output must expose
            ``.logits``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches.
        device: Optional device the inputs and masks are moved to before the
            forward pass. With the default ``None`` the batches are used as-is,
            so they must already live on the same device as ``model``.

    Returns:
        None. The four metrics are printed.
    """
    model.eval()  # Set the model to evaluation mode

    true_labels = []
    predictions = []

    with torch.no_grad():
        for batch_data, batch_masks, batch_labels in dataloader:
            if device is not None:
                batch_data = batch_data.to(device)
                batch_masks = batch_masks.to(device)

            outputs = model(batch_data, attention_mask=batch_masks)
            # The predicted class is the arg-max of the logits; a softmax is
            # monotonic and therefore not needed to pick it.
            pred_labels = torch.argmax(outputs.logits, dim=1)

            true_labels.extend(batch_labels.cpu().numpy())
            predictions.extend(pred_labels.cpu().numpy())

    # Convert lists to numpy arrays for use with scikit-learn metrics
    true_labels = np.array(true_labels)
    predictions = np.array(predictions)

    # 'weighted' averages the per-class scores by class support
    f1 = f1_score(true_labels, predictions, average='weighted')
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted')
    recall = recall_score(true_labels, predictions, average='weighted')

    # Print the metrics
    print(f'F1 Score: {f1}')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')

# *functions for model training and plotting metrics*

In [6]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Every batch is moved to ``device``, the loss returned by the model is
    back-propagated and the optimizer takes one step.

    Returns:
        Tuple ``(mean batch loss, weighted precision)`` where the precision is
        computed from the predictions made during this pass.
    """
    model.train()
    running_loss = 0
    epoch_preds, epoch_targets = [], []

    for input_ids, attn_mask, targets in tqdm(dataloader):
        result = model(
            input_ids.to(device),
            attention_mask=attn_mask.to(device),
            labels=targets.to(device),
        )
        batch_loss = result.loss
        running_loss += batch_loss.item()

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_preds = result.logits.argmax(dim=1)
        epoch_preds.extend(batch_preds.cpu().numpy())
        epoch_targets.extend(targets.cpu().numpy())

    weighted_precision = precision_score(epoch_targets, epoch_preds, average='weighted')
    mean_loss = running_loss / len(dataloader)
    return mean_loss, weighted_precision

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Returns:
        Tuple ``(mean batch loss, weighted precision)`` over all batches.
    """
    model.eval()
    running_loss = 0
    seen_preds, seen_targets = [], []

    with torch.no_grad():
        for input_ids, attn_mask, targets in tqdm(dataloader):
            result = model(
                input_ids.to(device),
                attention_mask=attn_mask.to(device),
                labels=targets.to(device),
            )
            running_loss += result.loss.item()

            batch_preds = result.logits.argmax(dim=1)
            seen_preds.extend(batch_preds.cpu().numpy())
            seen_targets.extend(targets.cpu().numpy())

    weighted_precision = precision_score(seen_targets, seen_preds, average='weighted')
    mean_loss = running_loss / len(dataloader)
    return mean_loss, weighted_precision

def plot_metrics(epochs, train_loss_values, val_loss_values, train_precision_values, val_precision_values):
    """Draw loss and precision curves (train in blue, validation in red) side by side.

    Args:
        epochs: x-axis values shared by both panels.
        train_loss_values, val_loss_values: Series for the left panel.
        train_precision_values, val_precision_values: Series for the right panel.
    """
    fig, (loss_ax, precision_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: loss
    loss_ax.plot(epochs, train_loss_values, 'b-', label='Train Loss')
    loss_ax.plot(epochs, val_loss_values, 'r-', label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Train vs Validation Loss')
    loss_ax.legend()

    # Right panel: precision
    precision_ax.plot(epochs, train_precision_values, 'b-', label='Train Precision')
    precision_ax.plot(epochs, val_precision_values, 'r-', label='Validation Precision')
    precision_ax.set_xlabel('Epoch')
    precision_ax.set_ylabel('Precision')
    precision_ax.set_title('Train vs Validation Precision')
    precision_ax.legend()

    fig.tight_layout()
    plt.show()



# *functions for data separation*

In [7]:
def data_separation(data_for_separate, parametr):
    """Split a frame into a training part and a held-out "final" part.

    Rows are grouped by ``parametr`` and, inside each group, by the ``name``
    column. From every inner group with more than one row a 10% sample is
    drawn with replacement (``random_state=42``); an inner group holding a
    single row is taken whole. The index labels collected this way form the
    held-out set and every other row goes to the training set. The shapes of
    both parts are printed.

    Args:
        data_for_separate: Frame to split; it must contain a ``name`` column
            and whatever ``parametr`` refers to.
        parametr: Key handed to the outer ``groupby``.

    Returns:
        Tuple ``(train, final)``.
    """
    def final_indexes(df, param):
        # Take the innermost index level of the nested groupby/apply result --
        # presumably the original row labels of ``df``; verify against the
        # pandas version in use.
        # NOTE(review): with frac=0.1, small groups may yield zero sampled
        # rows -- confirm that this is intended.
        result_indexes = df.groupby(param).apply(
            lambda x: x.groupby('name').apply(
                lambda y: y.sample(frac=0.1, replace=True, random_state=42) if len(y) > 1 else y
            )
        ).index.get_level_values(-1)
        return result_indexes

    # A set collapses labels drawn more than once (sampling uses replace=True).
    final_indexes_set = set(final_indexes(data_for_separate, parametr))

    # True for rows whose index label was not selected for the held-out part.
    train_mask = ~data_for_separate.index.isin(final_indexes_set)

    train = data_for_separate[train_mask]

    # Row order of ``final`` follows the iteration order of the set, not the
    # order of the input frame.
    final = data_for_separate.loc[list(final_indexes_set)]

    print(train.shape)
    print(final.shape)
    return train, final

    

# *functions for getting dataloaders*

In [8]:
def ids_to_tokens(ids, tokenizer):
    """Translate token ids back into tokens via ``tokenizer.convert_ids_to_tokens``."""
    return tokenizer.convert_ids_to_tokens(ids)

def create_attention_masks(input_ids, pad_token_id=0):
    """Build attention masks: 1.0 for real tokens, 0.0 for padding.

    Args:
        input_ids: Iterable of token-id sequences.
        pad_token_id: Id used for padding. The default ``0`` matches the
            BERT / DistilBERT vocabularies. RoBERTa uses ``1`` for ``<pad>``
            and ``0`` for ``<s>``, so pass ``pad_token_id=1`` for sequences
            padded with RoBERTa's pad token.

    Returns:
        List of lists of floats with the same nesting as ``input_ids``.
    """
    # Token ids are non-negative, so with the default pad id this equals ``i > 0``.
    return [[float(token_id != pad_token_id) for token_id in seq] for seq in tqdm(input_ids)]

In [9]:
def get_dataloders(df, PARAMETR_FOR_PREPARE, device, pad_token_id=0, batch_size=16):
    """Build train / validation / test dataloaders from a frame of tokenized chunks.

    The ``text_chunk`` column holds the token-id sequences as text; they are
    parsed back into Python objects. 90% of the rows become the training set
    and the remaining 10% are halved into validation and test sets (all splits
    stratified on the label, ``random_state=42``). All tensors are moved to
    ``device`` up front, so the batches yielded by the loaders are already on
    that device.

    Args:
        df: Frame with a ``text_chunk`` column and the label column.
        PARAMETR_FOR_PREPARE: Name of the label column. The labels are turned
            into ``torch.long`` tensors directly, so they must already be
            integer-coded.
        device: Device the tensors are moved to.
        pad_token_id: Id treated as padding when building attention masks.
            The default ``0`` fits BERT / DistilBERT; RoBERTa's ``<pad>`` id is 1.
        batch_size: Batch size of all three loaders.

    Returns:
        Tuple ``(train_dataloader, validation_dataloader, test_dataloader)``.
        The training loader samples randomly, the other two sequentially.
    """
    import ast  # local import keeps this cell self-contained

    # literal_eval accepts only Python literals, whereas eval() would execute
    # arbitrary code found in the CSV.
    # NOTE(review): assumes the cells are plain list literals such as "[101, 2023, ...]".
    X = [ast.literal_eval(x) for x in tqdm(df['text_chunk'].tolist())]  # Features (tokenized chunks)
    y = df[PARAMETR_FOR_PREPARE].values  # Labels

    # Separating the train set from the dataset. Dividing the remainder into test and validation
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

    def _to_dataset(token_ids, labels):
        """Turn one split into a TensorDataset of (ids, mask, labels) on ``device``."""
        inputs = torch.tensor(token_ids)
        # 1.0 for real tokens, 0.0 for padding positions
        masks = (inputs != pad_token_id).float()
        targets = torch.tensor(labels, dtype=torch.long)
        return TensorDataset(inputs.to(device), masks.to(device), targets.to(device))

    train_dataset = _to_dataset(X_train, y_train)
    val_dataset = _to_dataset(X_val, y_val)
    test_dataset = _to_dataset(X_test, y_test)

    # Creating a dataloader
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

    return train_dataloader, validation_dataloader, test_dataloader

# *functions for training model*

In [10]:
def train(model, optimizer, device, train_dataloader, validation_dataloader, num_epoch, PATH_to_save, patience=2):
    """Train ``model`` with per-epoch checkpoints and early stopping.

    After every epoch the loss / precision curves are re-plotted and the model
    weights are saved to ``f'{PATH_to_save}_{epoch}.pth'``, where ``epoch`` is
    the zero-based epoch index (the printed epoch numbers are one-based).
    Training stops early once the validation precision has failed to improve
    for ``patience`` consecutive epochs.

    Args:
        model: Model to train.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device forwarded to ``train_epoch`` and ``evaluate``.
        train_dataloader: Batches used for optimisation.
        validation_dataloader: Batches used for validation.
        num_epoch: Maximum number of epochs.
        PATH_to_save: Prefix of the checkpoint file names.
        patience: Number of consecutive non-improving epochs tolerated before
            stopping (default 2).
    """
    train_loss_values, val_loss_values = [], []
    train_precision_values, val_precision_values = [], []
    best_val_precision = float('-inf')
    # Initialised up front: otherwise the counter does not exist yet when the
    # first validation precision fails the comparison below (e.g. it is NaN).
    epochs_without_improvement = 0

    for epoch in range(num_epoch):
        print(epoch + 1)
        train_loss, train_precision = train_epoch(model, train_dataloader, optimizer, device)
        val_loss, val_precision = evaluate(model, validation_dataloader, device)

        train_loss_values.append(train_loss)
        val_loss_values.append(val_loss)
        train_precision_values.append(train_precision)
        val_precision_values.append(val_precision)

        plot_metrics(range(1, epoch + 2), train_loss_values, val_loss_values, train_precision_values, val_precision_values)
        print(f'Epoch {epoch + 1}/{num_epoch}, Train Precision: {train_precision}, Validation Precision: {val_precision}')

        torch.save(model.state_dict(), f'{PATH_to_save}_{epoch}.pth')

        if val_precision > best_val_precision:
            best_val_precision = val_precision
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print(f"Validation performance did not improve for {patience} consecutive epochs. Stopping training.")
            break

    


# *functions for creating the model*

In [11]:
def model_create(model_name, tokenizer_name, classifire_name, parametr, df, device):
    """Instantiate tokenizer, classification model and optimizer for one checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained`` calls.
        tokenizer_name: Tokenizer class providing ``from_pretrained``.
        classifire_name: Model class providing ``from_pretrained``.
        parametr: Name of the label column in ``df``.
        df: Data whose ``parametr`` column determines the number of output labels.
        device: Device the model is moved to.

    Returns:
        Tuple ``(model, tokenizer, optimizer)``.
    """
    tokenizer = tokenizer_name.from_pretrained(model_name)
    model = classifire_name.from_pretrained(
        model_name,
        num_labels=len(set(df[parametr])),  # one output per distinct label value
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)
    # transformers.AdamW is deprecated in favour of the PyTorch implementation.
    # weight_decay=0.0 reproduces the default of the transformers class
    # (torch.optim.AdamW defaults to 0.01).
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
    return model, tokenizer, optimizer
    

## **MODEL TRAINING**

In [12]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs end to end.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model names

In [13]:
# Checkpoint identifiers handed to ``from_pretrained`` for each backbone.
distilbert_name = "distilbert-base-uncased"
bert_name = "bert-large-uncased"
roberta_name = "roberta-base"

classifier names

In [14]:
# Sequence-classification model classes, listed in the same order as the
# checkpoint names (DistilBERT, BERT, RoBERTa).
distilbert_classifire = DistilBertForSequenceClassification
bert_classifire = BertForSequenceClassification
roberta_classifire = RobertaForSequenceClassification


tokenizers

In [15]:
# Tokenizer classes, listed in the same order as the checkpoint names
# (DistilBERT, BERT, RoBERTa).
distilbert_tokenizer = DistilBertTokenizer
bert_tokenizer = BertTokenizer
roberta_tokenizer = RobertaTokenizer

# 1) 2728

In [17]:
# Locations of the pre-tokenized CSV files, one per tokenizer.
PATH_distilbert = 'D://DL_coursework//task1//2728//distilbert_2728.csv'
PATH_bert = 'D://DL_coursework//task1//2728//bert_2728.csv'
PATH_roberta = 'D://DL_coursework//task1//2728//roberta_2728.csv'
# PATH = '/kaggle/input/1task-2728-tokenized/distilbert.csv'

# Read each file and discard every row that contains a missing value.
distilbert_ids = pd.read_csv(PATH_distilbert).dropna()
bert_ids = pd.read_csv(PATH_bert).dropna()
roberta_ids = pd.read_csv(PATH_roberta).dropna()

# Report the remaining (rows, columns) of each frame.
for tokenized_frame in (distilbert_ids, bert_ids, roberta_ids):
    print(tokenized_frame.shape)

(428716, 5)
(428716, 5)
(463530, 5)


# fact

In [18]:
# Label column used for the runs in this section.
PARAMETR_FOR_PREPARE = "fact"

# distilbert

In [None]:
# Hold out the "final" subset; the remainder is used for training.
distilbert_ids_train, distilbert_ids_final = data_separation(
    distilbert_ids,
    PARAMETR_FOR_PREPARE,
)

In [None]:
# Build the train / validation / test loaders for DistilBERT.
train_dataloader, validation_dataloader, test_dataloader = get_dataloders(
    distilbert_ids_train,
    PARAMETR_FOR_PREPARE,
    device,
)

In [None]:
# Instantiate DistilBERT with a classification head sized to the label set.
model, tokenizer, optimizer = model_create(
    distilbert_name,
    distilbert_tokenizer,
    distilbert_classifire,
    PARAMETR_FOR_PREPARE,
    distilbert_ids_train,
    device,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
# Checkpoint prefix; train() appends '_<epoch>.pth'. A model-specific prefix
# keeps the three runs from overwriting each other's files.
PATHTOSAVE = f'distilbert_{PARAMETR_FOR_PREPARE}'

In [None]:
# Fine-tune DistilBERT for at most 7 epochs.
train(
    model,
    optimizer,
    device,
    train_dataloader,
    validation_dataloader,
    num_epoch=7,
    PATH_to_save=PATHTOSAVE,
)

# bert

In [None]:
# Hold out the "final" subset; the remainder is used for training.
bert_ids_train, bert_ids_final = data_separation(
    bert_ids,
    PARAMETR_FOR_PREPARE,
)

In [None]:
# Build the train / validation / test loaders for BERT.
train_dataloader, validation_dataloader, test_dataloader = get_dataloders(
    bert_ids_train,
    PARAMETR_FOR_PREPARE,
    device,
)

In [None]:
# Instantiate BERT with a classification head sized to the label set.
model, tokenizer, optimizer = model_create(
    bert_name,
    bert_tokenizer,
    bert_classifire,
    PARAMETR_FOR_PREPARE,
    bert_ids_train,
    device,
)

In [None]:
# Checkpoint prefix; train() appends '_<epoch>.pth'. A model-specific prefix
# keeps the three runs from overwriting each other's files.
PATHTOSAVE = f'bert_{PARAMETR_FOR_PREPARE}'

In [None]:
# Fine-tune BERT for at most 7 epochs.
train(
    model,
    optimizer,
    device,
    train_dataloader,
    validation_dataloader,
    num_epoch=7,
    PATH_to_save=PATHTOSAVE,
)

# roberta

In [None]:
# data_separation returns exactly two frames: (train, final).
roberta_ids_train, roberta_ids_final = data_separation(roberta_ids, PARAMETR_FOR_PREPARE)

In [None]:
# Build the train / validation / test loaders for RoBERTa.
train_dataloader, validation_dataloader, test_dataloader = get_dataloders(
    roberta_ids_train,
    PARAMETR_FOR_PREPARE,
    device,
)

In [None]:
# Instantiate RoBERTa with a classification head sized to the label set.
model, tokenizer, optimizer = model_create(
    roberta_name,
    roberta_tokenizer,
    roberta_classifire,
    PARAMETR_FOR_PREPARE,
    roberta_ids_train,
    device,
)

In [None]:
# Checkpoint prefix; train() appends '_<epoch>.pth'. A model-specific prefix
# keeps the three runs from overwriting each other's files.
PATHTOSAVE = f'roberta_{PARAMETR_FOR_PREPARE}'

In [None]:
# Fine-tune RoBERTa for at most 7 epochs.
train(
    model,
    optimizer,
    device,
    train_dataloader,
    validation_dataloader,
    num_epoch=7,
    PATH_to_save=PATHTOSAVE,
)