In [1]:
import os

os.chdir("../")

In [2]:
from src.constants import (
    DATA_PATH,
    USE_SAMPLE,
    FREEZE_EMBEDDINGS,
    FREEZE_LAYERS,
    MODEL_NAME,
    MODEL_DIR,
    CHECKPOINT_DIR,
    OUTPUT_DIR,
    SEED,
    N_SPLITS,
    REMOVE_COLS,
    FOLDING,
)
from src.params import (
    MODEL_NAME,MAX_INPUT_TRAIN,MAX_INPUT_VAL
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Importing Libraries
import sys
import shutil

# Data Handling
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Optional, Union
from datasets import Dataset
import matplotlib.pyplot as plt
# Statistics & Mathematics
import scipy.stats as stats
from scipy.stats import shapiro, skew
import math
# RFECV for feature selection
from sklearn.feature_selection import RFECV

# Machine Learning Pipeline & process
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing data
from sklearn.preprocessing import (
    RobustScaler,
    StandardScaler,
    QuantileTransformer,
    FunctionTransformer,
)
from sklearn.compose import ColumnTransformer
# Model Selection for Cross Validation
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
# Machine Learning metrics
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    cohen_kappa_score,
    make_scorer,
)
# ML regressors
from sklearn.linear_model import HuberRegressor, RANSACRegressor, TheilSenRegressor
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    StackingRegressor,
    AdaBoostRegressor,
    RandomForestRegressor,
)
# ML classifiers
from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    AdaBoostClassifier,
    RandomForestClassifier,
)
from sklearn.ensemble import StackingClassifier, VotingClassifier
# Clustering model
from sklearn.cluster import KMeans
# Randomizer
import random
# Encoder of categorical variables
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold
# Importing HuggingFace's Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForMultipleChoice,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
from transformers.tokenization_utils_base import (
    PreTrainedTokenizerBase,
    PaddingStrategy,
)
# PyTorch
import torch
# Hiding warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
from src.utils import preprocess_wrapper, DataCollatorForMultipleChoice,setup_device,create_option_mappings
from src.metrics import compute_metrics, competition_score, predictions_to_map_output

In [5]:
device= setup_device()

GPU is not available


In [6]:
# synthetic data generated and used by severale users in the competition
df = pd.read_csv(os.path.join(DATA_PATH, "train_with_context2.csv"))  # Loading data
# Concatenating original dataframe to extra dataframes
augmented_df = pd.read_csv(os.path.join(DATA_PATH, "all_12_with_context2.csv"))
if "id" in df.columns:
    df = df.drop("id", axis=1)  # Dropping 'Id' columns


In [7]:
augmented_df = augmented_df.loc[~augmented_df.prompt.isin(df.prompt)].dropna()
augmented_df = augmented_df.drop_duplicates(subset="prompt")
augmented_df = augmented_df[
    [
        "prompt",
        "context",
        "A",
        "B",
        "C",
        "D",
        "E",
        "answer",
    ]
]

# Creating training and validation sets
train_df = augmented_df
val_df = df
train_df.head(3)


Unnamed: 0,prompt,context,A,B,C,D,E,answer
0,"In relation to Eunice Fay McKenzie's career, w...","Eunice Fay McKenzie (February 19, 1918 – April...",McKenzie showcased her singing talents in nume...,McKenzie is primarily remembered for her starr...,McKenzie gained recognition for her role as a ...,McKenzie's collaborations with director Blake ...,McKenzie's successful career in sound films co...,B
1,How does Modified Newtonian Dynamics (MOND) im...,The presence of a clustered thick disk-like co...,MOND is a theory that increases the discrepanc...,MOND explains the missing baryonic mass in gal...,MOND is a theory that reduces the observed mis...,MOND is a theory that eliminates the observed ...,MOND's impact on the observed missing baryonic...,E
2,Which of the following statements accurately d...,Woody Hartman is a retired American soccer goa...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,Ray Montgomerie is a former footballer who pla...,B


In [8]:
folding = FOLDING
n_splits = N_SPLITS
if folding:
    # Crear una instancia de KFold
    kfold = StratifiedKFold(n_splits=n_splits, random_state=21, shuffle=True)

    for fold, (train_indice, test_indice) in enumerate(kfold.split(augmented_df, augmented_df.answer)):
        if fold == 3:
            print(fold)

            train_df = augmented_df.iloc[train_indice]
print(train_df.shape)
train_df = train_df.fillna("MASK_NAS")
train_df = train_df.loc[~train_df.prompt.isin(df.prompt)]

print(train_df.loc[train_df.prompt.isin(df.prompt)].shape)
train_df = train_df.drop_duplicates(subset="prompt")
print(train_df.shape)

(40619, 8)
(0, 8)
(40619, 8)


In [9]:
# Converting dataframes into datasets
if USE_SAMPLE:
    train_ds = Dataset.from_pandas(train_df.iloc[:100])
else:
    train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

train_df["all"] = train_df["prompt"] + train_df["A"] + train_df["B"] + train_df["C"] + train_df["D"] + train_df["E"]
option_to_index, index_to_option = create_option_mappings()

print("\nTrain Dataset:\n")
print(train_ds)
print("\nValidation Dataset:\n")
print(val_ds)


Train Dataset:

Dataset({
    features: ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer', '__index_level_0__'],
    num_rows: 100
})

Validation Dataset:

Dataset({
    features: ['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_rows: 200
})


In [10]:
model_name = MODEL_NAME 
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Instantiating model,
model = AutoModelForMultipleChoice.from_pretrained(model_name)
model = model.to(device)  # GPU0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMultipleChoice: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTrainin

In [11]:
for param in model.parameters():
    if not param.requires_grad:
        print(param.requires_grad)
    param.requires_grad = True


In [12]:
## FREZZE EMBEDDINGS TO SPEED UP AND IMPROVE FINETUNNING
if FREEZE_EMBEDDINGS:
    print("Freezing embeddings.")
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
if FREEZE_LAYERS > 0:
    print(f"Freezing {FREEZE_LAYERS} layers.")
    for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
        for param in layer.parameters():
            param.requires_grad = False

Freezing embeddings.
Freezing 18 layers.


In [13]:
model_dir = MODEL_DIR # Directory to save model and files
output_dir = model_dir
checkpoint_dir = CHECKPOINT_DIR
remove_cols=REMOVE_COLS 
# Tokenizing train Dataset
r_cols = [e for e in  remove_cols  if e in train_ds.features ]
tokenized_train_ds = train_ds.map(preprocess_wrapper(tokenizer,  max_input=MAX_INPUT_TRAIN), batched=False, remove_columns=r_cols)


Map: 100%|██████████| 100/100 [00:04<00:00, 20.99 examples/s]


In [None]:
#Model finetunning
r_cols = [e for e in remove_cols if e in val_ds.features ]
tokenized_val_ds = val_ds.map(preprocess_wrapper(tokenizer,max_input=MAX_INPUT_VAL), batched=False, remove_columns=r_cols)

# Defining parameters
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    warmup_ratio=0.8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=5,
    learning_rate=5e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # gradient_accumulation_steps = 2,
    gradient_checkpointing=True,
    num_train_epochs=20,
    # weight_decay=0.01,
    metric_for_best_model="eval_loss",
    report_to="none",
)
# Defining Trainer to train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.01)],
)


print(trainer.evaluate())
trainer.train()

Map: 100%|██████████| 200/200 [00:07<00:00, 27.87 examples/s]


In [None]:
model.save_pretrained(model_dir)
# Save the tokenizer as well
tokenizer.save_pretrained(model_dir)

('finetuned_deberta\\tokenizer_config.json',
 'finetuned_deberta\\special_tokens_map.json',
 'finetuned_deberta\\spm.model',
 'finetuned_deberta\\added_tokens.json',
 'finetuned_deberta\\tokenizer.json')