In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl
%pip install -U datasets

In [None]:
#Importing the required Libraries

#Analysis and data creation
import numpy as np
import pandas as pd
import re
import random

#Modelling
import torch
import torch.nn.functional as F

from transformers import (
    MistralForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding)

from tqdm import tqdm

#Quantization
from datasets import Dataset, DatasetDict,load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model


#KPM Evaluation
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, matthews_corrcoef
import seaborn as sns
import matplotlib.pyplot as plt
import json
import os

2024-09-02 13:26:41.075674: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-02 13:26:41.075788: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-02 13:26:41.211892: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Read API credentials from the Kaggle secrets store instead of hardcoding them in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# Hugging Face access token, passed to `huggingface-cli login` in the next cell.
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
# NOTE(review): secret_wandb is not referenced in the visible cells (TrainingArguments uses report_to='none').
secret_wandb = user_secrets.get_secret("wandb")

In [4]:
!huggingface-cli login --token $secret_hf


  pid, fd = os.forkpty()


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
model_str = "ilsp/Meltemi-7B-v1"

In [None]:
# Load the Greek ArgKP-2021 dataset from the Hugging Face Hub.
# `token=True` reuses the token stored by `huggingface-cli login`; it replaces the
# deprecated `use_auth_token` argument, which recent `datasets` releases no longer accept.
dataset = load_dataset("Kleo/ArgKP_2021_GR", token=True)
dataset

In [7]:
# Seed every random number generator used in this notebook for reproducibility
def set_seed(seed_value):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed_value`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

In [None]:
# Patterns are compiled once here instead of on every clean_text() call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U000024C2-\U00002BFF"  # enclosed alphanumerics, shapes, misc symbols, dingbats, arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors that trail emoji
    "\U0001F000-\U0001F251"  # mahjong/domino/card tiles, enclosed supplements
    "]+"
)


def clean_text(text):
    """Strip URLs, HTML tags and emoji/pictographic symbols from `text`.

    The previous emoji class contained the single range U+24C2..U+1F251, which
    also covers every CJK, Hangul and other script block in between and so
    deleted ordinary text. The ranges above are limited to symbol blocks, so
    letters of any script are preserved.

    Args:
        text: input string.

    Returns:
        The cleaned string. Surrounding whitespace is left untouched.
    """
    text = _URL_PATTERN.sub('', text)
    text = _HTML_TAG_PATTERN.sub('', text)
    return _EMOJI_PATTERN.sub('', text)

def clean_dataframe(df, columns_to_clean):
    """Drop incomplete and duplicate rows, then clean the given text columns.

    The input frame is left untouched: all work happens on a copy, so the cell
    calling this function can be re-run safely and a frame that is a view of
    another frame does not trigger chained-assignment problems.

    Args:
        df: DataFrame to clean.
        columns_to_clean: names of the columns that are cast to `str` and
            passed through `clean_text`.

    Returns:
        A tuple `(number_of_rows, cleaned_dataframe)`. The cleaned frame keeps
        the original index labels of the surviving rows (no reset).
    """
    # Remove rows with NaN values, then exact duplicate rows; copy() makes the
    # result an independent frame before columns are reassigned below.
    cleaned = df.dropna().drop_duplicates().copy()

    # Apply text cleaning to specified columns
    for column in columns_to_clean:
        cleaned[column] = cleaned[column].astype(str).apply(clean_text)

    return cleaned.shape[0], cleaned


# Text columns that are passed through clean_text (the assignment was previously duplicated)
columns_to_clean = ['arg', 'key_point', 'topic']


In [None]:
# Convert train and validation splits to Pandas DataFrames
train_df = dataset['train'].to_pandas()
dev_df = dataset['validation'].to_pandas()

#A function that reads, cleans and preprocesses the train, dev and test data

In [None]:
# Clean the train and validation dataframes
num_train_rows, train_df = clean_dataframe(train_df, columns_to_clean)
print(f'Number of rows in cleaned train dataset: {num_train_rows}')

num_dev_rows, dev_df = clean_dataframe(dev_df, columns_to_clean)
print(f'Number of rows in cleaned validation dataset: {num_dev_rows}')

#num_test_rows, test_df = clean_csv(test_file_path, columns_to_clean)
#print(f'Number of rows after cleaning: {num_test_rows}')

Number of rows after cleaning: 20635
Number of rows after cleaning: 3458
Number of rows after cleaning: 3923
Number of rows after cleaning: 4211


In [10]:
##==================== LABELS OF SUB-TASKS ====================##
# Mappings between the two match labels and their integer class ids.
# NOTE(review): label2id / id2label are not passed to the model in the visible cells — confirm whether they should be.
label2id = {'NOT-MATCH': 0, 'MATCH': 1}
id2label = {0: 'NOT-MATCH', 1: 'MATCH'}
# Class names ordered by class id (index 0 -> 'NOT-MATCH', index 1 -> 'MATCH').
class_names = ['NOT-MATCH', 'MATCH']

In [None]:
#Merging the Two Columns together, Alternatively you can use them as separate input for the model as well.
train_df["kp_arg"] = 'Keypoint: ' + train_df["key_point"].str.strip() + "; " + 'Επιχείρημα: ' + train_df["arg"].str.strip()
dev_df["kp_arg"] = 'Keypoint: ' + dev_df["key_point"].str.strip() + "; " + 'Επιχείρημα: ' + dev_df["arg"].str.strip()


#Creating another dataframe with only the required columns.
train_merged = train_df[["label","kp_arg"]]
dev_merged=dev_df[["label","kp_arg"]]


In [None]:
# Helper that returns the length of the longest entry in a column (used for 'kp_arg')
def calculate_length(df, column_name):
    """Return the maximum `len()` over all entries of `df[column_name]`."""
    entry_lengths = df[column_name].map(len)
    return entry_lengths.max()

# Calculate the maximum length (len() of the merged string) of 'kp_arg' in each dataset
train_max_length = calculate_length(train_merged, "kp_arg")
dev_max_length = calculate_length(dev_merged, "kp_arg")


# Print the maximum lengths
print("Maximum length in train dataset:", train_max_length)
print("Maximum length in dev dataset:", dev_max_length)



Maximum length in train dataset: 438
Maximum length in dev dataset: 399
Maximum length in test dataset: 440
Maximum length in pred_dev dataset: 399


In [None]:
# Convert the merged frames to Hugging Face datasets.
# clean_dataframe drops rows without resetting the index; preserve_index=False keeps that
# leftover pandas index from being stored as an extra `__index_level_0__` column.
train_hf_dataset = Dataset.from_pandas(train_merged, preserve_index=False)
dev_hf_dataset = Dataset.from_pandas(dev_merged, preserve_index=False)


#compute class weights

In [14]:
train_df.label.value_counts(normalize=True)


label
0    0.793555
1    0.206445
Name: proportion, dtype: float64

In [15]:
# Inverse class frequency per label; sort_index() orders the entries by label value (0, then 1).
class_weights=(1/train_df.label.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
# Normalise so the weights sum to 1; the rarer class ends up with the larger weight.
class_weights=class_weights/class_weights.sum()
class_weights

tensor([0.2064, 0.7936])

In [16]:
# LoRA adapter configuration: r=8 adapters attached to the q_proj and v_proj modules,
# no dropout, no bias training, for a sequence-classification ("SEQ_CLS") task.
peft_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.0,
    target_modules=[
        "q_proj",
        "v_proj"
    ],
    bias="none",
    task_type="SEQ_CLS",

)

# bitsandbytes settings: load the weights in 4-bit "nf4" format with double
# quantisation enabled and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [17]:
# Set seed for reproducibility
set_seed(42)
# Load the slow (use_fast=False) tokenizer for the base model, requesting BOS and EOS
# tokens via add_bos_token / add_eos_token.
tokenizer = AutoTokenizer.from_pretrained(model_str,
                                          trust_remote_code=True,
                                          add_bos_token=True,
                                          add_eos_token=True,
                                          use_fast=False
                                         )

# Reuse the EOS token as the padding token (both the id and the token string are set).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.97M [00:00<?, ?B/s]

In [18]:
#Loading the Model
model_quantized = MistralForSequenceClassification.from_pretrained(model_str, num_labels=train_df.label.nunique(), quantization_config=bnb_config, device_map="auto")


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/504M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at ilsp/Meltemi-7B-v1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model_quantized = prepare_model_for_kbit_training(model_quantized)

In [20]:
#Setting up the LoRA Adapter
model_main = get_peft_model(model_quantized, peft_config)

In [21]:
#Setting the Pretraining_tp to 1 ensures we are using the Linear Layers to the max computation possible
model_main.config.pretraining_tp = 1 #For Us this would be 7B
model_main.config.use_cache = False
#Ensuring the model is aware about the pad token ID
model_main.config.pad_token_id = tokenizer.pad_token_id

In [22]:
model_main.print_trainable_parameters()

trainable params: 3,416,064 || all params: 7,234,998,272 || trainable%: 0.0472


In [23]:
model_main

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MistralForSequenceClassification(
      (model): MistralModel(
        (embed_tokens): Embedding(61520, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
          

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [25]:
# Upper bound on tokens per example; longer inputs are truncated. 512 is used because of
# memory constraints; tokenizer.model_max_length gives the model's own maximum.
max_length = 512


# Tokenizing the Datasets
def tokenize_function(examples):
    """Tokenize the merged key-point/argument text of a batch of examples.

    No padding is applied here. The DataCollatorWithPadding(padding="longest")
    handed to the trainer pads each batch to its longest sequence, so examples
    are no longer all inflated to `max_length` tokens as they were with
    `padding="max_length"`.

    Args:
        examples: batch from `Dataset.map(..., batched=True)` with a "kp_arg" field.

    Returns:
        The tokenizer output for the batch, truncated to `max_length`.
    """
    return tokenizer(examples["kp_arg"], max_length=max_length, truncation=True)

In [None]:
tokenized_train = train_hf_dataset.map(tokenize_function, batched=True)
tokenized_dev = dev_hf_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/20635 [00:00<?, ? examples/s]

Map:   0%|          | 0/3458 [00:00<?, ? examples/s]

Map:   0%|          | 0/3923 [00:00<?, ? examples/s]

Map:   0%|          | 0/4211 [00:00<?, ? examples/s]

In [27]:
tokenized_train = tokenized_train.remove_columns(['kp_arg'])
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_train.set_format("torch")


In [28]:
tokenized_dev= tokenized_dev.remove_columns(['kp_arg'])
tokenized_dev = tokenized_dev.rename_column("label", "labels")
tokenized_dev.set_format("torch")

#Define custom trainer with class weights

In [29]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: optional sequence/array/tensor with one weight per class,
            used to counter class imbalance. `None` gives plain cross-entropy.
        *args, **kwargs: forwarded unchanged to `Trainer`.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): recent Trainer versions appear to skip dividing the loss by
        # gradient_accumulation_steps when this flag is True, expecting the loss to be
        # normalised with num_items_in_batch. This loss is a per-batch mean, so opt out.
        # The attribute is simply unused on versions that do not read it — confirm
        # against the installed transformers version.
        self.model_accepts_loss_kwargs = False
        if class_weights is not None:
            # as_tensor accepts lists, arrays and tensors alike and avoids the
            # "copy construct from a tensor" warning that torch.tensor(tensor) raises.
            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy loss for one batch.

        Args:
            model: model to run the forward pass with.
            inputs: batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: when True, return `(loss, outputs)` instead of only the loss.
            num_items_in_batch: accepted because newer Trainer versions pass it to
                `compute_loss`; it is not used, the loss stays a mean over the batch.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.
        """
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass without labels so the model does not compute its own loss
        outputs = model(**inputs)
        logits = outputs.get('logits')

        # Align the weights with the logits; with device_map="auto" the logits need
        # not live on self.args.device, and their dtype may differ from float32.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = F.cross_entropy(logits, labels.to(logits.device), weight=weight)

        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(p):
    """Return micro-F1, macro-F1 and accuracy for the predictions held in `p`.

    `p.predictions` may be the logits array or a tuple whose first element is
    the logits; `p.label_ids` holds the reference labels.
    """
    raw_predictions = p.predictions
    logits = raw_predictions[0] if isinstance(raw_predictions, tuple) else raw_predictions
    preds = np.argmax(logits, axis=1)
    y_true = p.label_ids

    f1_by_average = {
        average: f1_score(y_true=y_true, y_pred=preds, average=average, zero_division=0)
        for average in ('macro', 'micro')
    }
    return {
        'micro_f1': f1_by_average['micro'],
        'macro_f1': f1_by_average['macro'],
        'accuracy': accuracy_score(y_true=y_true, y_pred=preds),
    }

# A function that calculates all the metrics using the validation/test set
def calculate_metrics(y_true, preds, class_names, save_directory_name, y_score=None):
    """Print classification metrics and save confusion-matrix and PR-curve plots.

    Args:
        y_true: reference labels.
        preds: predicted class ids (hard 0/1 decisions).
        class_names: class names ordered by class id, used for the report and heatmap.
        save_directory_name: path of the confusion-matrix image; the PR curve is
            saved to the same path with "_pr_curve.png" appended.
        y_score: optional continuous scores for the positive class (e.g. the
            softmax probability of class 1). A precision-recall curve needs scores
            to be meaningful; when omitted, the hard predictions are used as before,
            which yields a curve with a single operating point.

    Returns:
        Tuple `(macro_f1, micro_f1, per_class_f1, accuracy, per_class_precision,
        per_class_recall, per_class_support)`.
    """
    print('\nCALCULATING METRICS...')

    # Calculate the accuracy of the model
    acc = accuracy_score(y_true, preds)
    # Calculate the Matthews Correlation Coefficient
    mcc = matthews_corrcoef(y_true, preds)
    model_f1_score_micro = f1_score(y_true, preds, average='micro', zero_division=1)
    model_f1_score_macro = f1_score(y_true, preds, average='macro', zero_division=1)
    model_precision_macro = precision_score(y_true, preds, average='macro', zero_division=1)
    model_recall_macro = recall_score(y_true, preds, average='macro', zero_division=1)

    # Compute precision, recall, F1-score for each class
    precision, recall, fscore, support = score(y_true, preds, zero_division=1)

    print(f'Accuracy: {acc}')
    print(f'Micro-F1 Score: {model_f1_score_micro}')
    print(f'Macro-F1 Score: {model_f1_score_macro}')
    print(f'Macro-Precision Score: {model_precision_macro}')
    print(f'Macro-Recall Score: {model_recall_macro}')
    print(f'Matthews Correlation Coefficient: {mcc}')
    print(f'\nPrecision of each class: {precision}')
    print(f'Recall of each class: {recall}')
    print(f'F1 score of each class: {fscore}')
    print(classification_report(y_true, preds, target_names=class_names, digits=4))

    # Compute and plot the confusion matrix on a fresh figure so that it never
    # draws on top of a figure left open by an earlier cell.
    cm = confusion_matrix(y_true, preds)
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    plt.figure()
    hmap = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True Labels')
    plt.xlabel('Predicted Labels')
    plt.savefig(save_directory_name, bbox_inches='tight')
    plt.close()

    # Calculate Precision-Recall curve and Average Precision (AP) from the
    # continuous scores when available, otherwise from the hard predictions.
    ranking_scores = preds if y_score is None else y_score
    precision_vals, recall_vals, _ = precision_recall_curve(y_true, ranking_scores)
    average_precision = average_precision_score(y_true, ranking_scores)

    # Plot Precision-Recall curve; the legend value is the average precision
    plt.figure()
    plt.plot(recall_vals, precision_vals, color='darkorange', marker='.',
             label=f'PR Curve (AP={average_precision:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower right')
    plt.grid(True)

    # Save the PR curve plot
    plt.savefig(f"{save_directory_name}_pr_curve.png", bbox_inches='tight')
    plt.close()

    print(f'Average Precision (AP): {average_precision:.4f}')

    return model_f1_score_macro, model_f1_score_micro, fscore, acc, precision, recall, support

In [31]:
# Evaluate, save and log every `steps` optimisation steps.
steps = 129
project = "kpm_fine_tuning_weighted"
base_model_name = "meltemi_7b_v1_base"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

training_args = TrainingArguments(
    output_dir=output_dir,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
    eval_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    max_grad_norm=0.3,
    log_level="debug",
    optim='paged_adamw_32bit',
    lr_scheduler_type="linear",
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="steps",
    # eval_steps and save_steps are equal, so every evaluated step also has a saved checkpoint.
    load_best_model_at_end=True,
    push_to_hub=False,
    seed=42,
    warmup_steps=0,
    eval_steps=steps,
    save_steps=steps,
    logging_steps=steps,
    bf16=False,
    gradient_checkpointing=True,
    report_to='none'
)




In [32]:
# Setting up the Trainer API
# CustomTrainer applies the class weights computed above inside its cross-entropy loss.
trainer = CustomTrainer(
    model=model_main,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    class_weights=class_weights,
    compute_metrics= compute_metrics
)

# Resume from a previously saved checkpoint directory. Per the logged output below, that
# checkpoint is already at global step 645 of 645, so this call performs no new optimisation steps.
# NOTE(review): the path is an absolute Kaggle input path — the notebook only runs where that dataset is attached.
trainer.train('/kaggle/input/checkpoint_645_kp_arg_weighted/transformers/default/1/checkpoint-645')

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)
Loading model from /kaggle/input/checkpoint_645_kp_arg_weighted/transformers/default/1/checkpoint-645.
Currently training with a batch size of: 16
***** Running training *****
  Num examples = 20,635
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 645
  Number of trainable parameters = 3,416,064
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 645
  Will skip the first 1 epochs then the first 0 batches in the first epoch.


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./meltemi_7b_v1_base-kpm_fine_tuning_weighted/checkpoint-516 (score: 0.41508758068084717).
Could not locate the best model at ./m

Step,Training Loss,Validation Loss


TrainOutput(global_step=645, training_loss=0.0, metrics={'train_runtime': 0.0128, 'train_samples_per_second': 1607083.15, 'train_steps_per_second': 50233.517, 'total_flos': 4.4265818076217344e+17, 'train_loss': 0.0, 'epoch': 1.0})

Evaluation

In [33]:
#trainer.evaluate(tokenized_dev)

In [None]:
##Preds for dev set (without undecided pairs)

repository_id='./'
dev = trainer.predict(tokenized_dev)
dev_proba = F.softmax(torch.tensor(dev.predictions, dtype=torch.float), dim=1).cpu().numpy()
preds = np.argmax(dev_proba, axis=-1)
labels=tokenized_dev["labels"]
#macro_f1, micro_f1, f1_class, accuracy, precision_class, recall_class, support = calculate_metrics(labels, preds, class_names, f'{repository_id}/gr_dev_set_matrix.png')

#For getting the probs of the positive class only 
#dev_proba_df = pd.DataFrame(dev_proba)
#dev_proba_df=dev_proba_df[1]
#dev_proba_df.head()


***** Running Prediction *****
  Num examples = 3458
  Batch size = 16



CALCULATING METRICS...
Accuracy: 0.8502024291497976
Micro-F1 Score: 0.8502024291497976
Macro-F1 Score: 0.8090058482525189
Macro-Precision Score: 0.7842265825577049
Macro-Recall Score: 0.8697279013231309
Matthews Correlation Coefficient: 0.6483409531081801

Precision of each class: [0.9697099  0.59874327]
Recall of each class: [0.83566176 0.90379404]
F1 score of each class: [0.89770932 0.72030238]
              precision    recall  f1-score   support

   NOT-MATCH     0.9697    0.8357    0.8977      2720
       MATCH     0.5987    0.9038    0.7203       738

    accuracy                         0.8502      3458
   macro avg     0.7842    0.8697    0.8090      3458
weighted avg     0.8905    0.8502    0.8598      3458

Average Precision (AP): 0.5617


In [35]:
#Preds for the dev set (incl. undecided pairs)

#tokenized_pred_dev = tokenized_pred_dev.remove_columns(['kp_arg'])
#pred_dev = trainer.predict(tokenized_pred_dev)
#pred_dev_proba = F.softmax(torch.tensor(pred_dev.predictions, dtype=torch.float), dim=1).cpu().numpy()
#pred_dev_proba_df = pd.DataFrame(pred_dev_proba)
#pred_dev_proba_df=pred_dev_proba_df[1]
#pred_dev_proba_df.head()

In [36]:
##Preds on unlabelled test set

#tokenized_pred_test = tokenized_pred_test.remove_columns(['kp_arg'])
#pred_test = trainer.predict(tokenized_pred_test)
#pred_test_proba = F.softmax(torch.tensor(pred_test.predictions, dtype=torch.float), dim=1).cpu().numpy()
#pred_test_proba_df = pd.DataFrame(pred_test_proba)
#pred_test_proba_df=pred_test_proba_df[1]
#pred_test_proba_df.head()

#KPM Evaluation code

In [37]:
def load_kpm_data(gold_data_dir, subset):
    """Read the arguments, key points and labels CSV files of one data subset.

    Args:
        gold_data_dir: directory containing the gold CSV files.
        subset: subset suffix used in the file names (e.g. "dev").

    Returns:
        Tuple `(arguments_df, key_points_df, labels_df)` of DataFrames.
    """
    file_names = (
        f"arguments_human_translated_{subset}.csv",
        f"key_points_human_translated_{subset}.csv",
        f"labels_{subset}.csv",
    )
    arguments_df, key_points_df, labels_file_df = (
        pd.read_csv(os.path.join(gold_data_dir, file_name)) for file_name in file_names
    )
    return arguments_df, key_points_df, labels_file_df


def load_predictions(predictions_dir, correct_kp_list):
    """Load a predictions JSON file and keep the best valid key point per argument.

    The file maps each argument id to a `{key_point_id: score}` dict. Key points
    that are not in `correct_kp_list` are ignored; arguments left without any
    valid key point are omitted from the result.

    Args:
        predictions_dir: path of the predictions JSON file.
        correct_kp_list: iterable of the valid key point ids.

    Returns:
        DataFrame with columns "arg_id", "key_point_id" and "score", one row per
        argument that has at least one valid key point.
    """
    # A set gives constant-time membership checks instead of scanning the id list per key.
    valid_kp_ids = set(correct_kp_list)

    with open(predictions_dir, "r") as f_in:
        res = json.load(f_in)

    arg = []
    kp = []
    scores = []
    for arg_id, kps in res.items():
        valid_kps = {key: value for key, value in kps.items() if key in valid_kp_ids}
        if valid_kps:
            # max() returns the first of equally scored key points, in file order.
            best_kp_id, best_score = max(valid_kps.items(), key=lambda item: item[1])
            arg.append(arg_id)
            kp.append(best_kp_id)
            scores.append(best_score)

    return pd.DataFrame({"arg_id" : arg, "key_point_id": kp, "score": scores})

def get_predictions(predictions_file, labels_df, arg_df, kp_df):
    """Join the best predicted key point of every argument with the gold labels.

    Args:
        predictions_file: path of the predictions JSON file (see load_predictions).
        labels_df: gold labels; merged on "arg_id" and "key_point_id" and expected
            to provide a "label" column.
        arg_df: arguments frame with at least "arg_id", "topic" and "stance".
        kp_df: key points frame; its "key_point_id" values define the valid ids.

    Returns:
        One row per argument with the predicted key point, its score and the
        columns "label_strict" / "label_relaxed", in which missing gold labels
        are filled with 0 and 1 respectively.
    """
    # The message previously contained a stray U+05BF combining mark after the newline.
    print("\n** loading predictions:")
    arg_df = arg_df[["arg_id", "topic", "stance"]]
    predictions_df = load_predictions(predictions_file, kp_df["key_point_id"].unique())

    # make sure each arg_id has a prediction
    predictions_df = pd.merge(arg_df, predictions_df, how="left", on="arg_id")

    # handle arguments with no matching key point
    predictions_df["key_point_id"] = predictions_df["key_point_id"].fillna("dummy_id")
    predictions_df["score"] = predictions_df["score"].fillna(0)

    # merge each argument with the gold labels
    merged_df = pd.merge(predictions_df, labels_df, how="left", on=["arg_id", "key_point_id"])

    # a dummy key point always counts as a wrong match
    merged_df.loc[merged_df['key_point_id'] == "dummy_id", 'label'] = 0
    merged_df["label_strict"] = merged_df["label"].fillna(0)
    merged_df["label_relaxed"] = merged_df["label"].fillna(1)

    print("\n** predictions analysis:")
    for desc, group in merged_df.groupby(["stance", "topic"]):
        not_dummies = group[group["key_point_id"] != "dummy_id"]
        print(f"\t{desc}:")
        print(f"\t\tsubmitted matched for {len(not_dummies)/len(group):.2} of the arguments ({len(not_dummies)}/{len(group)})")

    return merged_df

def get_ap(df, label_column, top_percentile=0.5):
    """Average precision over the highest-scored fraction of the rows in `df`.

    Args:
        df: rows with "score", "key_point_id" and `label_column` columns.
        label_column: name of the column holding the reference labels.
        top_percentile: fraction of rows (by descending score) that is evaluated.

    Returns:
        The average precision of the selected rows.
    """
    n_top = int(len(df) * top_percentile)
    ranked = df.sort_values(by='score', ascending=False)
    df = ranked.iloc[:n_top]
    # After the top rows are selected, dummy key points get a score of 0.99 so that
    # they rank near the top and cannot increase the precision.
    is_dummy = df['key_point_id'] == "dummy_id"
    df.loc[is_dummy, 'score'] = 0.99
    return average_precision_score(y_true=df[label_column], y_score=df["score"])

def calc_mean_average_precision(df, label_column):
    """Mean of `get_ap` over every (topic, stance) group of `df`."""
    precisions = []
    for _, group in df.groupby(["topic", "stance"]):
        precisions.append(get_ap(group, label_column))
    return np.mean(precisions)

def evaluate_predictions(merged_df,name = 'train'):
    """Print and return the strict and relaxed mean average precision.

    Args:
        merged_df: frame with "label_strict", "label_relaxed", "score",
            "key_point_id", "topic" and "stance" columns (as built by get_predictions).
        name: unused; kept so existing calls keep working.

    Returns:
        Tuple `(mAP_strict, mAP_relaxed)`. Earlier versions only printed the
        values, so callers that ignore the return value are unaffected.
    """
    mAP_strict = calc_mean_average_precision(merged_df, "label_strict")
    mAP_relaxed = calc_mean_average_precision(merged_df, "label_relaxed")

    print(f"mAP strict= {mAP_strict} ; mAP relaxed = {mAP_relaxed}")

    return mAP_strict, mAP_relaxed

In [38]:
#path_dataset = '/kaggle/input/meltemi-data/'
#path_predictions_folder = './'

Evaluation of labeled/unlabeled dev set


In [39]:
#arg_df, kp_df, labels_df = load_kpm_data(path_dataset, subset="dev")
#dev_df_selected = dev_df[['arg_id', 'key_point_id']]
#new_dev= pd.concat([dev_df_selected, dev_proba_df], axis=1)
#new_dev.rename(columns={1: 'score'}, inplace=True)
###############################################################################################################################
#pred_dev_df_selected=pred_dev_df[['arg_id', 'key_point_id']]
#new_pred_dev= pd.concat([pred_dev_df_selected, pred_dev_proba_df], axis=1)
#new_pred_dev.rename(columns={1: 'score'}, inplace=True)
########################################################################################################################
#save_predictions_name = 'seq_cls_meltemi_7b_base'  
#####################################################################################################################
#args = {}
#kps = {}

#for arg,kp,score in zip(new_dev['arg_id'],new_dev['key_point_id'],new_dev['score']):
#  args[arg] = {}
#for arg,kp,score in zip(new_dev['arg_id'],new_dev['key_point_id'],new_dev['score']):
#  args[arg][kp] = score

#with open(path_predictions_folder + save_predictions_name + '_' + 'predictions_kp_arg_weighted_dev.p.', 'w') as fp:
#  fp.write(json.dumps(args))
#  fp.close()
    
############################################################################################################################
#args_pred = {}
#kps_pred = {}

#for arg,kp,score in zip(new_pred_dev['arg_id'],new_pred_dev['key_point_id'],new_pred_dev['score']):
#  args_pred[arg] = {}
#for arg,kp,score in zip(new_pred_dev['arg_id'],new_pred_dev['key_point_id'],new_pred_dev['score']):
#  args_pred[arg][kp] = score

#with open(path_predictions_folder + save_predictions_name + '_' + 'predictions_kp_arg_weighted_pred_dev.p.', 'w') as fp:
#  fp.write(json.dumps(args_pred))
#  fp.close()    

    
############################################################################################################################    
#merged_dev_df = get_predictions(path_predictions_folder + save_predictions_name + '_' + 'predictions_kp_arg_weighted_dev.p.', labels_df, arg_df, kp_df)
#merged_dev_df.to_csv('merged_df_cls_kp_arg_weighted_dev', index=False)
##########################################################################################################################
#merged_pred_dev_df = get_predictions(path_predictions_folder + save_predictions_name + '_' + 'predictions_kp_arg_weighted_pred_dev.p.', labels_df, arg_df, kp_df)
#merged_pred_dev_df.to_csv('merged_df_cls_kp_arg_weighted_pred_dev', index=False)

########################################################################################################
#evaluate_predictions(merged_dev_df)
########################################################################################################
#evaluate_predictions(merged_pred_dev_df)

Evaluation of the unlabelled test set

In [40]:
#arg_df, kp_df, labels_df = load_kpm_data(path_dataset, subset="test")
#########################################################################################################################
#pred_test_df_selected=pred_test_df[['arg_id', 'key_point_id']]
#new_pred_test= pd.concat([pred_test_df_selected, pred_test_proba_df], axis=1)
#new_pred_test.rename(columns={1: 'score'}, inplace=True)
#########################################################################################################################
#args = {}
#kps = {}

#for arg,kp,score in zip(new_pred_test['arg_id'],new_pred_test['key_point_id'],new_pred_test['score']):
#  args[arg] = {}
#for arg,kp,score in zip(new_pred_test['arg_id'],new_pred_test['key_point_id'],new_pred_test['score']):
#  args[arg][kp] = score

#with open(path_predictions_folder + save_predictions_name + '_' + 'predictions_kp_arg_weighted_pred_test.p.', 'w') as fp:
#  fp.write(json.dumps(args))
#  fp.close()
    
##########################################################################################################################
#merged_pred_test_df = get_predictions(path_predictions_folder + save_predictions_name + '_' + 'predictions_kp_arg_weighted_pred_test.p.', labels_df, arg_df, kp_df)
#merged_pred_test_df.to_csv('merged_df_cls_kp_arg_weighted_pred_test', index=False)    

########################################################################################################
#evaluate_predictions(merged_pred_test_df)

#Merge the adapter weights to the base LLM and push to hf hub

In [41]:
# Save trained model
#trainer.model.save_pretrained("meltemi_base_finetuning_kpm")

#from peft import AutoPeftModelForSequenceClassification

#instruction_tuned_model=AutoPeftModelForSequenceClassification.from_pretrained(
#    training_args.output_dir,
#    torch_dtype=torch.bfloat16,
#    trust_remote_code=True)

#merged_model=instruction_tuned_model.merge_and_unload()

#HF_USERNAME="Kleo"
#HF_REPO_NAME="Meltemi_7b_v1_base_finetuned_seq_cls_kpm"
#merged_model.push_to_hub(f"{HF_USERNAME}/{HF_REPO_NAME}")
#tokenizer.push_to_hub(f"{HF_USERNAME}/{HF_REPO_NAME}")