In [None]:
!pip install transformers datasets
!pip install tqdm
!pip install -qq bitsandbytes
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [None]:
# Authenticate against the Hugging Face Hub with a token passed on the command line.
# NOTE(review): `secret_hf` is not defined in any visible cell. IPython expands
# `$secret_hf` from a Python variable of that name when one exists; otherwise the
# text is handed to the shell, which treats it as an environment variable.
# Confirm where the token is expected to come from before a fresh-kernel run.
!huggingface-cli login --token $secret_hf

In [None]:
from transformers import pipeline,BitsAndBytesConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from datasets import Dataset, DatasetDict
import re

In [5]:
# Read the filtered generated key points and add a placeholder 'key_point_id'
# column (every row gets the same constant value at this stage).
df_kp = pd.read_csv(
    '/kaggle/input/kps-filtered/generated_kps_filtered.csv'
).assign(key_point_id='kp_0_0')


In [6]:
# Sort by 'topic' so that rows belonging to the same topic are grouped together.
df_kp = df_kp.sort_values('topic').reset_index(drop=True)

# Build ids of the form "kp_<topic index>_<position within topic>".
topic_to_id = {}       # topic -> numeric id, assigned in order of first appearance
topic_kp_counts = {}   # topic -> number of key points already numbered for it
keypoint_ids = []

# Only the 'topic' column is needed, so iterate over it directly instead of
# materialising every row with iterrows().
for topic in df_kp['topic']:
    # A new topic receives the next free id; a known topic keeps its id.
    topic_index = topic_to_id.setdefault(topic, len(topic_to_id))
    # Counters are kept per topic, so numbering stays correct even if rows of
    # one topic are not contiguous.
    kp_index = topic_kp_counts.get(topic, 0)
    topic_kp_counts[topic] = kp_index + 1
    keypoint_ids.append(f"kp_{topic_index}_{kp_index}")

# Replace the 'key_point_id' column with the generated ids.
df_kp['key_point_id'] = keypoint_ids

In [7]:
# Load the human-translated test arguments and report the sizes of both frames.
df_arg = pd.read_csv('/kaggle/input/meltemi-data/arguments_human_translated_test.csv')
print(df_arg.shape,df_kp.shape)
# (A bare `df_kp.head()` used to follow here; as a non-final expression of the
# cell its value was discarded and nothing was displayed, so it was removed.)

def give_tuples(df_arguments, df_key_points, kp_column='Meltemi_Instruct_16shot'):
    """Pair every argument with each key point sharing its topic and stance.

    Args:
        df_arguments: DataFrame with the columns 'argument', 'topic', 'stance'
            and 'arg_id'.
        df_key_points: DataFrame with the columns 'topic', 'stance',
            'key_point_id' and the column named by ``kp_column``.
        kp_column: Name of the column in ``df_key_points`` that holds the key
            point text. Defaults to the column this notebook has always used.

    Returns:
        A tuple of six parallel lists
        ``(arguments, key_points, topics, stance, arg_ids, kp_ids)`` with one
        entry per (argument, key point) pair. Pairs are ordered by argument,
        then by key point row order. Arguments without any key point of the
        same topic and stance contribute no entries.
    """
    arguments, key_points, topics = [], [], []
    stance, arg_ids, kp_ids = [], [], []

    argument_rows = zip(
        df_arguments['argument'],
        df_arguments['topic'],
        df_arguments['stance'],
        df_arguments['arg_id'],
    )
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for argument, topic, arg_stance, arg_id in tqdm(argument_rows, total=len(df_arguments)):
        candidates = df_key_points[
            (df_key_points['topic'] == topic) & (df_key_points['stance'] == arg_stance)
        ]
        # An empty selection simply yields no pairs; no explicit guard is needed.
        for kp_text, kp_id in zip(candidates[kp_column], candidates['key_point_id']):
            arguments.append(argument)
            key_points.append(kp_text)
            topics.append(topic)
            stance.append(arg_stance)
            arg_ids.append(arg_id)
            kp_ids.append(kp_id)

    return arguments, key_points, topics, stance, arg_ids, kp_ids

# Expand the arguments into (argument, key point) pairs and collect them in a
# single frame, one row per pair.
args, kps, topics, stance, arg_ids, kp_ids = give_tuples(df_arg, df_kp)
test_df = pd.DataFrame(
    {
        'arg_id': arg_ids,
        'key_point_id': kp_ids,
        'argument': args,
        'keypoint': kps,
        'topic': topics,
        'stance': stance,
    }
)

test_df.head()

(723, 4) (30, 7)


723it [00:00, 1809.74it/s]


Unnamed: 0,arg_id,key_point_id,argument,keypoint,topic,stance
0,arg_0_0,kp_1_4,Οι εμβολιασμοί ρουτίνας δεν είναι απαραίτητοι ...,Η φροντίδα των παιδιών είναι ευθύνη των γονέων,Οι παιδικοί εμβολιασμοί ρουτίνας θα πρέπει να ...,-1
1,arg_0_0,kp_1_5,Οι εμβολιασμοί ρουτίνας δεν είναι απαραίτητοι ...,Ο εμβολιασμός μπορεί να έχει απρόβλεπτες παρεν...,Οι παιδικοί εμβολιασμοί ρουτίνας θα πρέπει να ...,-1
2,arg_0_0,kp_1_6,Οι εμβολιασμοί ρουτίνας δεν είναι απαραίτητοι ...,Η προσωπική αυτονομία είναι πιο σημαντική από ...,Οι παιδικοί εμβολιασμοί ρουτίνας θα πρέπει να ...,-1
3,arg_0_0,kp_1_7,Οι εμβολιασμοί ρουτίνας δεν είναι απαραίτητοι ...,Η υποχρεωτικότητα των εμβολιασμών παραβιάζει τ...,Οι παιδικοί εμβολιασμοί ρουτίνας θα πρέπει να ...,-1
4,arg_0_0,kp_1_8,Οι εμβολιασμοί ρουτίνας δεν είναι απαραίτητοι ...,Τα εμβόλια μπορεί να είναι επιβλαβή για τα παιδιά,Οι παιδικοί εμβολιασμοί ρουτίνας θα πρέπει να ...,-1


In [None]:
# Build the classifier input text for every pair: the stripped key point
# followed by the stripped argument, each introduced by its label.
test_df["kp_arg"] = (
    'Keypoint: '
    + test_df["keypoint"].str.strip()
    + "; "
    + 'Επιχείρημα: '
    + test_df["argument"].str.strip()
)

# Load adapters from the Hub

In [None]:
# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The adapter's config names the base model it was trained on top of.
peft_model_id = "Kleo/meltemi_arg2kp_matcher"
config = PeftConfig.from_pretrained(peft_model_id)

# Load the quantized base classifier and its tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(model, peft_model_id)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Switch to evaluation mode (the returned model is the cell's displayed value).
model.eval()

In [None]:
# Classify every "kp_arg" text, one example at a time, and keep the index of
# the highest-scoring class as the prediction.
results = []
for text in tqdm(test_df["kp_arg"], total=len(test_df), desc="Predicting"):
    # Tokenize and place the tensors on the same device as the model so the
    # forward pass does not depend on implicit device transfers.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax over the class dimension explicitly; with a single
    # example per call this leaves exactly one value for .item().
    predicted_class_id = logits.argmax(dim=-1).item()
    results.append(predicted_class_id)

# Add the predictions to the DataFrame and keep only the id/label columns.
test_df["Predicted_Label"] = results
test_df_selected = test_df[['arg_id','key_point_id','Predicted_Label']]


In [None]:
# Write the predictions as CSV; the row index is written as the first column.
# The target file name carries no extension.
test_df_selected.to_csv(
    'final_preds_with_trainer_predict_filtered',
    index=True,
)