## Setting up

In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [2]:
import wandb
wandb.init(mode="disabled")



In [3]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

2024-10-13 12:55:12.073094: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-13 12:55:12.073205: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-13 12:55:12.199567: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Loading and processing the dataset

In [4]:
# df = pd.read_csv("/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv",index_col = "Unnamed: 0")
# df.loc[:,'status'] = df.loc[:,'status'].str.replace('Bi-Polar','Bipolar')
# df = df[(df.status != "Personality disorder") & (df.status != "Stress") & (df.status != "Suicidal")]
# df.head()

In [5]:
sampled_df = pd.read_json('/kaggle/input/coling-25-task-1/multilingual_dev.jsonl', lines=True)
percentage = 1 

# Sample x percent of the DataFrame
df = sampled_df.sample(frac=percentage, random_state=42)  # random_state for reproducibility

df

Unnamed: 0,id,source,sub_source,lang,model,label,text
14032,c5f6649d-e63e-412a-9ee9-6c770a9741ce,mage,yelp,en,human,0,Pizza was delivered in a timely fashion but it...
285814,1ec924cf-0539-4f7b-9a4f-35746a5c6d35,mage,wp,en,human,0,"Mom, Jane and I want to know who you're going ..."
154227,e460e7ec-4ad0-407f-84be-ea04f3186c32,hc3,reddit_eli5,en,human,0,Is there any species in particular you are ref...
242382,e3311384-e600-414c-83e3-870df2e945b3,m4gt,arxiv,en,human,0,The selection of features that are relevant fo...
186635,2e38e33e-360c-46ea-b41a-144f4163a63a,mage,wp,en,7B,1,"ninja edit: I read ""paradise"" not ""parasite."" ..."
...,...,...,...,...,...,...,...
119879,18411c12-fb33-4566-9403-e79a3d7bda2c,m4gt,CHANGE-it NEWS,it,llama2-fine-tuned,1,"Siate efficienti e meno occupati "". Sono quest..."
259178,a177d884-5d67-493e-adca-02cbcfe73375,m4gt,outfox,en,llama3-70b,1,The more the better ! Getting Advice from many...
131932,7448626d-49b0-473d-81c2-938ad1db78a6,m4gt,reddit,en,human,0,I don't know the text in the MS law specifical...
146867,d087e101-0ad4-4c7c-90be-a7e9c9c07e8e,mage,xsum,en,human,0,"Ian Coulter, formerly of Tughans, has been que..."


In [6]:
import pandas as pd

# Assuming df is the DataFrame you provided

# Group by 'lang' and find the minimum group size
min_size = df['lang'].value_counts().min()

# Sample min_size rows for each language
balanced_df = df.groupby('lang').apply(lambda x: x.sample(min_size)).reset_index(drop=True)
df = balanced_df
# Display the balanced DataFrame
balanced_df


  balanced_df = df.groupby('lang').apply(lambda x: x.sample(min_size)).reset_index(drop=True)


Unnamed: 0,id,source,sub_source,lang,model,label,text
0,fbaff8ff-f58e-4f8f-a2b1-354898940887,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,قال وزير الدفاع البريطاني، بن والاس، إن بريطان...
1,e6722859-fd0f-4409-93a3-8d3f8a814b53,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,حذرت إسرائيل حركتي المقاومة الإسلامية (حماس) و...
2,61f490f4-5dc4-48d5-9105-d82e6e86f52d,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,أشار الدكتور بيرجر دولتس إلى أن اضطراب الشخصية...
3,98c3962d-08d2-48fe-bf82-6fce0ca1df4a,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,فريدة أحمد تسعى غالبية النساء إلى الحفاظ على ن...
4,9a89e0f1-1998-430d-9fa1-ea80349cf36e,m4gt,News/Wikipedia,ar,gpt-3.5-turbo,1,أعلنت شركة أميركية ناشئة عن بيع روبوتها الجديد...
...,...,...,...,...,...,...,...
5395,7e389a81-f356-42e1-95a9-616ca5482abb,hc3,law,zh,human,0,律师费诉前交，法院费用也是诉前。估计下来一万多。
5396,e95a6a83-7133-4c39-afc5-7df7afc50938,hc3,baike,zh,gpt-35,1,手机定位跟踪器是一种能够跟踪手机用户位置的技术。它通过使用手机的GPS、Wi-Fi或移动网络...
5397,14c31e57-33ba-491e-854e-17ad4647f47c,hc3,open_qa,zh,gpt-35,1,学习识别各种车辆可能有一些挑战，但是有一些技巧可以帮助你更容易地区分不同的车型。 \n1. ...
5398,0a6c1e66-e939-43b6-9dca-9186de29650e,hc3,medicine,zh,gpt-35,1,羊角风是一种慢性传染病，其主要症状是肝脏、脾脏受到损害，导致肝功能异常。羊角风患者应避免食用...


In [7]:
df.lang.value_counts()

lang
ar    600
bg    600
de    600
en    600
id    600
it    600
ru    600
ur    600
zh    600
Name: count, dtype: int64

In [8]:
# Function to convert labels
def convert_label(label):
    return "human" if label == 0 else "machine"

# Apply the conversion function to the 'label' column
df['label'] = df['label'].apply(convert_label)

# Display the resulting DataFrame
df = df[['text','label']]
df.rename(columns={'text': 'statement', 'label': 'status'}, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'text': 'statement', 'label': 'status'}, inplace=True)


Unnamed: 0,statement,status
0,قال وزير الدفاع البريطاني، بن والاس، إن بريطان...,machine
1,حذرت إسرائيل حركتي المقاومة الإسلامية (حماس) و...,machine
2,أشار الدكتور بيرجر دولتس إلى أن اضطراب الشخصية...,machine
3,فريدة أحمد تسعى غالبية النساء إلى الحفاظ على ن...,machine
4,أعلنت شركة أميركية ناشئة عن بيع روبوتها الجديد...,machine
...,...,...
5395,律师费诉前交，法院费用也是诉前。估计下来一万多。,human
5396,手机定位跟踪器是一种能够跟踪手机用户位置的技术。它通过使用手机的GPS、Wi-Fi或移动网络...,machine
5397,学习识别各种车辆可能有一些挑战，但是有一些技巧可以帮助你更容易地区分不同的车型。 \n1. ...,machine
5398,羊角风是一种慢性传染病，其主要症状是肝脏、脾脏受到损害，导致肝功能异常。羊角风患者应避免食用...,machine


In [9]:
# Shuffle the DataFrame and select only 3000 rows
df = df.sample(frac=1, random_state=85).reset_index(drop=True)

# Split the DataFrame
train_size = 0.8
eval_size = 0.1

# Calculate sizes
train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Split the data
X_train = df[:train_end]
X_eval = df[train_end:eval_end]
X_test = df[eval_end:]


System_message= "You are an advanced AI model specialized in detecting whether a given text is machine-generated or human-written. Your expertise allows you to analyze texts in various languages with accuracy."

# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
    {System_message}
    Please classify the following text and provide your answer as either "machine generated" or "human written".

    Text: {data_point["statement"]}
    label: {data_point["status"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
    {System_message}
    Please classify the following text and provide your answer as either "machine generated" or "human written".

    Text: {data_point["statement"]}
    label: """.strip()

# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

X_test = X_test.sample(frac=0.3, random_state=42)  # random_state for reproducibility

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'status']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)


In [10]:
X_train.status.value_counts(),X_eval.status.value_counts()

(status
 machine    2809
 human      1511
 Name: count, dtype: int64,
 status
 machine    335
 human      205
 Name: count, dtype: int64)

In [11]:
y_true.value_counts()

status
machine    105
human       57
Name: count, dtype: int64

In [12]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [13]:
train_data['text'][3]

'You are an advanced AI model specialized in detecting whether a given text is machine-generated or human-written. Your expertise allows you to analyze texts in various languages with accuracy.\n\n    Please classify the following text and provide your answer as either "machine generated" or "human written".\n\n    Text: организации из числа юридических лиц (за исключением органа местного самоуправления и муниципального учреждения муниципальных образований автономного округа, государственного учреждения автономного округа, религиозной и общественной организации), отнесенные к крупным предприятиям, к средним предприятиям в соответствии со статьей 4 Федерального закона от 24 июля 2007 года № 209-ФЗ «О развитии малого и среднего предпринимательства в Российской Федерации», работники которых находятся под риском увольнения (простой, введение режима неполного рабочего времени, предоставление отпусков без сохранения заработной платы по инициативе работодателей, проведение мероприятий по высв

## Loading the model and tokenizer

In [14]:
base_model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

## Model evalution before fine-tuning

In [16]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["human", "machine"]
    
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=20, 
                        temperature=0.4)
        
        result = pipe(prompt)
#         print(result)
        answer = result[0]['generated_text'].split("label:")[-1].strip()
        
        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred

In [17]:
def evaluate(y_true, y_pred):
    labels = ["human", "machine"]
    mapping = {label: idx for idx, label in enumerate(labels)}
    
    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data
    
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [18]:

y_pred = predict(X_test, model, tokenizer)
print(y_pred)
evaluate(y_true, y_pred)

  0%|          | 0/162 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|██████████| 162/162 [07:32<00:00,  2.79s/it]

['machine', 'human', 'machine', 'machine', 'machine', 'human', 'machine', 'human', 'machine', 'machine', 'human', 'human', 'machine', 'machine', 'machine', 'human', 'none', 'machine', 'machine', 'machine', 'machine', 'machine', 'human', 'machine', 'human', 'machine', 'none', 'machine', 'machine', 'machine', 'machine', 'human', 'machine', 'human', 'machine', 'machine', 'machine', 'machine', 'human', 'none', 'machine', 'machine', 'machine', 'human', 'machine', 'machine', 'machine', 'human', 'machine', 'human', 'machine', 'machine', 'machine', 'machine', 'machine', 'machine', 'human', 'human', 'machine', 'machine', 'human', 'none', 'human', 'machine', 'human', 'machine', 'human', 'machine', 'machine', 'machine', 'machine', 'machine', 'human', 'machine', 'machine', 'machine', 'machine', 'machine', 'machine', 'human', 'machine', 'machine', 'machine', 'machine', 'machine', 'machine', 'machine', 'machine', 'human', 'machine', 'machine', 'machine', 'machine', 'human', 'machine', 'machine', 'ma




## Extracting the linear modules names

In [19]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [20]:
modules = find_all_linear_names(model)
modules

['gate_proj', 'down_proj', 'v_proj', 'k_proj', 'q_proj', 'up_proj', 'o_proj']

## Setting up the model

In [21]:
output_dir="llama-3.1-fine-tuned-model"

peft_config = LoraConfig(
    lora_alpha=8,
    lora_dropout=0,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=1,                         
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    save_total_limit=2,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    eval_strategy="steps",                    # save checkpoint every epoch
    eval_steps = 0.1
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=512,
    packing=False,
    dataset_kwargs={
    "add_special_tokens": False,
    "append_concat_token": False,
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/540 [00:00<?, ? examples/s]

## Model Training

In [None]:
print("Training started")

# Train model
trainer.train()

print("Training ended")




Step,Training Loss,Validation Loss


In [None]:
wandb.finish()
model.config.use_cache = True

## Saving the model and tokenizer

In [None]:
# Save trained model and tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
import shutil
import os

# Define your variables
dir_to_zip = output_dir

# Define the name of the output zip file
last_word = model.split('/')[-1]
output_zip = f"mullin_{output_dir}.zip"

shutil.make_archive(output_zip.replace('.zip', ''), 'zip', dir_to_zip)
print(f"Zipped contents of {dir_to_zip} into {output_zip}")

## Testing model after fine-tuning 

In [None]:
sub_df = pd.read_json('/kaggle/input/coling-25-task-1/multilingual_devtest_text_id_only.jsonl', lines=True)
sub_df

def generate_test_prompt_sub(data_point):
    return f"""
    {System_message}
    Please classify the following text and provide your answer as either "machine generated" or "human written".

    Text: {data_point["text"]}
    label: """.strip()

sub_df['text'] = sub_df.apply(generate_test_prompt_sub, axis=1)
sub_df = sub_df.sample(frac=0.01, random_state=42)  # random_state for reproducibility

print(len(sub_df))

predictions = predict(sub_df, model, tokenizer)

print("prediction ended")

prediction_file_taska = 'task_b_llm_after_training.jsonl' 
predictions_df = pd.DataFrame({'id': sub_df.id, 'label': predictions})
predictions_df.to_json(prediction_file_taska, lines=True, orient='records')
predictions_df

In [None]:
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)