In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Setting Up PyTorch and Dependencies

In [None]:
%%capture
# Environment setup; %%capture hides the (long) installer output of this cell.
# NOTE(review): none of the packages below are version-pinned, so a fresh run
# may resolve to different releases than the ones this notebook was written against.

# Remove the preinstalled torch stack so it can be replaced by the build below.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
# Reinstall torch (plus xformers) from the PyTorch cu121 wheel index.
!pip install torch xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth
# --no-deps: install these packages without pulling in their own dependencies.
!pip install --no-deps trl peft accelerate bitsandbytes datasets

# Importing Libraries for LLM Training and Data Processing

In [None]:
# Data Processing and Visualization
import os
import numpy as np
import pandas as pd

# Libraries for Training LLMs
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import Dataset, load_dataset, DatasetDict

# Model Saving
from transformers import AutoModelForSequenceClassification

# LLM Model and Tokenizer Utilities
from transformers import (AutoModelForCausalLM, 
                          DataCollatorForLanguageModeling,
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments,
                          Trainer,
                          DataCollatorForSeq2Seq,
                          logging)

# Evaluation Metrics
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

# Warnings
# NOTE(review): this silences *every* Python warning for the rest of the
# session, including deprecation notices from the training libraries.
import warnings
warnings.filterwarnings("ignore")

# Loading and Configuring the Llama 3.1 Model

In [None]:
# Local path of the base model checkpoint (Kaggle model input).
base_model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# Model loading options.
max_seq_length = 2048
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

# Load the pre-trained model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Wrap the model with LoRA adapters for parameter-efficient fine-tuning.
# Adapters are attached to the attention projections (q/k/v/o) and the MLP
# projections (gate/up/down) listed in `target_modules`.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    loftq_config=None,
    random_state=3407,
)

# `print_trainable_parameters()` prints its summary itself and returns None,
# so it must not be wrapped in print() (that would emit an extra "None" line).
model.print_trainable_parameters()

# Formatting Text for Sentiment Classification Input-Output Pairs

In [None]:
# Prompt template shared by training and inference. The first `{}` receives the
# input sentence; the second `{}` receives the label during training and is
# left empty at inference time so the model generates it.
# The text is sent to the model verbatim, so its wording and line breaks matter.
data_prompt = """Classify the text into 'Positive', 'Negative', and return the answer as the predicted sentiment.
### Input:
{}
### Response:
{}"""

# End-of-sequence token of the loaded tokenizer; appended to every training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompt(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: A batch mapping with parallel "sentence" and "label"
            sequences (as passed by a batched ``Dataset.map``).

    Returns:
        A dict with a single "text" key holding one string per example:
        the prompt template filled with the sentence and its label,
        followed by the EOS token.
    """
    sentences = examples["sentence"]
    labels = examples["label"]
    rendered = [
        data_prompt.format(sentence, label) + EOS_TOKEN
        for sentence, label in zip(sentences, labels)
    ]
    return {"text": rendered}

# Load training and testing datasets

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
# Later cells read the "sentence" column from both frames and "label" from train.
train_df = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")
test_df = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/test.csv")

# Converting DataFrames to Datasets and Applying Preprocessing

In [None]:
# The test split is only converted; prompts for it are built at prediction time.
testing_data = Dataset.from_pandas(test_df)

# The training split additionally gets a "text" column holding the fully
# rendered prompt + label for each row.
training_data = Dataset.from_pandas(train_df).map(formatting_prompt, batched=True)

# Configuring Hyperparameters for Model Training

In [None]:
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

# Hyperparameters for supervised fine-tuning.
sft_config = SFTConfig(
    # Data handling
    dataset_text_field="text",  # column produced by `formatting_prompt`
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    # Batch size: 2 sequences x 4 accumulation steps = 8 sequences per optimizer step per device
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Training length is controlled by `max_steps` alone: a positive `max_steps`
    # overrides `num_train_epochs`, so the former `num_train_epochs=20` had no
    # effect and was removed to avoid suggesting a 20-epoch run.
    max_steps=70,
    warmup_steps=5,
    # Optimisation
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    # Mixed precision: bf16 where the hardware supports it, otherwise fp16
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    # Bookkeeping
    logging_steps=1,
    output_dir="output",
    seed=3407,
    report_to="none",
)


# Trainer that fine-tunes the LoRA-wrapped model on the formatted training set.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=training_data,
    args=sft_config,
)


# Model Training

In [None]:
# Run the fine-tuning loop with the settings from `sft_config`.
trainer.train()

In [None]:
# Switch the model to Unsloth's inference mode. The call is used for its
# in-place effect (as in Unsloth's own examples); `model` is deliberately not
# rebound to the return value, so the name stays valid whatever the call returns.
# The trailing semicolon keeps the notebook from echoing the return value.
FastLanguageModel.for_inference(model);

In [None]:
# Bring both splits back into pandas for row-wise inference.
X_train = training_data.to_pandas()
X_test = testing_data.to_pandas()

# Distinct labels seen in the training data; `predict` matches model output against these.
unique_labels = X_train["label"].unique()
print(unique_labels)

# Making Predictions with the Model on Test Data

In [None]:
from tqdm import tqdm

def _match_label(answer, categories, default_label):
    """Return the first category named in ``answer`` (case-insensitive), else ``default_label``."""
    lowered = answer.lower()
    return next((c for c in categories if c.lower() in lowered), default_label)


def predict(test, model, tokenizer, categories=None, default_label="Positive"):
    """Generate a sentiment label for every row of ``test``.

    Args:
        test: DataFrame with a "sentence" column.
        model: Model exposing ``generate``; inputs are moved to ``model.device``
            when that attribute exists, otherwise to "cuda".
        tokenizer: Tokenizer used to encode each prompt and decode the output.
        categories: Iterable of label strings to look for in the generated
            text. Defaults to the module-level ``unique_labels``.
        default_label: Label used when no category appears in the output.

    Returns:
        A list with one predicted label per row of ``test``, in row order.
    """
    # Default to the labels found in the training data. Reading a module-level
    # name needs no `global` declaration.
    if categories is None:
        categories = unique_labels

    # Follow the model's device instead of assuming CUDA.
    device = getattr(model, "device", "cuda")

    y_pred = []
    for sent in tqdm(test["sentence"]):
        # Build the prompt with an empty response slot and tokenize it.
        inputs = tokenizer(
            [data_prompt.format(sent, "")], return_tensors="pt").to(device)

        # NOTE(review): `temperature` only matters if sampling is enabled, which
        # depends on the model's generation config — confirm whether
        # deterministic decoding is intended here.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=10, temperature=0.1)

        # The decoded text contains the prompt too; keep only what follows the
        # last response marker.
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = answer.split("### Response:")[-1].strip()

        y_pred.append(_match_label(answer, categories, default_label))

    return y_pred

# Run inference over the whole test set; `y_pred` holds one predicted label per row.
y_pred = predict(X_test, model, tokenizer)

In [None]:
# Build the submission table. IDs are 1-based and sized from the predictions
# instead of a hard-coded 100 rows, which fails with a length mismatch for any
# other test-set size.
# NOTE(review): assumes sequential IDs in test.csv row order are what the
# competition expects — confirm against the sample submission.
submission = pd.DataFrame({
    "ID": range(1, len(y_pred) + 1),
    "label": y_pred,
})

submission.to_csv("submission.csv", index=False)