# **Modul 6 AI LLM**

---

**Nama**: Michael Kenneth Salim <br>
**NRP**: 5027231008

## **Import Library**

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import re
import string

import torch
import torch.utils.data as data

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction, DataCollatorWithPadding
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

from huggingface_hub import login

## **EDA**

In [3]:
# Directory that holds the competition CSV files. The recorded run of this cell
# failed with FileNotFoundError, so the location is kept in one place: point
# DATA_DIR at the folder containing train.csv / test.csv before running.
DATA_DIR = '.'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
df_train.head()

In [None]:
# Preview the first rows of the test frame (head() defaults to 5 rows).
df_test.head()

In [None]:
# Number of missing values per training column (isna is the canonical name of isnull).
df_train.isna().sum()

In [None]:
# Number of missing values per test column (isna is the canonical name of isnull).
df_test.isna().sum()

### **Visualisasi**

In [None]:
# Label columns of the emotion task; later cells reuse this list for the targets.
emotion_columns = [
    'amusement', 'anger', 'annoyance', 'caring', 'confusion',
    'disappointment', 'disgust', 'embarrassment', 'excitement',
    'fear', 'gratitude', 'joy', 'love', 'sadness',
]

# Column-wise sum over all training rows, one total per emotion.
emotion_counts = df_train[emotion_columns].sum()

# One bar colour per emotion, in the same order as emotion_columns.
colors = [
    '#FF6B6B', '#FF8E53', '#FF6B9D', '#4ECDC4', '#45B7D1',
    '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8', '#F7DC6F',
    '#85C1E9', '#F8C471', '#EC7063', '#AED6F1',
]

fig, ax = plt.subplots(figsize=(14, 8))
positions = range(len(emotion_counts))

bars = ax.bar(positions, emotion_counts.values,
              color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)

ax.set_xlabel('Emotions', fontsize=12, fontweight='bold')
ax.set_ylabel('Number of Samples', fontsize=12, fontweight='bold')
ax.set_title('Distribution of Emotions in Training Data', fontsize=16, fontweight='bold', pad=20)

ax.set_xticks(positions)
ax.set_xticklabels(emotion_counts.index, rotation=45, ha='right')

# Each value label sits 1% of the tallest bar above its own bar.
label_offset = max(emotion_counts.values) * 0.01
for position, count in enumerate(emotion_counts.values):
    ax.text(position, count + label_offset, str(count),
            ha='center', va='bottom', fontweight='bold', fontsize=10)

ax.grid(axis='y', alpha=0.3, linestyle='--')

fig.tight_layout()
plt.show()

## **Data Pre-Processing**

In [None]:
def clean_text(text):
    """Normalise one raw text sample before tokenisation.

    Processing order:
      1. replace URL-like tokens (anything starting with ``http`` or ``www``)
         with a space,
      2. drop ``@mentions`` and the ``#`` sign of hashtags,
      3. strip all ASCII punctuation,
      4. replace every token that contains a digit with a space,
      5. collapse all whitespace (including newlines) and trim the ends.

    Args:
        text: The raw text. Missing scalars (``None``, ``NaN``, ``pd.NA``)
            are treated as an empty string; any other non-string value is
            converted with ``str()``.

    Returns:
        str: The cleaned text; may be empty.
    """
    if not isinstance(text, str):
        # A missing cell in the CSV arrives as a float NaN, which re.sub
        # cannot process; map missing values to an empty string instead.
        missing = pd.api.types.is_scalar(text) and pd.isna(text)
        text = '' if missing else str(text)

    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
    # This pattern consumes every digit, so no separate digit pass is needed.
    text = re.sub(r'\w*\d\w*', ' ', text)
    # '\s+' also matches newlines, so they are folded into single spaces here.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Add a cleaned copy of the raw text to both frames; the 'text' column is kept as is.
for frame in (df_train, df_test):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

In [None]:
# Compare raw and cleaned text side by side for the first 15 rows.
df_train[['text', 'cleaned_text']].head(15)

### **Dataset Class and Train/Validation Split**

In [None]:
class EmotionDataset(data.Dataset):
    """Torch dataset that tokenizes one text per item and optionally attaches labels.

    Args:
        texts: Sequence of texts (list, NumPy array or pandas Series). It is
            copied into a list so items are always looked up by position.
        labels: Optional per-sample label vectors aligned with ``texts``
            (nested list, NumPy array or pandas DataFrame). When ``None``,
            items carry no ``'labels'`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``tokenizer`` is ``None`` or if ``labels`` and
            ``texts`` have different lengths.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=256):
        if tokenizer is None:
            # Fail at construction time instead of with an opaque
            # "'NoneType' object is not callable" on the first item access.
            raise ValueError("EmotionDataset requires a tokenizer")

        # Plain positional containers: indexing a pandas Series/DataFrame with
        # [] is label-based, which breaks (or silently picks the wrong row)
        # once the index has been shuffled or filtered.
        self.texts = list(texts)
        self.labels = None if labels is None else np.asarray(labels)

        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} vs {len(self.labels)}"
            )

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return a dict of 1-D tensors.

        The dict holds ``'input_ids'`` and ``'attention_mask'``, plus a float
        ``'labels'`` tensor when labels were provided.
        """
        text = str(self.texts[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension; flatten() removes it.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        if self.labels is not None:
            # Float targets, matching problem_type="multi_label_classification"
            # configured on the model in this notebook.
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)

        return item

In [None]:
# Inputs are the cleaned strings; targets are the emotion columns, one row per sample.
X, y = df_train['cleaned_text'].values, df_train[emotion_columns].values

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
# Report how many samples ended up in each split.
n_train, n_val, n_test = len(X_train), len(X_val), len(df_test)
print(f"Train size: {n_train}")
print(f"Validation size: {n_val}")
print(f"Test size: {n_test}")

### **Initialize the First Model**

In [None]:
# Hub checkpoint id; the same name is passed to the model loader in a later
# cell so tokenizer and model come from one checkpoint.
model_name = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Sequence length handed to the tokenizer for every split.
max_length = 256

train_data = EmotionDataset(
    texts=X_train, labels=y_train, tokenizer=tokenizer, max_length=max_length
)
val_data = EmotionDataset(
    texts=X_val, labels=y_val, tokenizer=tokenizer, max_length=max_length
)

# No labels are passed for the test split, so its items only hold model inputs.
test_data = EmotionDataset(
    texts=df_test['cleaned_text'].values, labels=None,
    tokenizer=tokenizer, max_length=max_length
)

In [None]:
# Index <-> emotion-name lookups, in the order of emotion_columns.
idtolabel = dict(enumerate(emotion_columns))
labeltoid = {label: idx for idx, label in idtolabel.items()}
num_labels = len(emotion_columns)

In [None]:
# Classification-head settings: one output per emotion, multi-label problem type,
# and the label maps so the saved config carries readable label names.
label_config = dict(
    num_labels=num_labels,
    problem_type="multi_label_classification",
    id2label=idtolabel,
    label2id=labeltoid,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **label_config)

# Informational only: parameter count in millions and the device torch would pick.
n_params_millions = sum(p.numel() for p in model.parameters()) / 1e6
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Model initialized with {n_params_millions:.1f}M parameters")
print(f"Device: {device}")

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written locally.
    output_dir='emotion_classifier',

    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # highest 'f1' (the micro-averaged score returned by compute_metrics).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Logging: every 100 steps, with an empty list of reporting integrations.
    logging_steps=100,
    report_to=[],

    # Hub upload target. NOTE(review): this needs Hugging Face credentials;
    # `login` is imported but not called in the visible cells - confirm a
    # token is available in the environment.
    push_to_hub=True,
    hub_model_id="KenetHilang/emotion-classifier",
)

In [None]:
def compute_metrics(eval_pred: EvalPrediction, threshold: float = 0.5):
    """Compute micro and macro F1 for multi-label predictions.

    Logits are passed through a sigmoid and every label whose probability
    exceeds ``threshold`` is counted as predicted.

    Args:
        eval_pred: Pair of ``(predictions, labels)`` as handed over by the
            Trainer; ``predictions`` holds raw logits.
        threshold: Probability cut-off for a positive label. Defaults to 0.5,
            so the Trainer's single-argument call is unchanged.

    Returns:
        dict: ``{'f1': micro-averaged F1, 'f1_macro': macro-averaged F1}``.
    """
    predictions, labels = eval_pred

    # NOTE(review): presumably some models return a tuple of outputs with the
    # logits first; unwrap it so the tensor conversion below cannot fail.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float))
    y_pred = (probs > threshold).int().numpy()
    y_true = labels

    # zero_division=0 scores a label with no predicted/true positives as 0
    # explicitly; the default gives the same score but warns on every evaluation.
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)

    return {
        'f1': f1_micro,
        'f1_macro': f1_macro,
    }

In [None]:
# Collator that batches dataset items; it pads with the same tokenizer used to encode them.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Everything the Trainer needs: model, hyper-parameters, both splits,
# the batching function and the validation metrics.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

print("Trainer initialized successfully!")

In [None]:
# Run fine-tuning with the settings in training_args; the returned training
# summary is displayed as this cell's output.
trainer.train()

## **Upload Model to HuggingFace**

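Because `push_to_hub=True` is set in `TrainingArguments`, the Trainer pushes the model to the Hugging Face Hub every time a checkpoint is saved (here, at the end of each epoch). This requires being authenticated with the Hub before training starts.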

If you need to manually upload the model later, you can run:
```python
trainer.push_to_hub()
```

## **Generate Test Predictions for Kaggle**

Now let's generate predictions for the test data and create the submission file.

In [None]:
# Prepare test data with the same EmotionDataset/tokenizer pipeline used for
# training. (The previous version referenced `Dataset.from_dict` and
# `tokenize_function`, neither of which is imported or defined in this
# notebook, so the cell failed with NameError on a fresh kernel.)
test_data = EmotionDataset(df_test['cleaned_text'].values, None, tokenizer, max_length)

# Generate predictions: sigmoid over the logits, then a 0.5 cut-off per label.
print("Generating predictions for test data...")
predictions = trainer.predict(test_data)
test_probabilities = torch.sigmoid(torch.tensor(predictions.predictions))
test_predictions = (test_probabilities > 0.5).int().numpy()

# Every test row must receive exactly one prediction row.
assert len(test_predictions) == len(df_test), "prediction count does not match test set size"

# Create submission file
submission = pd.DataFrame(test_predictions, columns=emotion_columns)
# .values attaches the ids by position, independent of df_test's index.
submission.insert(0, 'id', df_test['id'].values)

submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")
print(f"Shape: {submission.shape}")
print("\nFirst 5 rows:")
print(submission.head())

# Check submission format
print("\nSubmission statistics:")
print(submission[emotion_columns].describe())