# Install dependencies

In [None]:
!pip install transformers datasets evaluate accelerate scikit-learn pandas --quiet
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet
!pip install onnx onnxruntime-gpu optimum[onnxruntime-gpu] huggingface_hub --quiet

# Import Libraries

In [None]:
import torch
import os
import numpy as np
import pandas as pd
from pathlib import Path
import shutil
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from optimum.onnxruntime import ORTModelForSequenceClassification
import onnxruntime as ort
from huggingface_hub import Repository, create_repo, login
import warnings
warnings.filterwarnings('ignore')

# Set Configuration

In [None]:
class Config:
    """Single source of truth for the tunable values used throughout the notebook."""

    # --- pretrained checkpoint ---
    MODEL_NAME: str = "Shushant/nepaliBERT"

    # --- label space ---
    NUM_LABELS: int = 3  # size of the classification head

    # --- tokenisation / optimisation ---
    MAX_LENGTH: int = 256          # tokenizer max_length (truncate / pad target)
    BATCH_SIZE: int = 16           # per-device training batch size
    NUM_EPOCHS: int = 5            # upper bound on training epochs
    LEARNING_RATE: float = 2e-5
    WEIGHT_DECAY: float = 0.01
    WARMUP_RATIO: float = 0.1

    # --- dataset partitioning ---
    RANDOM_SEED: int = 42          # shared by the splits and the trainer
    VAL_SIZE: float = 0.15         # fraction of all rows used for validation
    TEST_SIZE: float = 0.15        # fraction of all rows used for testing


config = Config()

# Select the compute device (CUDA if available)

In [None]:
# Default to the CPU and switch to the GPU only when PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

if device != "cpu":
    # Describe the first visible GPU (index 0).
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    # Allow TF32 kernels for matmul and cuDNN (faster training on Ampere GPUs).
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True

# Load Dataset

In [None]:
# Enter your Kaggle username and API key here (uncomment the two lines below); never commit real credentials
# os.environ["KAGGLE_USERNAME"] = ""
# os.environ["KAGGLE_KEY"] = ""

In [None]:
# Download the mathew11111/nepcov19tweets dataset from Kaggle into the current
# directory and unzip it. NOTE(review): presumably needs Kaggle API credentials
# (KAGGLE_USERNAME / KAGGLE_KEY) to be configured first — confirm.
!kaggle datasets download -d mathew11111/nepcov19tweets -p . --unzip

In [None]:
!curl -L -o dataset2.csv "https://raw.githubusercontent.com/sagarl123/NepaliNLP-SentimentAnalysis/refs/heads/main/collected_labeled_data.csv"

In [None]:
# Read both raw CSV files: df1 is the Kaggle tweet file, df2 the GitHub download.
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ("covid19_tweeter_dataset.csv", "dataset2.csv")
)

# Clean the dataset for use

In [None]:
# Keep only the label and tweet-text columns of df1.
df1 = df1.loc[:, ['Label', 'Tweet']]

In [None]:
# Keep only rows whose label is one of -1, 0 or 1. A boolean filter plus an
# explicit copy replaces the in-place drop, so df1 is a fresh, independent frame
# rather than a mutated slice of the frame selected above.
df1 = df1[df1['Label'].isin([-1, 0, 1])].copy()

In [None]:
# Use the column names the rest of the notebook expects ("labels" and "text").
df1 = df1.rename(columns=dict(Label="labels", Tweet="text"))

In [None]:
# Re-encode the tweet labels (-1 / 0 / 1) as the class ids used everywhere else in
# this notebook (metric names, classification report, model card):
#   0 = Negative, 1 = Positive, 2 = Neutral
# so positive (1) stays 1 and neutral (0) becomes 2.
# NOTE(review): assumes the source labels mean -1 negative / 0 neutral / 1 positive
# and that df2 already uses 0 = Negative, 1 = Positive, 2 = Neutral — confirm
# against both dataset descriptions before training.
df1['labels'] = df1['labels'].map({-1: 0, 1: 1, 0: 2})

In [None]:
# Display df2's column names (the next cell renames its 'label' column).
df2.columns

In [None]:
# Align df2's label column name with df1 ('label' -> 'labels').
df2 = df2.rename(columns=dict(label="labels"))

In [None]:
# Stack the two sources row-wise into a single frame with a fresh 0..n-1 index.
df = pd.concat((df1, df2), axis=0, ignore_index=True)

In [None]:
# Ensure labels are integers and texts are strings.
# Rows with a missing label or text are dropped first: astype(str) would turn a
# missing text into the literal string "nan" (a bogus training example), and
# astype(int) raises on a missing label.
df = (
    df.dropna(subset=['labels', 'text'])
      .astype({'labels': int, 'text': str})
      .reset_index(drop=True)
)

In [None]:
# Show how many rows each class id has in the merged data.
df['labels'].value_counts()

# Train / Validation / Test Split

In [None]:
# Stage 1: hold out the combined validation + test share, stratified by label.
holdout_fraction = config.TEST_SIZE + config.VAL_SIZE
train_df, temp_df = train_test_split(
    df,
    stratify=df['labels'],
    test_size=holdout_fraction,
    random_state=config.RANDOM_SEED,
)

# Stage 2: divide the hold-out between validation and test, again stratified.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df['labels'],
    test_size=config.TEST_SIZE / holdout_fraction,
    random_state=config.RANDOM_SEED,
)

In [None]:
# Report the number of rows in each split.
print(
    f"   Train: {len(train_df)} samples\n"
    f"   Validation: {len(val_df)} samples\n"
    f"   Test: {len(test_df)} samples\n"
)

# Conversion to HuggingFace dataset

In [None]:
# Wrap each pandas split in a Hugging Face Dataset; the index is reset so each
# split starts again at row 0.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (
        ('train', train_df),
        ('validation', val_df),
        ('test', test_df),
    )
})

# Load Tokenizer and Model

In [None]:
# The tokenizer and the classifier are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

# Sequence-classification model with config.NUM_LABELS output classes.
model = AutoModelForSequenceClassification.from_pretrained(
    config.MODEL_NAME,
    problem_type="single_label_classification",
    num_labels=config.NUM_LABELS,
)
model = model.to(device)

# Tokenization

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch.

    Every sequence is truncated and padded to ``config.MAX_LENGTH`` tokens, and
    the tokenizer's default (non-tensor) output is returned.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": config.MAX_LENGTH,
        "return_tensors": None,
    }
    return tokenizer(examples["text"], **encode_options)

In [None]:
# Tokenize every split in batches; the raw "text" column is removed afterwards.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Return PyTorch tensors when the dataset is indexed.
tokenized_dataset.set_format(type="torch")

# Compute metrics for evaluation

In [None]:
def compute_metrics(eval_pred):
    """Calculate accuracy, macro F1 and per-class F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds the
            logits (or a tuple whose first element is the logits); ``labels``
            holds the true class ids.

    Returns:
        dict with the keys ``macro_f1``, ``accuracy``, ``f1_negative``,
        ``f1_positive`` and ``f1_neutral`` (class ids 0, 1 and 2 respectively).
    """
    predictions, labels = eval_pred
    # When the model output is a tuple, the logits are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=1)

    macro_f1 = f1_score(labels, predicted_ids, average='macro')
    accuracy = accuracy_score(labels, predicted_ids)

    # Per-class F1 scores. The label list is pinned so the result always has
    # three entries, even when a class is missing from both the labels and the
    # predictions of this batch; otherwise indexing [2] below raises IndexError.
    f1_per_class = f1_score(
        labels,
        predicted_ids,
        labels=[0, 1, 2],
        average=None,
        zero_division=0,
    )

    return {
        'macro_f1': float(macro_f1),
        'accuracy': float(accuracy),
        'f1_negative': float(f1_per_class[0]),
        'f1_positive': float(f1_per_class[1]),
        'f1_neutral': float(f1_per_class[2]),
    }

# Custom trainer with balanced class weights

In [None]:
# One 'balanced' weight per class id found in the training split, ordered by
# ascending class id (np.unique returns sorted values).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['labels']),
    y=train_df['labels'],
)

# Float32 copy on the training device, consumed by the weighted loss.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=device)

In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and compute the class-weighted loss.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; its ``"labels"`` entry is removed before the
                forward pass and used as the loss target.
            return_outputs: when true, also return the model outputs.
            **kwargs: accepted and ignored.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        # Weighted cross-entropy over the logits (mean reduction by default).
        loss = torch.nn.functional.cross_entropy(
            outputs.logits,
            targets,
            weight=class_weights_tensor,
        )

        if return_outputs:
            return loss, outputs
        return loss

# Training arguments

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    seed=config.RANDOM_SEED,
    report_to="none",       # no external experiment tracker
    disable_tqdm=False,     # keep progress bars on

    # --- schedule and optimiser ---
    num_train_epochs=config.NUM_EPOCHS,
    learning_rate=config.LEARNING_RATE,
    warmup_ratio=config.WARMUP_RATIO,
    weight_decay=config.WEIGHT_DECAY,

    # --- batching and throughput ---
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=2 * config.BATCH_SIZE,
    gradient_accumulation_steps=2,
    gradient_checkpointing=False,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present

    # --- evaluation and checkpoint selection ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
)

# Initialize Trainer

In [None]:
trainer = WeightedTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # Early stopping with a patience of two evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train model

In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept in train_result.
train_result = trainer.train()

# Evaluation

In [None]:
# Score the held-out test split; metric keys come back prefixed with "eval_".
test_results = trainer.evaluate(tokenized_dataset["test"])

print("Test Set Results:")
print(
    "\n".join(
        f"   {title}: {test_results[metric_key]:.4f}"
        for title, metric_key in (
            ("Macro F1 Score", "eval_macro_f1"),
            ("Accuracy", "eval_accuracy"),
            ("F1 (Negative)", "eval_f1_negative"),
            ("F1 (Neutral)", "eval_f1_neutral"),
            ("F1 (Positive)", "eval_f1_positive"),
        )
    )
    + "\n"
)

In [None]:
# Per-class precision / recall / F1 on the test split.
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Class names are listed in class-id order (0, 1, 2).
print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=['Negative', 'Positive', 'Neutral'],
    )
)

# ONNX configurations

In [None]:
# Use the model held by the trainer after training (load_best_model_at_end=True
# is set in the training arguments).
model = trainer.model

# Switch the module to evaluation mode before export.
model.eval()

# Export to ONNX

In [None]:
# Save the fine-tuned weights and config for the ONNX export step.
model.save_pretrained("tmp_model")
# Also save the tokenizer files so the exported/uploaded folder is self-contained
# and can be loaded without going back to the base checkpoint.
tokenizer.save_pretrained("tmp_model")

In [None]:
# Load the checkpoint saved in tmp_model and convert it to ONNX (export=True).
ort_model = ORTModelForSequenceClassification.from_pretrained("tmp_model", export=True)

In [None]:
# Write the ONNX model into the same tmp_model folder (read back below as
# tmp_model/model.onnx).
ort_model.save_pretrained("tmp_model")

# Verify that the ONNX model works

In [None]:
# Smoke-test the exported ONNX graph on a single sentence.
test_text = "सेवाको गुणस्तर धेरै कमजोर छ, म सन्तुष्ट छैन।"
encoded = tokenizer(test_text, return_tensors="np", padding=True, return_token_type_ids=True)

onnx_path = Path('tmp_model') / 'model.onnx'

# Prefer CUDA, but fall back to the CPU provider so the check also runs on a
# machine (or onnxruntime build) without GPU support.
preferred_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
available_providers = ort.get_available_providers()
providers = [name for name in preferred_providers if name in available_providers]
ort_sess = ort.InferenceSession(str(onnx_path), providers=providers)

# Feed only the tensors the exported graph actually declares: passing an
# undeclared input (e.g. token_type_ids for a graph exported without it)
# makes session.run fail.
graph_input_names = [graph_input.name for graph_input in ort_sess.get_inputs()]
inputs = {
    name: encoded[name].astype(np.int64)
    for name in graph_input_names
    if name in encoded
}

# run() returns a list with one array per requested output name.
logits = ort_sess.run(['logits'], inputs)

print("ONNX logits:", logits)

# Create model card

In [None]:
# Metrics measured on the test split in the evaluation section; they are written
# into the model card so the published numbers always match this run instead of
# hand-typed placeholders.
card_macro_f1 = test_results['eval_macro_f1']
card_accuracy = test_results['eval_accuracy']

# README.md content for the model repository: YAML metadata header followed by
# the markdown body. Doubled braces are literal braces in the rendered card.
model_card = f"""---
language: ne
license: apache-2.0
tags:
- sentiment-analysis
- nepali
- onnx
- bert
- text-classification
datasets:
- custom-nepali-sentiment
metrics:
- f1
- accuracy
model-index:
- name: mohit4519/nepali-sentiment
  results:
  - task:
      type: text-classification
      name: Sentiment Analysis
    dataset:
      name: Nepali Sentiment Dataset
      type: custom
    metrics:
    - type: f1
      value: {card_macro_f1:.4f}
      name: Macro F1
---

# Nepali Sentiment Analysis (ONNX)

This model is a fine-tuned BERT model for Nepali sentiment analysis, exported to ONNX format for optimized inference.

## Model Details

- **Base Model**: Shushant/nepaliBERT
- **Task**: Sentiment Classification (3-class)
- **Labels**:
  - 0: Negative
  - 1: Positive
  - 2: Neutral
- **Format**: ONNX (optimized for fast inference)

## Usage

### Installation

```bash
pip install transformers optimum[onnxruntime]
```

### Inference

```python
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification
import torch

# Load model and tokenizer
model = ORTModelForSequenceClassification.from_pretrained("mohit4519/nepali-sentiment")
tokenizer = AutoTokenizer.from_pretrained("{config.MODEL_NAME}")

# Predict sentiment
text = "यो धेरै राम्रो छ"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
prediction = torch.argmax(outputs.logits, dim=-1).item()

sentiment_map = {{0: 'Negative', 1: 'Positive', 2: 'Neutral'}}
print(f"Sentiment: {{sentiment_map[prediction]}}")
```

## Performance

- **Macro F1 Score**: {card_macro_f1:.4f}
- **Accuracy**: {card_accuracy:.4f}

## Training Data

Trained on Nepali sentiment dataset containing social media text, reviews, and comments.

## Limitations

- Best performance on Nepali text
- May have reduced accuracy on code-mixed or transliterated text
- Performance varies across different domains
"""

In [None]:
# Save the model card as README.md inside the export folder.
readme_path = Path('tmp_model') / "README.md"
with readme_path.open("w", encoding="utf-8") as readme_file:
    readme_file.write(model_card)

# Upload to the Hugging Face Hub

In [None]:
from huggingface_hub import HfApi

# Create the API client; without this, `api` is undefined and the upload call
# raises NameError.
# NOTE(review): presumably relies on a token stored by a prior Hugging Face
# login or the HF_TOKEN environment variable — confirm how auth is provided.
api = HfApi()

# Make sure the target repository exists (no-op when it already does).
api.create_repo(
    repo_id="mohit4519/nepali-sentiment",
    repo_type="model",
    exist_ok=True,
)

# Upload everything in tmp_model (weights, ONNX graph, README) to the repo.
api.upload_folder(
    folder_path="tmp_model",
    repo_id="mohit4519/nepali-sentiment",
    repo_type="model",
)