In [1]:
pip install transformers scikit-learn pandas torch 

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary packages
import os
from sklearn.model_selection import train_test_split
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define a function to load and preprocess the dataset
def preprocess_dataset(file_path):
  
    df = pd.read_csv(file_path)

    print(f"Processing file: {file_path}")
    print("Columns in the DataFrame:", df.columns)

    sentiment_column_name = 'sentiment'
    if sentiment_column_name not in df.columns:
        print(f"Column '{sentiment_column_name}' not found in {file_path}.")
        return None 

    # Map sentiment labels using the defined mapping
    standard_label_mapping = {
        2: "positive",
        "Positive": "positive",
        "positive": "positive",
        "good": "positive",
        "excellent": "positive",
        "neutral": "neutral",
        "Neutral": "neutral",
        "okay": "neutral",
        "negative": "negative",
        1: "negative",
        "Negative": "negative",
        "bad": "negative",
        "poor": "negative"
    }


    df['sentiment'] = df[sentiment_column_name].map(standard_label_mapping)
    df['sentiment'] = df['sentiment'].fillna("neutral")  


    label_mapping = {
        "negative": 0,
        "neutral": 1,
        "positive": 2
    }
    df['label'] = df['sentiment'].map(label_mapping)

    # Return the processed dataframe
    return df



In [5]:
# Load and preprocess the data
file_paths = ["data/large_sentiment_analysis_feedback_tagalog_3k.csv"]
processed_datasets = [preprocess_dataset(fp) for fp in file_paths]
combined_df = pd.concat(processed_datasets, ignore_index=True)

print("Combined DataFrame:")
print(combined_df.head())

Processing file: data/large_sentiment_analysis_feedback_tagalog_3k.csv
Columns in the DataFrame: Index(['text', 'sentiment'], dtype='object')
Combined DataFrame:
                                                text sentiment  label
0           Masarap ang pagkain, worth it ang price!  positive      2
1  Mabagal mag-reply sa inquiries, nakaka-frustrate.  negative      0
2  Sobrang mabilis ang response sa inquiries, sol...  positive      2
3  Satisfied ako sa product, talagang sulit ang b...  positive      2
4  Ang saya ng buong experience ko dito, sobrang ...  positive      2


In [6]:
output_dir = './data_test2'
train_path = os.path.join(output_dir, 'train')
val_path = os.path.join(output_dir, 'val')
test_path = os.path.join(output_dir, 'test')

In [7]:
os.makedirs(train_path, exist_ok=True)
os.makedirs(val_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

In [8]:
# Splitting the data into train, validation, and test sets
train_df, temp_df = train_test_split(combined_df, test_size=0.3, random_state=42, stratify=combined_df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

In [9]:
# Save each split as a CSV file in its respective directory
train_df.to_csv(os.path.join(train_path, 'train.csv'), index=False)
val_df.to_csv(os.path.join(val_path, 'val.csv'), index=False)
test_df.to_csv(os.path.join(test_path, 'test.csv'), index=False)

In [10]:
# Load your fine-tuned model and tokenizer (no need to reload the base Twitter model)
tokenizer = RobertaTokenizer.from_pretrained('./model2/taglishV1')
model = RobertaForSequenceClassification.from_pretrained('./model2/taglishV1')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [11]:
# Tokenization function
def tokenize_data(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

In [12]:
# Tokenize and create datasets
def create_dataset(dataframe):
    encodings = tokenize_data(dataframe['text'].tolist())
    labels = torch.tensor(dataframe['label'].values)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)

train_dataset = create_dataset(train_df)
val_dataset = create_dataset(val_df)
test_dataset = create_dataset(test_df)

In [13]:
# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [14]:
# Training function
def train_model(model, train_loader, val_loader, num_epochs=3):
    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_epochs * len(train_loader))
    
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(train_loader):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        # Validation after each epoch
        val_loss, val_accuracy = evaluate_model(model, val_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {total_loss / len(train_loader)}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}")

In [15]:
# Evaluation function
def evaluate_model(model, loader):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct_predictions += (predictions == labels).sum().item()

    accuracy = correct_predictions / len(loader.dataset)
    avg_loss = total_loss / len(loader)
    return avg_loss, accuracy

In [16]:
# Train the model
train_model(model, train_loader, val_loader, num_epochs=5)

100%|██████████| 132/132 [03:22<00:00,  1.54s/it]


Epoch 1/5, Train Loss: 0.02560285464498906, Val Loss: 0.0002739198785877369, Val Accuracy: 1.0


100%|██████████| 132/132 [03:38<00:00,  1.65s/it]


Epoch 2/5, Train Loss: 0.0002098320393556743, Val Loss: 0.0001676132770030406, Val Accuracy: 1.0


100%|██████████| 132/132 [03:52<00:00,  1.76s/it]


Epoch 3/5, Train Loss: 0.00014934609316696879, Val Loss: 0.00013323694092205502, Val Accuracy: 1.0


100%|██████████| 132/132 [03:49<00:00,  1.74s/it]


Epoch 4/5, Train Loss: 0.00012457160956921254, Val Loss: 0.00011678796551583721, Val Accuracy: 1.0


100%|██████████| 132/132 [03:52<00:00,  1.76s/it]


Epoch 5/5, Train Loss: 0.00011368011605968191, Val Loss: 0.0001115963692929403, Val Accuracy: 1.0


In [17]:
# Final Test Evaluation
test_loss, test_accuracy = evaluate_model(model, test_loader)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

Test Loss: 0.00011157860692413845, Test Accuracy: 1.0


In [18]:
# Save model and tokenizer
model.save_pretrained('./model3/taglishV1')
tokenizer.save_pretrained('./model3/taglishV1')

('./model3/taglishV1\\tokenizer_config.json',
 './model3/taglishV1\\special_tokens_map.json',
 './model3/taglishV1\\vocab.json',
 './model3/taglishV1\\merges.txt',
 './model3/taglishV1\\added_tokens.json')