In [1]:
!pip install datasets transformers[torch] accelerate -q
!pip install spacy
!pip install https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0.tar.gz

import torch
import pandas as pd
import numpy as np
import re
import spacy
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.metrics import accuracy_score, f1_score

# Ensure GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

Collecting https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0.tar.gz (11.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Using device: cuda
Tesla T4


In [2]:
print("\n--- Loading SentiTaglish dataset ---")
# Load the dataset from Hugging Face
dataset = load_dataset("ccosme/SentiTaglishProductsAndServices", split="train")
df = dataset.to_pandas()
df.rename(columns={'review': 'text'}, inplace=True)

# --- Heuristic Labeling for "Fake vs. Genuine" ---
# We define a rule: "Fake" reviews are often short and have extreme
# sentiment (1-star or 4-star), while "Genuine" reviews are more moderate or detailed.
# Target: 0 = Genuine, 1 = Fake. Initialize to -1 (Abstain)
df['label'] = -1

# HEURISTIC 1: Short (<50 chars) & Extreme Sentiment (1 or 4) -> FAKE (1)
df.loc[(df['sentiment'].isin([1, 4])) & (df['text'].str.len() < 50), 'label'] = 1

# HEURISTIC 2: Detailed Complaint (Long >100 chars & Negative) -> GENUINE (0)
df.loc[(df['sentiment'] == 1) & (df['text'].str.len() > 100), 'label'] = 0

# HEURISTIC 3: Moderate Sentiment (2 or 3) -> GENUINE (0)
df.loc[df['sentiment'].isin([2, 3]), 'label'] = 0

# Create the final, filtered DataFrame (this is our 'final_df')
final_df = df[df['label'] != -1].copy()
final_df['label'] = final_df['label'].astype(int)

print(f"\nSuccessfully created 'final_df' with {len(final_df)} labeled reviews.")
print("New label distribution (0=Genuine, 1=Fake):")
print(final_df['label'].value_counts())


--- Loading SentiTaglish dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

SentiTaglish_ProductsAndServices.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/10510 [00:00<?, ? examples/s]


Successfully created 'final_df' with 6482 labeled reviews.
New label distribution (0=Genuine, 1=Fake):
label
0    6126
1     356
Name: count, dtype: int64


In [3]:
print("\n--- Executing NER & Lexical Feature Extraction ---")
# Load the stable, pre-installed multilingual spaCy model
try:
    nlp = spacy.load("xx_ent_wiki_sm")
    print("Successfully loaded multilingual spaCy model (xx_ent_wiki_sm).")
except Exception as e:
    print(f"FATAL ERROR loading spaCy model: {e}")

# Define the feature extraction function (with NaN-safe fix)
def extract_review_features(text):
    if not isinstance(text, str):
        return 0, 0, 0 # Handle empty/invalid rows

    doc = nlp(text)

    # 1. NER Proxy: Specificity (Counts numbers and dates)
    cardinal_count = sum(1 for ent in doc.ents if ent.label_ in ['CARDINAL', 'DATE'])

    # 2. Lexical Cues: Authenticity (First-Person Pronouns)
    pronoun_pattern = re.compile(r'\b(ako|ko|akin|I|my|mine)\b', re.IGNORECASE)
    pronoun_count = len(pronoun_pattern.findall(text))

    # 3. Lexical Cues: Hyperbole (Common in fake reviews)
    hype_pattern = re.compile(r'\b(super|sobrang|grabe|best ever|highly recommended|perfect)\b', re.IGNORECASE)
    hype_count = len(hype_pattern.findall(text))

    return cardinal_count, pronoun_count, hype_count

# Apply the function to create new columns in our DataFrame
# We use the stable .apply() method (axis=1)
final_df[['cardinal_count', 'pronoun_count', 'hype_count']] = final_df.apply(
    lambda row: extract_review_features(row['text']),
    axis=1,
    result_type='expand'
)
print("NER/Lexical features successfully added to 'final_df'.")


--- Executing NER & Lexical Feature Extraction ---
Successfully loaded multilingual spaCy model (xx_ent_wiki_sm).
NER/Lexical features successfully added to 'final_df'.


In [4]:
print("\n--- Setting up Text Classification ---")
# 1. Convert Pandas DataFrame back to Hugging Face Dataset
labeled_dataset = Dataset.from_pandas(final_df)

# 2. Create Train/Test Splits
labeled_datasets = labeled_dataset.train_test_split(test_size=0.2, seed=42)

# 3. Initialize Tokenizer and Model (Correctly!)
# We use the Auto* classes, which are flexible and recommended.
MODEL_NAME = "jcblaise/bert-tagalog-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 4. Tokenize the dataset
def tokenize_function(examples):
    # Use 'text' column, not 'review' or 'sentence'
    return tokenizer(examples["text"], truncation=True, padding="max_length")

tokenized_train = labeled_datasets["train"].map(tokenize_function, batched=True)
tokenized_eval = labeled_datasets["test"].map(tokenize_function, batched=True)

# 5. Rename 'label' to 'labels' (required by Trainer) and set format
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_eval = tokenized_eval.rename_column("label", "labels")
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_eval.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 6. Load the Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2  # 2 labels: 0 (Genuine) and 1 (Fake)
).to(device)

print(f"Model Loaded: {MODEL_NAME} for 2-label (Fake/Genuine) classification.")

# 7. Define Metrics (Fixed)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    # Use "weighted" F1-score for imbalanced classes
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# 8. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    learning_rate=2e-5,  # Default for M1
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch", # Use "epoch"
    save_strategy="epoch",       # Use "epoch"
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available(),
    report_to="none", # Disable wandb
)

# 9. Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


--- Setting up Text Classification ---


tokenizer_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5185 [00:00<?, ? examples/s]

Map:   0%|          | 0/1297 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jcblaise/bert-tagalog-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: jcblaise/bert-tagalog-base-uncased for 2-label (Fake/Genuine) classification.


  trainer = Trainer(


In [5]:
print("\n--- Starting Fine-Tuning ---")
trainer.train()

print("\n--- Final Evaluation Results ---")
eval_results = trainer.evaluate()
print(eval_results)

# Save the best model
trainer.save_model("./fake_review_model_best")
print("Best model saved to './fake_review_model_best'")


# --- INFERENCE (ADAPTED FOR YOUR PROJECT) ---
print("\n--- Running Inference on New Data ---")

# 1. Create a prediction pipeline using the *model* still in memory
# (This is fast and easy)
sentiment_analyzer = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# 2. Define new text data
new_data = [
    "grabe super ganda highly recommended",
    "ok lang. mabilis naman dumating pero mali yung kulay na pinadala ni seller. 3 stars.",
    "This is the worst product I have ever bought. It broke after one day.",
    "excellent product! 5 stars!",
    "super ganda! love it!"
]

# 3. Run Inference
results = sentiment_analyzer(new_data)

# The model will output LABEL_0 (Genuine) and LABEL_1 (Fake)
for text, result in zip(new_data, results):
    # Relabel the output for clarity
    label = "FAKE" if result['label'] == 'LABEL_1' else "GENUINE"

    print(f"Text: \"{text}\"")
    print(f"Prediction: {label} (Score: {result['score']:.4f})\n")



--- Starting Fine-Tuning ---


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.078,0.096232,0.969931,0.967552
2,0.0396,0.11785,0.971473,0.969216
3,0.009,0.110873,0.972244,0.972244



--- Final Evaluation Results ---


{'eval_loss': 0.1108732670545578, 'eval_accuracy': 0.9722436391673092, 'eval_f1': 0.9722436391673092, 'eval_runtime': 9.1837, 'eval_samples_per_second': 141.228, 'eval_steps_per_second': 8.929, 'epoch': 3.0}


Device set to use cuda:0


Best model saved to './fake_review_model_best'

--- Running Inference on New Data ---
Text: "grabe super ganda highly recommended"
Prediction: GENUINE (Score: 0.9996)

Text: "ok lang. mabilis naman dumating pero mali yung kulay na pinadala ni seller. 3 stars."
Prediction: GENUINE (Score: 0.9997)

Text: "This is the worst product I have ever bought. It broke after one day."
Prediction: GENUINE (Score: 0.9999)

Text: "excellent product! 5 stars!"
Prediction: GENUINE (Score: 0.9817)

Text: "super ganda! love it!"
Prediction: GENUINE (Score: 0.9994)



In [7]:
!pip install optuna

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    iris = load_iris()
    score = cross_val_score(clf, iris.data, iris.target, cv=3).mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
print(study.best_params)



[I 2025-11-20 16:31:49,822] A new study created in memory with name: no-name-5d338ad5-f6d7-483f-90eb-d9ddf39c9f68
[I 2025-11-20 16:31:51,109] Trial 0 finished with value: 0.96 and parameters: {'n_estimators': 102, 'max_depth': 11}. Best is trial 0 with value: 0.96.
[I 2025-11-20 16:31:53,271] Trial 1 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 216, 'max_depth': 8}. Best is trial 1 with value: 0.9666666666666667.
[I 2025-11-20 16:31:54,905] Trial 2 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 92, 'max_depth': 4}. Best is trial 1 with value: 0.9666666666666667.
[I 2025-11-20 16:31:57,253] Trial 3 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 290, 'max_depth': 6}. Best is trial 1 with value: 0.9666666666666667.
[I 2025-11-20 16:31:58,098] Trial 4 finished with value: 0.9666666666666667 and parameters: {'n_estimators': 106, 'max_depth': 11}. Best is trial 1 with value: 0.9666666666666667.
[I 2025-11-20 16:3

{'n_estimators': 216, 'max_depth': 8}
