# TaglishProcessor

# This notebook implements a Taglish-safe preprocessing class using TensorFlow Tokenizer
# and scikit-learn for splitting data. It preserves punctuation and Tagalog stopwords.


In [None]:
## 1) Install & import libraries

# This cell only imports the dependencies; it does not install them.
# If an import fails, install them first, e.g. `pip install -r requirements.txt` (see project README).

import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [None]:
class TaglishProcessor:
    """Clean, tokenize and split Taglish (Tagalog-English) text for a classifier.

    The processor lowercases text, strips HTML tags, and normalizes whitespace,
    while leaving punctuation and stopwords in place. Tokenization uses the
    Keras ``Tokenizer``; sequences are post-padded/post-truncated to
    ``max_length``.

    Args:
        vocab_size: Passed to the Keras ``Tokenizer`` as ``num_words``.
        max_length: Length every sequence is padded or truncated to.
        oov_token: Token substituted for out-of-vocabulary words.
        filters: Characters the tokenizer removes before splitting on spaces.
            The Keras default strips most punctuation, which contradicts the
            "keep punctuation" goal of this class, so only tab and newline are
            filtered by default. Punctuation therefore stays attached to its
            word (e.g. ``"ganda!"``, ``"mag-aral"``). Pass the Keras default
            string to restore punctuation stripping.
    """

    # Matches things that look like real markup (tags, closing tags,
    # declarations, comments). A bare "<" followed by a non-letter, such as the
    # "<3" emoticon or a comparison like "5 < 6", is intentionally NOT matched.
    _HTML_TAG_RE = re.compile(r'<(?:!--.*?--|[/!]?[a-z][^<>]*)>', re.DOTALL)
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, vocab_size=5000, max_length=100, oov_token='<OOV>', filters='\t\n'):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.oov_token = oov_token
        self.filters = filters
        self.tokenizer = None  # set by fit_tokenizer()

    @staticmethod
    def clean_text(text):
        """Return a lowercased, tag-free, whitespace-normalized copy of *text*.

        Missing values (``None``, NaN, ``pd.NA``) become ``""``. Other
        non-string values are converted with ``str()`` first. Punctuation and
        stopwords are kept.
        """
        if not isinstance(text, str):
            # Only scalars can be "missing"; pd.isna on a list/array would
            # return an array and make the truth test ambiguous.
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ""
            text = str(text)
        # Lowercase first so the tag pattern only has to handle [a-z].
        text = text.lower()
        # Replace tags with a space so the words around them do not get glued.
        text = TaglishProcessor._HTML_TAG_RE.sub(' ', text)
        # Collapse runs of whitespace (including tabs/newlines) to one space.
        return TaglishProcessor._WHITESPACE_RE.sub(' ', text).strip()

    def fit_tokenizer(self, texts):
        """Fit a new Keras ``Tokenizer`` on *texts* and return it.

        The texts are used as given; run them through ``clean_text`` first if
        they are raw. Any previously fitted tokenizer is replaced.
        """
        self.tokenizer = Tokenizer(
            num_words=self.vocab_size,
            oov_token=self.oov_token,
            filters=self.filters,
        )
        self.tokenizer.fit_on_texts(texts)
        return self.tokenizer

    def texts_to_padded_sequences(self, texts):
        """Convert *texts* to integer sequences padded/truncated to ``max_length``.

        Raises:
            ValueError: If no tokenizer has been fitted yet.
        """
        if self.tokenizer is None:
            raise ValueError('Tokenizer not fitted. Call fit_tokenizer first or use load_and_prep_data.')
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.max_length, padding='post', truncating='post')

    def load_and_prep_data(self, filepath, text_col='review_text', label_col='label', test_size=0.15, val_size=0.15, random_state=42):
        """Load a CSV, clean the text, and return stratified train/val/test splits.

        The data is split first and the tokenizer is fitted on the training
        texts only, so validation/test vocabulary does not leak into training.

        Args:
            filepath: CSV path (anything ``pd.read_csv`` accepts).
            text_col: Name of the column holding the raw text.
            label_col: Name of the column holding integer-convertible labels.
            test_size: Fraction of all rows used for the test set.
            val_size: Fraction of all rows used for the validation set.
            random_state: Seed forwarded to both ``train_test_split`` calls.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test, tokenizer)``
            where the ``X_*`` values are padded sequence arrays.

        Raises:
            ValueError: If the split fractions are not each in (0, 1) or do not
                leave room for a training set.
        """
        if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
            raise ValueError(
                'test_size and val_size must each be in (0, 1) and sum to less than 1; '
                'got test_size={!r}, val_size={!r}'.format(test_size, val_size)
            )

        df = pd.read_csv(filepath)

        # Map clean_text over the raw column. Casting with astype(str) first
        # would turn missing reviews into the literal string "nan", which would
        # then be tokenized as a real word.
        texts = df[text_col].map(self.clean_text).tolist()
        labels = df[label_col].astype(int).values

        # First split off the test set.
        texts_rest, texts_test, y_rest, y_test = train_test_split(
            texts, labels, test_size=test_size, random_state=random_state, stratify=labels
        )
        # val_size is a fraction of the whole dataset; rescale it to the
        # remainder left after removing the test set.
        val_relative = val_size / (1 - test_size)
        texts_train, texts_val, y_train, y_val = train_test_split(
            texts_rest, y_rest, test_size=val_relative, random_state=random_state, stratify=y_rest
        )

        # Fit on training texts only, then encode every split with that vocabulary.
        self.fit_tokenizer(texts_train)
        X_train = self.texts_to_padded_sequences(texts_train)
        X_val = self.texts_to_padded_sequences(texts_val)
        X_test = self.texts_to_padded_sequences(texts_test)

        return X_train, X_val, X_test, y_train, y_val, y_test, self.tokenizer


In [None]:
# Example usage (uncomment and replace 'data/taglish_reviews.csv' with the path to your CSV to run):
# processor = TaglishProcessor(vocab_size=5000, max_length=100)
# X_train, X_val, X_test, y_train, y_val, y_test, tokenizer = processor.load_and_prep_data('data/taglish_reviews.csv')
# print('Shapes:', X_train.shape, X_val.shape, X_test.shape)
