# Telugu Text Classification

This notebook demonstrates a text classification pipeline for Telugu text using TensorFlow/Keras and Hugging Face Transformers. The dataset is expected to be a CSV file with a `text` column holding the Telugu sentence and a `label` column holding its category (for example "positive" or "negative").

Steps included:
1. Data loading and preprocessing
2. Tokenization using Hugging Face tokenizer
3. Creating TensorFlow datasets
4. Model creation with Transformers + Keras
5. Training with callbacks and saving the model

Note: Paths and dataset specifics may need to be adjusted depending on your environment.

In [1]:
import json
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Progress marker: reaching this line means every import above succeeded.
print("Libraries imported")

In [2]:
# Seed Python's `random`, NumPy and TensorFlow with one shared value so that
# repeated runs draw the same random numbers.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

print("Seed set")

In [3]:
# Load dataset (update path as needed)
data_path = "telugu_dataset.csv"
REQUIRED_COLUMNS = ['text', 'label']

if os.path.exists(data_path):
    raw_df = pd.read_csv(data_path)
else:
    # Example dataframe structure if file not present. Say so explicitly:
    # silently training on three toy rows is easy to miss.
    print(f"'{data_path}' not found - falling back to a 3-row example dataframe")
    raw_df = pd.DataFrame({
        'text': ["నేను సంతోషంగా ఉన్నాను", "నాకు బాధగా ఉంది", "ఇది అద్భుతమైన వనరు"],
        'label': ["positive", "negative", "positive"]
    })

# Fail early with a readable message instead of a KeyError several cells later.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in raw_df.columns]
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {missing_columns}")

# Rows without a text or a label can be neither tokenized nor encoded, so drop
# them. Only afterwards cast the text to str (so NaN never becomes the literal
# "nan"); the cast covers cells that pandas parsed as numbers.
df = (
    raw_df
    .dropna(subset=REQUIRED_COLUMNS)
    .assign(text=lambda frame: frame['text'].astype(str))
    .reset_index(drop=True)
)

df.head()

In [4]:
# Encode labels
# Sort the distinct labels before numbering them so the label -> id mapping
# depends only on the set of labels, not on the row order of the dataset.
# (With order-of-appearance numbering, reshuffling the CSV silently changes
# which id means which class.)
unique_labels = sorted(df['label'].dropna().unique())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

df['label_id'] = df['label'].map(label2id)

# A missing label would map to NaN and only surface later as an obscure
# training error; stop here with a clear message instead.
if df['label_id'].isna().any():
    raise ValueError("Some rows have a missing label; drop or fix them before encoding")

df.head()

In [5]:
# Train-test split
VAL_FRACTION = 0.2

# A stratified split needs at least two rows per class and at least one row
# per class on each side of the split. That does not hold for very small
# datasets (e.g. the 3-row example frame), where train_test_split raises a
# ValueError. In that case fall back to a plain random split.
class_counts = df['label_id'].value_counts()
n_val = int(np.ceil(VAL_FRACTION * len(df)))
n_train = len(df) - n_val
can_stratify = bool(
    len(class_counts) > 0
    and class_counts.min() >= 2
    and min(n_val, n_train) >= len(class_counts)
)
if not can_stratify:
    print("Dataset too small for a stratified split - using a plain random split")

train_df, val_df = train_test_split(
    df,
    test_size=VAL_FRACTION,
    random_state=SEED,
    stratify=df['label_id'] if can_stratify else None,
)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

In [6]:
# Checkpoint name and sequence length used for both the tokenizer and the encoder.
MODEL_NAME = "xlm-roberta-base"  # or a Telugu-specific model if available
MAX_LEN = 128

# Load the tokenizer and the TensorFlow encoder from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = TFAutoModel.from_pretrained(MODEL_NAME)

print("Tokenizer and model loaded")

In [7]:
def encode_texts(texts, tokenizer, max_len=128):
    """Tokenize a batch of texts into fixed-length id and mask arrays.

    Args:
        texts: The texts to encode. May be a pandas Series / NumPy array
            (anything with ``.tolist()``), a plain list or tuple of strings,
            or a single string (treated as a batch of one).
        tokenizer: Callable invoked as
            ``tokenizer(list_of_str, truncation=True, padding='max_length',
            max_length=max_len)`` that returns a mapping with the keys
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sequence is padded / truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of NumPy arrays built from the
        tokenizer output.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters; wrap it instead.
        text_list = [texts]
    elif hasattr(texts, "tolist"):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    encodings = tokenizer(text_list, truncation=True, padding='max_length', max_length=max_len)
    return np.array(encodings['input_ids']), np.array(encodings['attention_mask'])

# Tokenize the text column of each split to MAX_LEN-long id / mask arrays.
train_input_ids, train_attention_mask = encode_texts(train_df['text'], tokenizer, max_len=MAX_LEN)
val_input_ids, val_attention_mask = encode_texts(val_df['text'], tokenizer, max_len=MAX_LEN)

# Integer class ids, in the same row order as the encoded texts.
train_labels = train_df['label_id'].to_numpy()
val_labels = val_df['label_id'].to_numpy()

print("Data encoded")

In [8]:
BATCH_SIZE = 8

# Each element is ((input_ids, attention_mask), label).
train_dataset = tf.data.Dataset.from_tensor_slices(((train_input_ids, train_attention_mask), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices(((val_input_ids, val_attention_mask), val_labels))

# A shuffle buffer smaller than the dataset only mixes examples within a
# sliding window, so rows that start far apart never meet in a batch. The
# arrays are already in memory, so a buffer covering the whole training set
# gives a uniform shuffle at no extra cost. The seed makes the order repeatable.
train_dataset = (
    train_dataset
    .shuffle(len(train_labels), seed=SEED)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# Validation data is never shuffled, so evaluation order is stable.
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

print("Datasets created")

In [9]:
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model

def create_model(base_model, max_len=128, num_labels=2):
    """Build a Keras classifier on top of a pretrained encoder.

    Args:
        base_model: Encoder called as
            ``base_model(input_ids, attention_mask=attention_mask)``. Element 0
            of its output is sliced as ``[:, 0, :]``, i.e. the first position
            of every sequence is used as the sentence representation.
        max_len: Length of the ``input_ids`` and ``attention_mask`` inputs.
        num_labels: Number of units in the final softmax layer.

    Returns:
        An uncompiled ``Model`` taking ``[input_ids, attention_mask]`` and
        returning one softmax probability per label.
    """
    token_ids_in = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    mask_in = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    encoder_out = base_model(token_ids_in, attention_mask=mask_in)
    # First position of the sequence output: the <s> token (or [CLS] equivalent).
    first_token_state = encoder_out[0][:, 0, :]

    # Classification head: dropout -> dense/relu -> dropout -> softmax.
    hidden = Dropout(0.3)(first_token_state)
    hidden = Dense(128, activation="relu")(hidden)
    hidden = Dropout(0.2)(hidden)
    class_probs = Dense(num_labels, activation="softmax")(hidden)

    return Model(inputs=[token_ids_in, mask_in], outputs=class_probs)

# One output unit per distinct label found in the dataset.
model = create_model(base_model, max_len=MAX_LEN, num_labels=len(label2id))

# Labels are integer ids and the model outputs softmax probabilities, hence
# the sparse categorical cross-entropy loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

In [10]:
# Both callbacks watch the same metric, so "best epoch" means the same thing
# for the checkpoint on disk and for early stopping.
MONITOR = 'val_loss'
EPOCHS = 3
# NOTE: with PATIENCE >= EPOCHS early stopping can never trigger. It only
# becomes active once EPOCHS is raised above PATIENCE.
PATIENCE = 3

checkpoint_path = "telugu_text_classifier.h5"
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor=MONITOR,
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
earlystopping = EarlyStopping(monitor=MONITOR, patience=PATIENCE, restore_best_weights=True)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint, earlystopping],
)

# Depending on the Keras version, `restore_best_weights` may only take effect
# when early stopping actually fires. Reload the best checkpoint explicitly so
# that `model` always holds the best epoch's weights before it is saved or
# used for inference.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

In [11]:
model.save("telugu_text_classifier_full")

# The network only outputs class indices. Persist the label <-> id mapping next
# to the model so predictions can still be decoded in a fresh session.
# Keys are cast to str because JSON object keys must be strings (labels read
# from a CSV may be NumPy scalars). ensure_ascii=False keeps non-ASCII labels
# readable in the file.
label_map_path = "telugu_text_classifier_labels.json"
with open(label_map_path, "w", encoding="utf-8") as label_file:
    json.dump({str(label): int(idx) for label, idx in label2id.items()},
              label_file, ensure_ascii=False, indent=2)

print("Model saved")

## Inference example

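The example below reuses the in-memory `model`, `tokenizer`, and `id2label` objects created in the cells above. In a fresh session, reload the saved model (or rebuild it and load the checkpoint weights) first, recreate the same label mapping, and tokenize with the same `MAX_LEN` that was used for training.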

In [12]:
def predict(texts, tokenizer, model, max_len=128):
    """Classify texts and return their label names.

    Args:
        texts: A single string or an iterable of strings to classify.
        tokenizer: Tokenizer forwarded to ``encode_texts``.
        model: Object whose ``predict([input_ids, attention_mask])`` returns
            one row of class scores per input text.
        max_len: Padding / truncation length; should match the value used
            during training.

    Returns:
        List of label names (looked up in the notebook-level ``id2label``
        mapping), one per input text. Empty input yields an empty list.
    """
    if isinstance(texts, str):
        # Treat a bare string as a batch of one rather than iterating its characters.
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to classify; skip the tokenizer and model calls entirely.
        return []

    input_ids, attention_mask = encode_texts(pd.Series(texts), tokenizer, max_len)
    preds = model.predict([input_ids, attention_mask])
    # Highest-scoring class per row, computed in one vectorized call.
    best_ids = np.argmax(preds, axis=-1)
    return [id2label[int(idx)] for idx in best_ids]

# Example: classify two sample sentences with the model trained above.
sample_texts = [
    "నేను చాలా సంతోషంగా ఉన్నాను",
    "ఈ విషయం నన్ను బాధపెడుతుంది",
]
sample_predictions = predict(sample_texts, tokenizer, model, MAX_LEN)
print(sample_predictions)