In [None]:
pip install pandas numpy scikit-learn tensorflow gensim transformers datasets


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import pandas
import numpy
import sklearn
import tensorflow
import gensim
import transformers
import datasets

# Smoke test: this line is only reached when every import above succeeded,
# because a failing import raises and stops the cell first.
print("All libraries imported successfully!")


All libraries imported successfully!


In [None]:
"""
Baseline Telugu News Classification using Word2Vec + ML/DL Models
Models: MLP, CNN, LSTM, BiLSTM
Dataset: train and test CSVs located at TRAIN_PATH / TEST_PATH (see Config below)
"""

import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, Dense, LSTM, Bidirectional, Conv1D,
                                     GlobalMaxPooling1D, Dropout, Flatten)
from gensim.models import Word2Vec

# ------------------ Config ------------------
# Directory holding the two CSV files. Defaults to Colab's /content so existing runs
# behave exactly as before; set the TELUGU_NEWS_DATA_DIR environment variable to run
# the notebook elsewhere without editing it.
DATA_DIR = os.environ.get("TELUGU_NEWS_DATA_DIR", "/content")
TRAIN_PATH = os.path.join(DATA_DIR, "train_telugu_news.csv")
TEST_PATH = os.path.join(DATA_DIR, "test_telugu_news.csv")
MAX_VOCAB = 20000    # tokenizer vocabulary cap and number of rows in the embedding table
MAX_LEN = 200        # every sequence is padded/truncated to this many token ids
EMBEDDING_DIM = 100  # Word2Vec vector size and Embedding output dimension
EPOCHS = 5           # training epochs per model
BATCH_SIZE = 32      # mini-batch size passed to model.fit
# --------------------------------------------

# 1) Read Dataset and Combine Text
# Both splits are parsed with pandas' default CSV options, train first and then test.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

def merge_text(row):
    """Combine a row's heading and body into one "<heading> - <body>" string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'heading' and 'body' keys.

    Returns:
        The two fields joined by " - ". A missing field (None / NaN / pd.NA)
        contributes an empty string, so the literal words "nan" or "None"
        never leak into the text that is later tokenised.
    """
    parts = []
    for column in ("heading", "body"):
        value = row[column]
        # pandas represents empty CSV cells as NaN; str(NaN) would yield "nan".
        parts.append("" if pd.isna(value) else str(value))
    return f"{parts[0]} - {parts[1]}"

# Row-wise apply: merge_text receives one row at a time and its result becomes
# the new "text" column of the same frame.
train_df["text"] = train_df.apply(merge_text, axis="columns")
test_df["text"] = test_df.apply(merge_text, axis="columns")

# Encode labels
# The encoder learns the topic -> integer mapping from the training split only;
# the test split is then mapped with that same fitted encoder.
le = LabelEncoder().fit(train_df["topic"])
train_df["label"] = le.transform(train_df["topic"])
test_df["label"] = le.transform(test_df["topic"])

# One softmax output unit is created per class seen in training.
NUM_CLASSES = len(le.classes_)
print("Classes:", list(le.classes_))

# 2) Tokenization and Padding
# The vocabulary is fitted on the training text only; num_words caps the ids that
# texts_to_sequences emits and "<OOV>" stands in for every other word.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["text"])

# Training split: text -> id sequences -> fixed-length matrix, plus integer targets.
train_seq = tokenizer.texts_to_sequences(train_df["text"])
x_train = pad_sequences(train_seq, maxlen=MAX_LEN)
y_train = train_df["label"].values

# Test split, encoded with the same fitted tokenizer.
test_seq = tokenizer.texts_to_sequences(test_df["text"])
x_test = pad_sequences(test_seq, maxlen=MAX_LEN)
y_test = test_df["label"].values

# 3) Train Word2Vec for embeddings
# Split the text exactly the way the fitted Keras tokenizer does (same filter
# characters, lowercasing and separator). The embedding matrix is later filled by
# looking up tokenizer.word_index keys in this model; a plain str.split() keeps
# punctuation attached to words, so those keys would not be found and their rows
# would silently fall back to random vectors.
sentences = [
    text_to_word_sequence(
        text,
        filters=tokenizer.filters,
        lower=tokenizer.lower,
        split=tokenizer.split,
    )
    for text in train_df["text"]
]
# min_count=1 keeps every training word so rare vocabulary entries still get a vector.
w2v_model = Word2Vec(sentences, vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=4)

# Build embedding matrix
# Row k holds the vector for the word whose tokenizer id is k. Rows that are never
# assigned below keep their initial zeros.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((MAX_VOCAB, EMBEDDING_DIM))
for token, row in word_index.items():
    # Ids at or beyond MAX_VOCAB have no row in the table.
    if row >= MAX_VOCAB:
        continue
    # Use the trained vector when Word2Vec knows the word, otherwise a random one.
    embedding_matrix[row] = (
        w2v_model.wv[token]
        if token in w2v_model.wv
        else np.random.normal(scale=0.6, size=(EMBEDDING_DIM,))
    )

# 4) Define Models

def build_mlp():
    """Build the (uncompiled) MLP baseline.

    Layers: frozen Embedding initialised from ``embedding_matrix`` -> Flatten ->
    Dense(128, relu) -> Dropout(0.3) -> Dense(NUM_CLASSES, softmax).

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    mlp = Sequential()
    # trainable=False keeps the Word2Vec-initialised vectors fixed during training.
    mlp.add(Embedding(MAX_VOCAB, EMBEDDING_DIM, weights=[embedding_matrix],
                      input_length=MAX_LEN, trainable=False))
    mlp.add(Flatten())
    mlp.add(Dense(128, activation="relu"))
    mlp.add(Dropout(0.3))
    mlp.add(Dense(NUM_CLASSES, activation="softmax"))
    return mlp

def build_cnn():
    """Build the (uncompiled) 1-D CNN baseline.

    Layers: frozen Embedding initialised from ``embedding_matrix`` ->
    Conv1D(128 filters, kernel size 5, relu) -> GlobalMaxPooling1D ->
    Dense(128, relu) -> Dense(NUM_CLASSES, softmax).

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    cnn = Sequential()
    # trainable=False keeps the Word2Vec-initialised vectors fixed during training.
    cnn.add(Embedding(MAX_VOCAB, EMBEDDING_DIM, weights=[embedding_matrix],
                      input_length=MAX_LEN, trainable=False))
    cnn.add(Conv1D(128, 5, activation="relu"))
    cnn.add(GlobalMaxPooling1D())
    cnn.add(Dense(128, activation="relu"))
    cnn.add(Dense(NUM_CLASSES, activation="softmax"))
    return cnn

def build_lstm():
    """Build the (uncompiled) single-direction LSTM baseline.

    Layers: frozen Embedding initialised from ``embedding_matrix`` -> LSTM(128) ->
    Dense(128, relu) -> Dense(NUM_CLASSES, softmax).

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    lstm_net = Sequential()
    # trainable=False keeps the Word2Vec-initialised vectors fixed during training.
    lstm_net.add(Embedding(MAX_VOCAB, EMBEDDING_DIM, weights=[embedding_matrix],
                           input_length=MAX_LEN, trainable=False))
    lstm_net.add(LSTM(128))
    lstm_net.add(Dense(128, activation="relu"))
    lstm_net.add(Dense(NUM_CLASSES, activation="softmax"))
    return lstm_net

def build_bilstm():
    """Build the (uncompiled) bidirectional LSTM baseline.

    Layers: frozen Embedding initialised from ``embedding_matrix`` ->
    Bidirectional(LSTM(128)) -> Dense(128, relu) -> Dense(NUM_CLASSES, softmax).

    Returns:
        A Keras ``Sequential`` model; the caller is responsible for compiling it.
    """
    bilstm_net = Sequential()
    # trainable=False keeps the Word2Vec-initialised vectors fixed during training.
    bilstm_net.add(Embedding(MAX_VOCAB, EMBEDDING_DIM, weights=[embedding_matrix],
                             input_length=MAX_LEN, trainable=False))
    bilstm_net.add(Bidirectional(LSTM(128)))
    bilstm_net.add(Dense(128, activation="relu"))
    bilstm_net.add(Dense(NUM_CLASSES, activation="softmax"))
    return bilstm_net

# 5) Train and Evaluate Models
# All four architectures are instantiated up front and keyed by their display name.
models = {
    "MLP": build_mlp(),
    "CNN": build_cnn(),
    "LSTM": build_lstm(),
    "BiLSTM": build_bilstm()
}

# Test-set metrics per model, keyed by model name.
results = {}

# Explicit class ids for the report. Without `labels`, classification_report infers
# the classes from y_test / y_pred and raises when that count differs from
# len(target_names), e.g. when a class is absent from both the test labels and
# the predictions.
class_ids = np.arange(NUM_CLASSES)

for name, model in models.items():
    print(f"\nTraining {name} model...")
    # Targets are integer class ids, hence the sparse categorical loss.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    # validation_split holds out a fraction of the training data for per-epoch validation.
    model.fit(x_train, y_train, validation_split=0.1, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

    print(f"Evaluating {name}...")
    # Pick the highest-probability class for each test sample.
    y_pred = np.argmax(model.predict(x_test), axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=0)

    # Store plain Python floats so the summary prints bare numbers instead of
    # NumPy scalar reprs such as np.float64(...).
    results[name] = {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }

    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_, zero_division=0))

# 6) Print Summary Table
# The loop variables are deliberately not called `model`: at notebook scope that
# would rebind the last trained Keras model to a plain string for every later cell.
print("\n\n=== Final Results Summary ===")
for model_name, model_metrics in results.items():
    print(f"{model_name}: {model_metrics}")


Classes: ['business', 'editorial', 'entertainment', 'nation', 'sports']





Training MLP model...
Epoch 1/5
487/487 ━━━━━━━━━━━━━━━━━━━━ 20s 38ms/step - accuracy: 0.7601 - loss: 0.8099 - val_accuracy: 0.8562 - val_loss: 0.4243
Epoch 2/5
487/487 ━━━━━━━━━━━━━━━━━━━━ 17s 31ms/step - accuracy: 0.8905 - loss: 0.3175 - val_accuracy: 0.8707 - val_loss: 0.3918
Epoch 3/5
487/487 ━━━━━━━━━━━━━━━━━━━━ 16s 32ms/step - accuracy: 0.9210 - loss: 0.2253 - val_accuracy: 0.8776 - val_loss: 0.3992
Epoch 4/5
487/487 ━━━━━━━━━━━━━━━━━━━━ 16s 33ms/step - accuracy: 0.9358 - loss: 0.1730 - val_accuracy: 0.8759 - val_loss: 0.4520
Epoch 5/5
487/487 ━━━━━━━━━━━━━━━━━━━━ 15s 32ms/step - accuracy: 0.9495 - loss: 0.1394 - val_accuracy: 0.8764 - val_loss: 0.4157
Evaluating MLP...
136/136 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step

MLP Classification Report:
               precision    recall  f1-sc