In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        print("Memory growth setting failed:", e)
else:
    print("No GPU detected — using CPU (slower).")

GPU memory growth enabled.


In [None]:
!pip install -q scikit-learn


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, shuffle
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [None]:
data_path = "/content/drive/MyDrive/Assamese/Assamese_Sentiments_70k.csv"
data = pd.read_csv(data_path)

# Validate dataset columns
assert 'Assamese Text' in data.columns and 'Sentiments' in data.columns, \
    "Dataset must have 'Assamese Text' and 'Sentiments' columns."

In [None]:
def clean_text(text):
    text = re.sub('[^\u0980-\u09ff\s]', '', str(text))  # Keep Assamese chars
    text = re.sub('\s+', ' ', text).strip()             # Remove extra spaces
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)          # Reduce repeated chars
    text = ' '.join([w for w in text.split() if len(w) > 1])  # Remove single letters
    return text

data['Assamese Text'] = data['Assamese Text'].apply(clean_text)
data = data[data['Assamese Text'].str.strip() != '']

In [None]:
max_count = data['Sentiments'].value_counts().max()

balanced_data = pd.concat([
    resample(group, replace=True, n_samples=max_count, random_state=42)
    for _, group in data.groupby('Sentiments')
]).reset_index(drop=True)

balanced_data = shuffle(balanced_data, random_state=42)

In [None]:
train, test = train_test_split(
    balanced_data,
    test_size=0.2,
    random_state=42,
    stratify=balanced_data['Sentiments']
)

In [None]:
max_features = 30000  # vocabulary size
lengths = train['Assamese Text'].apply(lambda x: len(x.split()))
maxlen = int(lengths.quantile(0.95))  # 95% length coverage

tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train['Assamese Text'].values)

X_train = pad_sequences(tokenizer.texts_to_sequences(train['Assamese Text']), maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(test['Assamese Text']), maxlen=maxlen)

In [None]:
label_order = ['Negative', 'Positive', 'Neutral']
Y_train = pd.get_dummies(train['Sentiments'])[label_order].values
Y_test = pd.get_dummies(test['Sentiments'])[label_order].values

# Convert to float32
X_train = np.array(X_train, dtype=np.float32)
Y_train = np.array(Y_train, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)
Y_test = np.array(Y_test, dtype=np.float32)

# Safety check
assert X_train.shape[0] > 0 and Y_train.shape[0] > 0, "Training data is empty!"
assert X_test.shape[0] > 0 and Y_test.shape[0] > 0, "Testing data is empty!"

print(f"Train shapes: X={X_train.shape}, Y={Y_train.shape}")
print(f"Test shapes: X={X_test.shape}, Y={Y_test.shape}")

Train shapes: X=(103824, 18), Y=(103824, 3)
Test shapes: X=(25956, 18), Y=(25956, 3)


In [None]:
embed_dim = 200

model = Sequential([
    Embedding(input_dim=max_features, output_dim=embed_dim, input_length=maxlen),
    SpatialDropout1D(0.4),
    Bidirectional(LSTM(256, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)),
    Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.4)),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()



In [None]:
checkpoint_path = "/content/drive/MyDrive/Assamese_Sentiment/best_model_optimized.keras"

checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

early_stop = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)

history = model.fit(
    X_train, Y_train,
    epochs=45,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stop, checkpoint],
    verbose=1
)


Epoch 1/45
[1m649/649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 235ms/step - accuracy: 0.3556 - loss: 1.0899
Epoch 1: val_accuracy improved from -inf to 0.51360, saving model to /content/drive/MyDrive/Assamese_Sentiment/best_model_optimized.keras
[1m649/649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 258ms/step - accuracy: 0.3557 - loss: 1.0899 - val_accuracy: 0.5136 - val_loss: 0.9453
Epoch 2/45
[1m649/649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step - accuracy: 0.5713 - loss: 0.8668
Epoch 2: val_accuracy improved from 0.51360 to 0.64286, saving model to /content/drive/MyDrive/Assamese_Sentiment/best_model_optimized.keras
[1m649/649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 253ms/step - accuracy: 0.5714 - loss: 0.8668 - val_accuracy: 0.6429 - val_loss: 0.7647
Epoch 3/45
[1m649/649[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 240ms/step - accuracy: 0.7156 - loss: 0.6375
Epoch 3: val_accuracy improved from 0.64286 t

In [None]:
Y_pred = np.argmax(model.predict(X_test, batch_size=128), axis=-1)
Y_true = np.argmax(Y_test, axis=-1)

print("\nConfusion Matrix:\n", confusion_matrix(Y_true, Y_pred))
print("\nClassification Report:\n", classification_report(Y_true, Y_pred, target_names=label_order, digits=4))

[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 48ms/step

Confusion Matrix:
 [[7721  205  726]
 [ 812 5476 2364]
 [1191 1604 5857]]

Classification Report:
               precision    recall  f1-score   support

    Negative     0.7940    0.8924    0.8403      8652
    Positive     0.7517    0.6329    0.6872      8652
     Neutral     0.6546    0.6770    0.6656      8652

    accuracy                         0.7341     25956
   macro avg     0.7334    0.7341    0.7310     25956
weighted avg     0.7334    0.7341    0.7310     25956

