In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e:
        print("Memory growth setting failed:", e)
else:
    print("No GPU detected — using CPU (slower).")

GPU memory growth enabled.


In [None]:
!pip install -q scikit-learn


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, shuffle
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [None]:
data_path = "/content/drive/MyDrive/Colab Notebooks/Assamese Sentiments file.csv (1).zip"
data = pd.read_csv(data_path)

# Validate dataset columns
assert 'Assamese Text' in data.columns and 'Sentiments' in data.columns, \
    "Dataset must have 'Assamese Text' and 'Sentiments' columns."

In [None]:
def clean_text(text):
    text = re.sub('[^\u0980-\u09ff\s]', '', str(text))  # Keep Assamese chars
    text = re.sub('\s+', ' ', text).strip()             # Remove extra spaces
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)          # Reduce repeated chars
    text = ' '.join([w for w in text.split() if len(w) > 1])  # Remove single letters
    return text

data['Assamese Text'] = data['Assamese Text'].apply(clean_text)
data = data[data['Assamese Text'].str.strip() != '']

In [None]:
max_count = data['Sentiments'].value_counts().max()

balanced_data = pd.concat([
    resample(group, replace=True, n_samples=max_count, random_state=42)
    for _, group in data.groupby('Sentiments')
]).reset_index(drop=True)

balanced_data = shuffle(balanced_data, random_state=42)

In [None]:
train, test = train_test_split(
    balanced_data,
    test_size=0.2,
    random_state=42,
    stratify=balanced_data['Sentiments']
)

In [None]:
max_features = 30000  # vocabulary size
lengths = train['Assamese Text'].apply(lambda x: len(x.split()))
maxlen = int(lengths.quantile(0.95))  # 95% length coverage

tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train['Assamese Text'].values)

X_train = pad_sequences(tokenizer.texts_to_sequences(train['Assamese Text']), maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(test['Assamese Text']), maxlen=maxlen)

In [None]:
label_order = ['Negative', 'Positive', 'Neutral']
Y_train = pd.get_dummies(train['Sentiments'])[label_order].values
Y_test = pd.get_dummies(test['Sentiments'])[label_order].values

# Convert to float32
X_train = np.array(X_train, dtype=np.float32)
Y_train = np.array(Y_train, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)
Y_test = np.array(Y_test, dtype=np.float32)

# Safety check
assert X_train.shape[0] > 0 and Y_train.shape[0] > 0, "Training data is empty!"
assert X_test.shape[0] > 0 and Y_test.shape[0] > 0, "Testing data is empty!"

print(f"Train shapes: X={X_train.shape}, Y={Y_train.shape}")
print(f"Test shapes: X={X_test.shape}, Y={Y_test.shape}")

Train shapes: X=(179352, 18), Y=(179352, 3)
Test shapes: X=(44838, 18), Y=(44838, 3)


In [None]:
embed_dim = 200

model = Sequential([
    Embedding(input_dim=max_features, output_dim=embed_dim, input_length=maxlen),
    SpatialDropout1D(0.4),
    Bidirectional(LSTM(256, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)),
    Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.4)),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()



In [None]:
model = Sequential([
    Embedding(input_dim=max_features, output_dim=128, input_length=maxlen),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(3, activation='softmax')
])

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])

In [None]:
checkpoint_path = "/content/drive/MyDrive/Assamese_Sentiment/best_model.keras"

checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                             monitor='val_accuracy',
                             save_best_only=True,
                             mode='max',
                             verbose=1)

early_stop = EarlyStopping(monitor='val_loss',
                           patience=2000,
                           restore_best_weights=True)

In [None]:
history = model.fit(
    X_train, Y_train,
    epochs=16,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stop, checkpoint],
    verbose=1
)

Epoch 1/16
[1m1120/1121[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.8843 - loss: 0.2574
Epoch 1: val_accuracy did not improve from 0.81403
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 16ms/step - accuracy: 0.8843 - loss: 0.2574 - val_accuracy: 0.7829 - val_loss: 0.7436
Epoch 2/16
[1m1119/1121[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.8906 - loss: 0.2448
Epoch 2: val_accuracy did not improve from 0.81403
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.8906 - loss: 0.2448 - val_accuracy: 0.7925 - val_loss: 0.7536
Epoch 3/16
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8967 - loss: 0.2319
Epoch 3: val_accuracy did not improve from 0.81403
[1m1121/1121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - accuracy: 0.8967 - loss: 0.2319 - val_accuracy: 0.7939 - val_loss: 0.7711
Epoch 4/16
[

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(Y_test, axis=1)

label_names = ['Negative', 'Positive', 'Neutral']

print(classification_report(y_true_labels, y_pred_labels, target_names=label_names))

[1m1402/1402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step
              precision    recall  f1-score   support

    Negative       0.81      0.94      0.87     14946
    Positive       0.76      0.73      0.74     14946
     Neutral       0.75      0.66      0.70     14946

    accuracy                           0.78     44838
   macro avg       0.77      0.78      0.77     44838
weighted avg       0.77      0.78      0.77     44838



In [None]:
# Example test on custom sentences
custom_sentences = [
    "মোৰ আজি দিনটো খুব ভাল গৈছে।",      # Positive

]

# Preprocess
custom_seq = tokenizer.texts_to_sequences(custom_sentences)
custom_pad = pad_sequences(custom_seq, maxlen=maxlen)

# Predict
custom_preds = model.predict(custom_pad)
custom_labels = np.argmax(custom_preds, axis=1)

# Map to sentiment
for i, text in enumerate(custom_sentences):
    sentiment = label_names[custom_labels[i]]
    print(f"🔹 \"{text}\" → Sentiment: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
🔹 "মোৰ আজি দিনটো খুব ভাল গৈছে।" → Sentiment: Neutral
