# Leveraging NLP and ML for Sentiment Analysis of Informal Bangla Text
## Dataset Details

### Dataset Characteristics
- **Size**: 3,149 comments
- **Source**: Web-scraped from multiple social media platforms
- **Language**: Informal Bangla text (noisy, with dialects and grammatical errors)
- **Labeling**: Manually labeled by psychology students

### Class Distribution
| Sentiment | Number of Comments | Percentage |
|-----------|-------------------|------------|
| Positive  | 1,197             | 38%        |
| Negative  | 1,213             | 38.5%      |
| Neutral   | 739               | 23.5%      |

### Data Columns
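- **Comment**: Raw Bangla text from social media (the notebook code below reads it from the CSV column named `Text`)
- **Sentiment**: Three classes (Positive, Negative, Neutral) (the notebook code below reads it from the CSV column named `Polarity`)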

## Best Performing Model

### Model Architecture
**LSTM (Long Short-Term Memory) with Hyperparameter Tuning**

### Hyperparameters
| Parameter | Value |
|-----------|-------|
| Embedding Dimension | 150 |
| LSTM Units | 150 |
| Dropout Rate | 0.2 |
| Optimizer | RMSprop |
| Batch Size | 32 |
| Epochs | 10 |

### Performance Results
| Dataset Type | Accuracy |
|--------------|----------|
| **Informal Bangla Text** | **80.3%** |
| Formal Bangla Text | 96.9% |


# Cell 1: Imports

In [None]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pickle


# Cell 2: Load Dataset


In [None]:
# Read the labelled comments into a DataFrame.
# Point the path below at your own copy of the CSV when running elsewhere.
df = pd.read_csv(
    '/kaggle/input/final-dataset/final-dataset.csv'
)

# Quick overview of what was loaded: dimensions and column names first ...
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")

# ... then the per-class row counts of the 'Polarity' column and the first rows.
for heading, summary in (
    ("\nSentiment distribution:", df['Polarity'].value_counts()),
    ("\nSample data:", df.head()),
):
    print(heading)
    print(summary)

# Cell 3: Data Preprocessing - Cleaning


In [None]:
# Zero-width (non-)joiners occur *inside* Bangla words, so they are deleted
# outright instead of being turned into a word break.
_ZERO_WIDTH_JOINERS = re.compile(r'[\u200c\u200d]')
# Runs of anything that is neither in the Bangla Unicode block
# (U+0980-U+09FF) nor whitespace.
_NON_BANGLA = re.compile(r'[^\u0980-\u09FF\s]+')
# Any run of whitespace (spaces, tabs, newlines).
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_bangla_text(text):
    """Reduce *text* to Bangla characters separated by single spaces.

    Args:
        text: The raw comment. Missing values (``None`` / ``NaN``, as
            detected by ``pd.isna``) are accepted; any other value is
            converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` when the input is missing or contains
        no Bangla characters.
    """
    if pd.isna(text):
        return ""

    text = str(text)

    # Drop ZWJ/ZWNJ without inserting a space so the surrounding word stays
    # in one piece.
    text = _ZERO_WIDTH_JOINERS.sub('', text)

    # Replace every other non-Bangla run (punctuation, Latin letters, ASCII
    # digits, emoji, the danda, ...) with a space rather than nothing:
    # deleting it would fuse words that were separated only by punctuation,
    # e.g. "word1,word2" -> "word1word2".
    text = _NON_BANGLA.sub(' ', text)

    # Collapse the whitespace left behind and trim both ends.
    return _WHITESPACE_RUN.sub(' ', text).strip()

# Clean every raw comment and keep the result next to the original text.
df['Text_cleaned'] = df['Text'].apply(clean_bangla_text)

# Drop rows whose text became empty after cleaning.
# .copy() makes the filtered frame an independent object, so adding columns
# to it later does not raise pandas' SettingWithCopyWarning.
df = df[df['Text_cleaned'].str.len() > 0].copy()

print(f"Dataset shape after cleaning: {df.shape}")

# Cell 4: Label Encoding


In [None]:
# Learn the set of sentiment labels present in the 'Polarity' column ...
label_encoder = LabelEncoder().fit(df['Polarity'])

# ... and store the integer id of each row's label in a new column.
df['Polarity_encoded'] = label_encoder.transform(df['Polarity'])

# Show which integer id each original label was assigned.
known_labels = label_encoder.classes_
label_mapping = dict(zip(known_labels, label_encoder.transform(known_labels)))
print("Label mapping:")
for label, encoded in label_mapping.items():
    print(f"{label} -> {encoded}")

# Cell 5: Handle Class Imbalance using SMOTE


In [None]:
# SMOTE interpolates between numeric feature vectors, so the text has to be
# vectorised first.
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features (capped at the 5000 most frequent terms).
# token_pattern is overridden on purpose: the default pattern is built on
# the regex class \w, which does not match Bangla dependent vowel signs or
# the hasanta (they are combining marks), so Bangla words would be cut apart
# or dropped. 'Text_cleaned' is already whitespace-separated, so every run of
# non-space characters is taken as one token.
tfidf_temp = TfidfVectorizer(max_features=5000, token_pattern=r'\S+')
X_tfidf = tfidf_temp.fit_transform(df['Text_cleaned']).toarray()
y = df['Polarity_encoded'].values

# Oversample the minority classes with synthetic samples (fixed seed).
# NOTE(review): this runs on the full dataset, before any train/test split,
# and the LSTM cells visible in this file do not appear to use
# X_resampled / y_resampled. If these arrays are ever used for training,
# fit SMOTE on the training split only to avoid leaking test data - confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

print(f"Original dataset shape: {X_tfidf.shape}")
print(f"Resampled dataset shape: {X_resampled.shape}")
print(f"Class distribution after SMOTE: {Counter(y_resampled)}")

# Cell 6: Prepare data for LSTM


In [None]:
# The LSTM pipeline is fed the cleaned text rows of `df` directly, so the
# split is made on the original (non-resampled) data.
split_options = {'test_size': 0.2, 'random_state': 42}
all_texts = df['Text_cleaned'].values
all_labels = df['Polarity_encoded'].values

# Hold out 20% of the rows as the test set, stratified by class.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    stratify=df['Polarity_encoded'],
    **split_options
)

# Carve a validation set (20% of the remaining rows) out of the training
# data, again stratified by class.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_train_text,
    y_train,
    stratify=y_train,
    **split_options
)

print(f"Training set size: {len(X_train_text)}")
print(f"Validation set size: {len(X_val_text)}")
print(f"Test set size: {len(X_test_text)}")

# Cell 7: Tokenization and Padding


In [None]:
# Vocabulary cap and fixed sequence length shared by the tokenizer, the
# padding step and the model's embedding layer.
max_features = 10000  # keep at most this many distinct words
max_length = 100      # every sequence is padded/truncated to this length

# The vocabulary is learned from the training texts only; validation and
# test texts are merely encoded with it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train_text)

# Encode each split as lists of integer word ids.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_text, X_val_text, X_test_text)
)

# Bring every sequence to exactly max_length entries (pad_sequences defaults).
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_seq, X_val_seq, X_test_seq)
)

print(f"Padded training shape: {X_train_pad.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")

# Cell 8: Convert labels to categorical


In [None]:
# One-hot encode the integer labels of each split for the softmax output /
# categorical cross-entropy loss used by the model.
num_classes = 3
y_train_cat, y_val_cat, y_test_cat = (
    to_categorical(encoded_labels, num_classes)
    for encoded_labels in (y_train, y_val, y_test)
)

# Cell 9: Build LSTM Model (with hyperparameters from the paper)


In [None]:
# Model hyperparameters (the values listed in the hyperparameter table of
# the write-up at the top of this notebook).
embedding_dim = 150
lstm_units = 150
dropout_rate = 0.2

# Assemble the network layer by layer:
# word ids -> embeddings -> single LSTM (last state only) -> dropout -> softmax
model = Sequential()
model.add(
    Embedding(
        input_dim=max_features,
        output_dim=embedding_dim,
        input_length=max_length,
    )
)
model.add(LSTM(lstm_units, return_sequences=False))
model.add(Dropout(dropout_rate))
model.add(Dense(num_classes, activation='softmax'))

# RMSprop with its default settings; the loss matches the one-hot labels.
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)

model.summary()

# Cell 10: Train the Model


In [None]:
# Fit the model on the padded training sequences, reporting loss/accuracy on
# the validation split after every epoch. The returned History object is
# kept for plotting.
training_options = dict(batch_size=32, epochs=10, verbose=1)
history = model.fit(
    x=X_train_pad,
    y=y_train_cat,
    validation_data=(X_val_pad, y_val_cat),
    **training_options
)

# Cell 11: Plot Training History


In [None]:
# Draw two side-by-side panels (accuracy on the left, loss on the right),
# each comparing the training curve with the validation curve per epoch.
plt.figure(figsize=(12, 4))

# (train key, validation key, panel title, y-axis label)
panels = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)
for position, (train_key, val_key, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label='Train')
    plt.plot(history.history[val_key], label='Validation')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()

# Cell 12: Evaluate on Test Set


In [None]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(x=X_test_pad, y=y_test_cat)
test_loss, test_accuracy = test_metrics
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# Cell 13: Get Detailed Predictions


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Class probabilities for every test sample; the most probable class wins.
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)

# Translate integer ids back to the original sentiment labels so the reports
# below are readable.
y_test_labels, y_pred_labels = (
    label_encoder.inverse_transform(encoded_ids)
    for encoded_ids in (y_test, y_pred_classes)
)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))

# Counts of true label (rows) versus predicted label (columns).
cm = confusion_matrix(y_test_labels, y_pred_labels)
print("\nConfusion Matrix:")
print(cm)

# Cell 14: Save the Model and Components


In [None]:
# Persist the trained network twice: once as a single HDF5 file ...
model.save('bangla_sentiment_lstm_model.h5')

# ... and once under an extension-less path (intended as SavedModel format).
# NOTE(review): newer Keras releases may reject a path without a `.keras` or
# `.h5` extension - confirm against the installed TensorFlow/Keras version.
model.save('bangla_sentiment_lstm_model')

# Pickle the fitted preprocessing objects that are needed again at
# prediction time: the tokenizer and the label encoder.
for pickle_path, component in (
    ('tokenizer.pickle', tokenizer),
    ('label_encoder.pickle', label_encoder),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(component, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and components saved successfully!")

# Cell 15: Function to Predict New Text


In [None]:
def predict_sentiment(text, model, tokenizer, label_encoder, max_length=100):
    """Classify the sentiment of a single piece of Bangla text.

    The text is cleaned with ``clean_bangla_text``, encoded with
    ``tokenizer``, padded to ``max_length`` and passed to ``model``.

    Args:
        text: Raw input text.
        model: Model whose ``predict`` returns one row of class scores.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        label_encoder: Fitted encoder providing ``classes_`` and
            ``inverse_transform``.
        max_length: Length the encoded sequence is padded/truncated to.

    Returns:
        A dict with the original ``text``, the ``cleaned_text``, the
        ``predicted_sentiment`` label and ``confidence_scores`` mapping each
        known label to its score as a float.
    """
    cleaned_text = clean_bangla_text(text)

    # Encode the single text as a one-row batch of fixed length.
    encoded_batch = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(encoded_batch, maxlen=max_length)

    # Scores for the only row in the batch, and the index of the best one.
    prediction = model.predict(padded)
    confidence_scores = prediction[0]
    predicted_class = np.argmax(prediction, axis=1)[0]

    # Map the winning index back to its sentiment label.
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]

    scores_by_label = {
        label: float(confidence_scores[i])
        for i, label in enumerate(label_encoder.classes_)
    }
    return {
        'text': text,
        'cleaned_text': cleaned_text,
        'predicted_sentiment': predicted_label,
        'confidence_scores': scores_by_label,
    }

# Smoke-test the helper on a placeholder sentence using the trained
# model and the fitted preprocessing objects.
sample_text = "আপনার বাংলা টেক্সট এখানে লিখুন"
result = predict_sentiment(
    text=sample_text,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
)
print(result)