# Deep Unidirectional RNN Model using TF-IDF with unigram+bigram

In [44]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, SimpleRNN
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score, roc_curve

In [45]:
# Load data
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

In [46]:
# TF-IDF with unigram + bigram
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
X_train_tfidf = tfidf.fit_transform(df_train['processed_tweet'])
X_test_tfidf = tfidf.transform(df_test['processed_tweet'])

Creating TF-IDF features...


In [47]:
# Convert to dense arrays
X_train = X_train_tfidf.astype('float32').toarray()
X_test = X_test_tfidf.astype('float32').toarray()

In [48]:
X_train = X_train.reshape((X_train.shape[0], 30, 100))
X_test = X_test.reshape((X_test.shape[0], 30, 100))

In [49]:
# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(df_train['sentiment_label'])
y_test = le.transform(df_test['sentiment_label'])
num_classes = len(le.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [50]:
model = load_model("deep_unirnn_tfidf.h5")



In [51]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score

# Load data
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

# TF-IDF vectorization
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(df_train['processed_tweet'])
X_test_tfidf = tfidf.transform(df_test['processed_tweet'])

# Convert to 3D arrays (30x100 shape)
X_train = X_train_tfidf.astype('float32').toarray().reshape((-1, 30, 100))
X_test = X_test_tfidf.astype('float32').toarray().reshape((-1, 30, 100))

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(df_train['sentiment_label'])
y_test = le.transform(df_test['sentiment_label'])
num_classes = len(le.classes_)

y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5, verbose=1)


Creating TF-IDF features...


In [52]:
print("Training Iteration 1 Model...")

model1 = Sequential([
    Input(shape=(30, 100)),
    SimpleRNN(128, return_sequences=True),
    Dropout(0.3),
    SimpleRNN(64),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

model1.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])

history1 = model1.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Training Iteration 1 Model...
Epoch 1/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 17ms/step - accuracy: 0.5955 - loss: 1.0999 - val_accuracy: 0.6452 - val_loss: 0.9509 - learning_rate: 0.0010
Epoch 2/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 15ms/step - accuracy: 0.6421 - loss: 0.9660 - val_accuracy: 0.6625 - val_loss: 0.9141 - learning_rate: 0.0010
Epoch 3/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 15ms/step - accuracy: 0.6519 - loss: 0.9438 - val_accuracy: 0.6631 - val_loss: 0.9122 - learning_rate: 0.0010
Epoch 4/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 14ms/step - accuracy: 0.6565 - loss: 0.9349 - val_accuracy: 0.6610 - val_loss: 0.9175 - learning_rate: 0.0010
Epoch 5/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 14ms/step - accuracy: 0.6589 - loss: 0.9265 - val_accuracy: 0.6679 - val_loss: 0.9052 - learning_rate: 0.0010
Epoch 6/20
[1m4514/

In [53]:
from sklearn.metrics import f1_score, roc_auc_score, classification_report
import numpy as np

# Predict probabilities and labels using Iteration 1 model
y_pred_prob = model1.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

# AUC (macro)
try:
    auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
except:
    auc = "Error calculating AUC"

# Classification report
report = classification_report(y_true, y_pred, target_names=le.classes_, output_dict=True)

# Assuming 2 classes: le.classes_[0] = Negative, le.classes_[1] = Positive
f1_negative = report[le.classes_[0]]["f1-score"]
f1_positive = report[le.classes_[1]]["f1-score"]

# Output
print(f"AUC = {auc:.4f}" if isinstance(auc, float) else f"AUC = {auc}")
print(f"F1 Score for Positive = {f1_positive:.4f}")
print(f"F1 Score for Negative = {f1_negative:.4f}")


[1m2821/2821[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step
AUC = 0.8711
F1 Score for Positive = 0.3994
F1 Score for Negative = 0.3189


In [56]:
test_accuracy = accuracy_score(y_true, y_pred)
print(f"✅ Accuracy (Test): {test_accuracy:.4f}")


✅ Accuracy (Test): 0.7142


In [54]:
print("Training Iteration 2 Model...")

model2 = Sequential([
    Input(shape=(30, 100)),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    BatchNormalization(),
    LSTM(64),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

model2.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])

# Remove EarlyStopping to ensure full 20 epochs
history2 = model2.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=64,
    callbacks=[reduce_lr],  # Optional: keep ReduceLROnPlateau if helpful
    verbose=1
)


Training Iteration 2 Model...
Epoch 1/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 39ms/step - accuracy: 0.5757 - loss: 1.1730 - val_accuracy: 0.6043 - val_loss: 1.0746 - learning_rate: 0.0010
Epoch 2/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 37ms/step - accuracy: 0.6131 - loss: 1.0556 - val_accuracy: 0.6188 - val_loss: 1.0351 - learning_rate: 0.0010
Epoch 3/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 37ms/step - accuracy: 0.6265 - loss: 1.0215 - val_accuracy: 0.6327 - val_loss: 1.0060 - learning_rate: 0.0010
Epoch 4/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 37ms/step - accuracy: 0.6415 - loss: 0.9869 - val_accuracy: 0.6364 - val_loss: 0.9824 - learning_rate: 0.0010
Epoch 5/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 38ms/step - accuracy: 0.6498 - loss: 0.9651 - val_accuracy: 0.6572 - val_loss: 0.9388 - learning_rate: 0.0010
Epoch 6/20
[1m

In [63]:
from sklearn.metrics import f1_score, roc_auc_score, classification_report
import numpy as np

# Predict probabilities and labels using Iteration 1 model
y_pred_prob = model1.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

# AUC (macro)
try:
    auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
except:
    auc = "Error calculating AUC"

# Classification report
report = classification_report(y_true, y_pred, target_names=le.classes_, output_dict=True)

# Assuming 2 classes: le.classes_[0] = Negative, le.classes_[1] = Positive
f1_negative = report[le.classes_[0]]["f1-score"]
f1_positive = report[le.classes_[1]]["f1-score"]

# Output
print(f"AUC = {auc:.4f}" if isinstance(auc, float) else f"AUC = {auc}")
print(f"F1 Score for Positive = {f1_positive:.4f}")
print(f"F1 Score for Negative = {f1_negative:.4f}")

[1m2821/2821[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 13ms/step
AUC = 0.8538
F1 Score for Positive = 0.3573
F1 Score for Negative = 0.2774


In [59]:
test_accuracy = accuracy_score(y_true, y_pred)
print(f"✅ Accuracy (Test): {test_accuracy:.4f}")

✅ Accuracy (Test): 0.7142


In [57]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

# ======= Load and Prepare Data =======
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

# TF-IDF vectorization (improved)
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(df_train['processed_tweet'])
X_test_tfidf = tfidf.transform(df_test['processed_tweet'])

X_train = X_train_tfidf.astype('float32').toarray().reshape((-1, 50, 100))
X_test = X_test_tfidf.astype('float32').toarray().reshape((-1, 50, 100))

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(df_train['sentiment_label'])
y_test = le.transform(df_test['sentiment_label'])

num_classes = len(le.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# ======= Callbacks =======
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5, verbose=1)

# ======= Build Model =======
print("Training Improved Iteration 1 Model...")

model1 = Sequential([
    Input(shape=(50, 100)),
    LSTM(128, return_sequences=True, recurrent_dropout=0.2),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(64, recurrent_dropout=0.2),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Compile with label smoothing
loss_fn = CategoricalCrossentropy(label_smoothing=0.1)
model1.compile(loss=loss_fn, optimizer=Adam(0.001), metrics=['accuracy'])

# Train
history1 = model1.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Save
model1.save("improved_iteration1_lstm_tfidf.h5")


Training Improved Iteration 1 Model...
Epoch 1/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 66ms/step - accuracy: 0.5264 - loss: 1.3644 - val_accuracy: 0.5926 - val_loss: 1.2120 - learning_rate: 0.0010
Epoch 2/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 67ms/step - accuracy: 0.5900 - loss: 1.2218 - val_accuracy: 0.5789 - val_loss: 1.2251 - learning_rate: 0.0010
Epoch 3/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 69ms/step - accuracy: 0.6008 - loss: 1.1913 - val_accuracy: 0.6159 - val_loss: 1.1709 - learning_rate: 0.0010
Epoch 4/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m298s[0m 66ms/step - accuracy: 0.6136 - loss: 1.1673 - val_accuracy: 0.6289 - val_loss: 1.1371 - learning_rate: 0.0010
Epoch 5/20
[1m4514/4514[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 67ms/step - accuracy: 0.6218 - loss: 1.1517 - val_accuracy: 0.6389 - val_loss: 1.1167 - learning_rate: 0.0010
Epoch 



In [58]:
test_accuracy = accuracy_score(y_true, y_pred)
print(f"✅ Accuracy (Test): {test_accuracy:.4f}")

✅ Accuracy (Test): 0.7142


In [60]:
# =================== IMPORTS ===================
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

# =================== DATA PREPROCESSING ===================
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

# TF-IDF Vectorization (Improved)
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(df_train['processed_tweet'])
X_test_tfidf = tfidf.transform(df_test['processed_tweet'])

# Convert to arrays and reshape to (samples, 50, 100)
X_train = X_train_tfidf.astype('float32').toarray().reshape((-1, 50, 100))
X_test = X_test_tfidf.astype('float32').toarray().reshape((-1, 50, 100))

# Encode labels
le = LabelEncoder()
y_train = le.fit_transform(df_train['sentiment_label'])
y_test = le.transform(df_test['sentiment_label'])

num_classes = len(le.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# =================== CALLBACKS ===================
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5, verbose=1)

# =================== BUILD MODEL ===================
print("Training Iteration 2 Model...")

model2 = Sequential([
    Input(shape=(50, 100)),
    Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.2)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(64, recurrent_dropout=0.2)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Compile with Label Smoothing
loss_fn = CategoricalCrossentropy(label_smoothing=0.1)
model2.compile(optimizer=Adam(0.001), loss=loss_fn, metrics=['accuracy'])

# =================== TRAIN ===================
history2 = model2.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,  # smaller batch size for more updates
    callbacks=[reduce_lr],
    verbose=1
)

Training Iteration 2 Model...
Epoch 1/20
[1m9027/9027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m662s[0m 72ms/step - accuracy: 0.5757 - loss: 1.2559 - val_accuracy: 0.6136 - val_loss: 1.1674 - learning_rate: 0.0010
Epoch 2/20
[1m9027/9027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m648s[0m 72ms/step - accuracy: 0.6119 - loss: 1.1732 - val_accuracy: 0.6306 - val_loss: 1.1302 - learning_rate: 0.0010
Epoch 3/20
[1m9027/9027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m705s[0m 78ms/step - accuracy: 0.6270 - loss: 1.1441 - val_accuracy: 0.6468 - val_loss: 1.1041 - learning_rate: 0.0010
Epoch 4/20
[1m9027/9027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m744s[0m 78ms/step - accuracy: 0.6367 - loss: 1.1264 - val_accuracy: 0.6510 - val_loss: 1.1098 - learning_rate: 0.0010
Epoch 5/20
[1m9027/9027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.6441 - loss: 1.1151
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m902

In [61]:
test_accuracy = accuracy_score(y_true, y_pred)
print(f"✅ Accuracy (Test): {test_accuracy:.4f}")

✅ Accuracy (Test): 0.7142
