# Block 1: Environment Setup

In [5]:
import tensorflow as tf
import numpy as np
import json
import os
from google.colab import files
from tqdm import tqdm

#FILES UPLOAD

In [6]:

# Artifacts the rest of the notebook reads from the working directory.
required_files = ['hybrid_embeddings.npy', 'augmented.json', 'vocabulary.json']

# Collect whichever of them are not on disk yet.
missing_files = []
for filename in required_files:
    if not os.path.exists(filename):
        missing_files.append(filename)

# Only open the Colab upload dialog when at least one artifact is absent.
if missing_files:
    uploaded = files.upload()

#LOAD DATA

In [None]:
# Embedding matrix saved as a NumPy array.
embedding_matrix = np.load('hybrid_embeddings.npy')

# The vocabulary file holds several entries; only the word -> index map is used.
with open("vocabulary.json", "r") as vocab_file:
    vocabulary = json.load(vocab_file)
word2idx = vocabulary["word2idx"]

# The examples; later cells read the 'sentence' and 'sentiment' keys of each one.
with open('augmented.json', 'r') as data_file:
    data = json.load(data_file)

#PREPROCESS LABELS

In [7]:
# Shift the raw sentiment values -1 / 0 / 1 onto class ids 0 / 1 / 2.
label_mapping = {-1: 0, 0: 1, 1: 2}

labels = []
for example in data:
    labels.append(label_mapping[example['sentiment']])

# One-hot targets, matching the categorical cross-entropy loss used at compile time.
y = tf.keras.utils.to_categorical(labels, num_classes=3)

#PAD SEQUENCES

In [8]:
# Every sequence is right-padded to the length of the longest sentence.
sentence_lengths = [len(example['sentence']) for example in data]
max_len = max(sentence_lengths)

padded_sequences = []
for example in data:
    # Items missing from the vocabulary map to index 0, the same value used for padding.
    token_ids = [word2idx.get(token, 0) for token in example['sentence']]
    # No truncation is needed here: no sentence is longer than max_len by construction.
    pad_width = max_len - len(token_ids)
    padded_sequences.append(token_ids + [0] * pad_width)

X = np.array(padded_sequences)



#LSTM LAYER

In [9]:
class EfficientLSTM(tf.keras.layers.Layer):
    """Minimal LSTM cell intended to be wrapped by ``tf.keras.layers.RNN``.

    The four gate pre-activations are produced by one fused affine transform
    and then split, in this order: input gate, forget gate, candidate cell
    state, output gate.

    Args:
        units: Width of both the hidden state and the cell state.
        **kwargs: Forwarded unchanged to ``tf.keras.layers.Layer``.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        # Two recurrent states are carried between steps: hidden (h) and cell (c).
        self.state_size = [units, units]
        self.output_size = units

    def build(self, input_shape):
        """Create the fused gate weights; ``input_shape[-1]`` is the feature size."""
        gate_width = self.units * 4  # i, f, candidate and o stacked side by side
        # Weights are named explicitly so they stay distinguishable when the
        # model is inspected or written to a name-keyed weight file.
        self.kernel = self.add_weight(
            name='kernel', shape=(input_shape[-1], gate_width),
            initializer='glorot_uniform')
        self.recurrent_kernel = self.add_weight(
            name='recurrent_kernel', shape=(self.units, gate_width),
            initializer='orthogonal')
        self.bias = self.add_weight(
            name='bias', shape=(gate_width,), initializer='zeros')

    def call(self, inputs, states):
        """Advance the cell by one time step.

        Args:
            inputs: Input for the current step; its last axis must match the
                feature size seen in ``build``.
            states: Pair ``(h_prev, c_prev)`` from the previous step.

        Returns:
            ``(h, [h, c])``: the step output and the updated states.
        """
        h_prev, c_prev = states
        z = tf.matmul(inputs, self.kernel) + tf.matmul(h_prev, self.recurrent_kernel) + self.bias
        i, f, c_candidate, o = tf.split(z, 4, axis=1)
        c = tf.sigmoid(f)*c_prev + tf.sigmoid(i)*tf.tanh(c_candidate)
        h = tf.sigmoid(o) * tf.tanh(c)
        return h, [h, c]

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update({'units': self.units})
        return config

#  Build Model

In [10]:

def create_model():
    """Build and compile the three-class sentiment classifier.

    Reads the module-level ``max_len`` (input length) and ``embedding_matrix``
    (frozen embedding weights). The network is: embedding with zero-masking ->
    RNN over a 64-unit ``EfficientLSTM`` cell -> 3-way softmax.

    Returns:
        A compiled ``tf.keras.Model``.
    """
    vocab_size = embedding_matrix.shape[0]
    embed_dim = embedding_matrix.shape[1]

    token_ids = tf.keras.Input(shape=(max_len,))

    # Embeddings are loaded from the precomputed matrix and kept frozen.
    embedding = tf.keras.layers.Embedding(
        vocab_size, embed_dim,
        weights=[embedding_matrix], mask_zero=True, trainable=False)
    embedded = embedding(token_ids)

    recurrent = tf.keras.layers.RNN(EfficientLSTM(64))
    sentence_state = recurrent(embedded)

    class_probs = tf.keras.layers.Dense(3, activation='softmax')(sentence_state)

    model = tf.keras.Model(inputs=token_ids, outputs=class_probs)
    optimizer = tf.keras.optimizers.Adam(0.001, clipvalue=0.5)
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

#TRAIN AND PREDICT

In [12]:

model = create_model()

# restore_best_weights=True asks Keras to roll the model back to the weights of
# the epoch with the best monitored value (val_loss by default) instead of
# leaving whatever the final epoch produced; predict() below then uses the
# best checkpoint rather than an overfit one.
# NOTE(review): patience=30 with epochs=50 means training can only stop early
# after 30 consecutive epochs without improvement; consider a smaller value.
# NOTE(review): validation_split takes the *last* 20% of X/y before any
# shuffling; confirm the example order in `data` makes that tail representative.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=30, restore_best_weights=True)
history = model.fit(X, y, batch_size=128, epochs=50,
                    validation_split=0.2, callbacks=[early_stopping])


def predict(sentence):
    """Classify one sentence with the trained model.

    Args:
        sentence: Iterable of vocabulary items, looked up one by one in the
            module-level ``word2idx``; unknown items map to index 0.

    Returns:
        ``"Negative"``, ``"Neutral"`` or ``"Positive"``, whichever class has
        the highest predicted probability.
    """
    class_names = ["Negative", "Neutral", "Positive"]

    # Encode, truncate to the training length, then right-pad with zeros.
    token_ids = [word2idx.get(token, 0) for token in sentence][:max_len]
    token_ids = token_ids + [0] * (max_len - len(token_ids))

    batch = np.array([token_ids])  # batch containing a single sequence
    proba = model.predict(batch)[0]
    best_class = np.argmax(proba)
    return class_names[best_class]

Epoch 1/50
[1m863/863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 47ms/step - accuracy: 0.4788 - loss: 1.0258 - val_accuracy: 0.4248 - val_loss: 1.0438
Epoch 2/50
[1m863/863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 43ms/step - accuracy: 0.5106 - loss: 0.9734 - val_accuracy: 0.4489 - val_loss: 1.0849
Epoch 3/50
[1m863/863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 42ms/step - accuracy: 0.5234 - loss: 0.9493 - val_accuracy: 0.4970 - val_loss: 1.0159
Epoch 4/50
[1m863/863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 45ms/step - accuracy: 0.5291 - loss: 0.9359 - val_accuracy: 0.4611 - val_loss: 1.0752
Epoch 5/50
[1m863/863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.5342 - loss: 0.9234 - val_accuracy: 0.4834 - val_loss: 1.0556
Epoch 6/50
[1m863/863[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 44ms/step - accuracy: 0.5369 - loss: 0.9151 - val_accuracy: 0.5044 - val_loss: 1.0522
Epoch 7/50
[1m8

In [6]:

import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Reload the same three artifacts so this cell can run on its own.
embedding_matrix = np.load('hybrid_embeddings.npy')

with open("vocabulary.json", "r") as vocab_file:
    vocabulary = json.load(vocab_file)
word2idx = vocabulary["word2idx"]

with open('augmented.json', 'r') as data_file:
    data = json.load(data_file)

# Map each vocabulary word to its embedding row, skipping indices that fall
# outside the embedding matrix.
n_embedding_rows = embedding_matrix.shape[0]
word_to_vec = {}
for token, row in word2idx.items():
    if row < n_embedding_rows:
        word_to_vec[token] = embedding_matrix[row]

# One feature vector per example: the mean of its known word vectors, or an
# all-zero vector when none of its words are known.
sentence_vectors = []
for example in data:
    known_vecs = [word_to_vec[token] for token in example['sentence'] if token in word_to_vec]
    if known_vecs:
        sentence_vectors.append(np.mean(known_vecs, axis=0))
    else:
        sentence_vectors.append(np.zeros(embedding_matrix.shape[1]))
X = np.array(sentence_vectors)

# Integer class ids (0 = negative, 1 = neutral, 2 = positive) for scikit-learn.
label_mapping = {-1: 0, 0: 1, 1: 2}
class_ids = []
for example in data:
    class_ids.append(label_mapping[example['sentiment']])
y = np.array(class_ids)

# Hold out 20% of the examples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# class_weight='balanced' re-weights the classes by inverse frequency.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)

# Score the forest on the held-out split.
y_pred = rf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1; names follow class ids 0, 1, 2.
class_report = classification_report(
    y_test, y_pred,
    target_names=['Negative (-1)', 'Neutral (0)', 'Positive (1)'])
print(class_report)

# Run the fitted forest on the first example as a smoke test.
sample_sentence = data[0]['sentence']
sample_word_vecs = [word_to_vec[word] for word in sample_sentence if word in word_to_vec]
# np.mean over an empty list yields NaN, which the classifier cannot consume.
# Fall back to a zero vector when no word is known, exactly as the training
# features do.
sample_vec = (np.mean(sample_word_vecs, axis=0) if sample_word_vecs
              else np.zeros(embedding_matrix.shape[1]))
print("\nSample Prediction:", ['Negative', 'Neutral', 'Positive'][rf.predict([sample_vec])[0]])



Accuracy: 0.4830
               precision    recall  f1-score   support

Negative (-1)       0.29      0.12      0.17      6553
  Neutral (0)       0.44      0.60      0.51     11136
 Positive (1)       0.60      0.60      0.60      9926

     accuracy                           0.48     27615
    macro avg       0.44      0.44      0.42     27615
 weighted avg       0.46      0.48      0.46     27615


Sample Prediction: Neutral
