<a href="https://colab.research.google.com/github/KaranTejwani/deep-learning-practise/blob/main/Sentiment_analysis_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
path = "/content/drive/MyDrive/datasets/sentiment140.csv"

In [3]:
import pandas as pd
cols = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(path, encoding="latin-1", names=cols)

In [4]:
df = df[df['target'].isin([0,4])].copy()
df['target'] = df['target'].map({0:0, 4:1})

In [5]:
df = df.sample(n=100000, random_state=42).reset_index(drop=True)
print("Sampled dataset size:", len(df))
print(df['target'].value_counts())

Sampled dataset size: 100000
target
1    50057
0    49943
Name: count, dtype: int64


In [6]:
import re, string
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['clean_text'] = df['text'].map(clean_text)

In [7]:
from sklearn.model_selection import train_test_split
X = df['clean_text'].values
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, stratify=y_train, random_state=42
)

print("Train / val / Test sizes:", len(X_train), len(X_val), len(X_test))


Train / val / Test sizes: 56000 14000 30000


In [8]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
vectorizer = TextVectorization(
    max_tokens=10000,
    output_mode="multi_hot",
    ngrams=1,
    sparse=True
)
vectorizer.adapt(tf.data.Dataset.from_tensor_slices(X_train).batch(256))

In [9]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(256)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(256)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(256)

In [10]:
train_ds = train_ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [11]:
from tensorflow import keras
from tensorflow.keras import layers

inputs = keras.Input(shape=(10000,), sparse=True, dtype=tf.float32)
# convert sparse to dense (this op is supported and avoids SparseFillEmptyRows in matmul path)
x = layers.Lambda(lambda s: tf.sparse.to_dense(s, default_value=0.0), output_shape=(10000,))(inputs)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [12]:
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [13]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10
)

Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 986ms/step - accuracy: 0.6782 - loss: 0.6134 - val_accuracy: 0.7784 - val_loss: 0.4846
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7798 - loss: 0.4786 - val_accuracy: 0.7825 - val_loss: 0.4672
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8014 - loss: 0.4395 - val_accuracy: 0.7851 - val_loss: 0.4673
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8173 - loss: 0.4186 - val_accuracy: 0.7844 - val_loss: 0.4718
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8281 - loss: 0.3999 - val_accuracy: 0.7822 - val_loss: 0.4756
Epoch 6/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8377 - loss: 0.3846 - val_accuracy: 0.7821 - val_loss: 0.4805
Epoch 7/10
[1m219/219[

In [14]:
from sklearn.metrics import accuracy_score

y_prob = model.predict(test_ds)
y_pred = (y_prob.flatten() > 0.5).astype(int)
acc = accuracy_score(y_test, y_pred)
print(acc)

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 151ms/step
0.7787333333333334


In [15]:
X2 = df['clean_text'].values
y2 = df['target'].values

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.30, stratify=y2, random_state=42
)

X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_train2, y_train2, test_size=0.20, stratify=y_train2, random_state=42
)


In [16]:
vectorizer2 = TextVectorization(
    max_tokens=10000,
    output_mode="multi_hot",
    ngrams=2,
    sparse=True
)

vectorizer2.adapt(tf.data.Dataset.from_tensor_slices(X_train2).batch(256))

In [17]:
train_ds2 = tf.data.Dataset.from_tensor_slices((X_train2, y_train2)).batch(256)
val_ds2 = tf.data.Dataset.from_tensor_slices((X_val2, y_val2)).batch(256)
test_ds2 = tf.data.Dataset.from_tensor_slices((X_test2, y_test2)).batch(256)

In [18]:
train_ds2 = train_ds2.map(lambda x, y: (vectorizer2(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_ds2 = val_ds2.map(lambda x, y: (vectorizer2(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_ds2 = test_ds2.map(lambda x, y: (vectorizer2(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [19]:
model2 = keras.Model(inputs=inputs, outputs=outputs)
model2.summary()

In [20]:
model2.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [21]:
model2.fit(
    train_ds2,
    validation_data=val_ds2,
    epochs=10
)

Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 1s/step - accuracy: 0.5932 - loss: 0.8178 - val_accuracy: 0.7379 - val_loss: 0.5336
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7386 - loss: 0.5228 - val_accuracy: 0.7710 - val_loss: 0.4864
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7810 - loss: 0.4651 - val_accuracy: 0.7816 - val_loss: 0.4713
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8063 - loss: 0.4285 - val_accuracy: 0.7818 - val_loss: 0.4700
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8203 - loss: 0.4036 - val_accuracy: 0.7846 - val_loss: 0.4711
Epoch 6/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8344 - loss: 0.3837 - val_accuracy: 0.7836 - val_loss: 0.4753
Epoch 7/10
[1m219/219[0m 

<keras.src.callbacks.history.History at 0x7973ac5423f0>

In [22]:
y_prob2 = model2.predict(test_ds2)
y_pred2 = (y_prob2.flatten() > 0.5).astype(int)
acc2 = accuracy_score(y_test2, y_pred2)
print(acc2)

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 157ms/step
0.7791333333333333


In [23]:
X3 = df['clean_text'].values
y3 = df['target'].values

X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, y3, test_size=0.30, stratify=y3, random_state=42
)

X_train3, X_val3, y_train3, y_val3 = train_test_split(
    X_train3, y_train3, test_size=0.20, stratify=y_train3, random_state=42
)


In [24]:
vectorizer3 = TextVectorization(
    max_tokens=10000,
    output_mode="tf_idf",
    ngrams=2,
)

vectorizer3.adapt(tf.data.Dataset.from_tensor_slices(X_train3).batch(256))

In [25]:
train_ds3 = tf.data.Dataset.from_tensor_slices((X_train3, y_train3)).batch(256)
val_ds3 = tf.data.Dataset.from_tensor_slices((X_val3, y_val3)).batch(256)
test_ds3 = tf.data.Dataset.from_tensor_slices((X_test3, y_test3)).batch(256)

In [26]:
train_ds3 = train_ds3.map(lambda x, y: (vectorizer3(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_ds3 = val_ds3.map(lambda x, y: (vectorizer3(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_ds3 = test_ds3.map(lambda x, y: (vectorizer3(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [27]:
from tensorflow import keras
from tensorflow.keras import layers

inputs = keras.Input(shape=(10000,), dtype=tf.float32)
x = layers.Dense(64, activation="relu")(inputs)
x = layers.Dropout(0.5)(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model3 = keras.Model(inputs=inputs, outputs=outputs)
model3.summary()

In [28]:
model3.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [29]:
model3.fit(
    train_ds3,
    validation_data=val_ds3,
    epochs=10
)

Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step - accuracy: 0.6693 - loss: 0.6116 - val_accuracy: 0.7839 - val_loss: 0.4791
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.7966 - loss: 0.4569 - val_accuracy: 0.7877 - val_loss: 0.4723
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.8217 - loss: 0.4122 - val_accuracy: 0.7864 - val_loss: 0.4771
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.8360 - loss: 0.3846 - val_accuracy: 0.7869 - val_loss: 0.4826
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.8457 - loss: 0.3644 - val_accuracy: 0.7866 - val_loss: 0.4872
Epoch 6/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8583 - loss: 0.3391 - val_accuracy: 0.7819 - val_loss: 0.4933
Epoch 7/10
[1m219/219

<keras.src.callbacks.history.History at 0x7973ac1dc320>

In [30]:
y_prob3 = model3.predict(test_ds3)
y_pred3 = (y_prob3.flatten() > 0.5).astype(int)
acc3 = accuracy_score(y_test3, y_pred3)
print(acc3)

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step
0.7760333333333334
