<a href="https://colab.research.google.com/github/KaranTejwani/deep-learning-practise/blob/main/Sentiment_analysis_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
path = "/content/drive/MyDrive/datasets/sentiment140.csv"

In [4]:
import pandas as pd
cols = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(path, encoding="latin-1", names=cols)

In [5]:
df = df[df['target'].isin([0,4])].copy()
df['target'] = df['target'].map({0:0, 4:1})

In [6]:
print(df['target'].value_counts())

target
0    800000
1    800000
Name: count, dtype: int64


In [7]:
import re, string
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['clean_text'] = df['text'].map(clean_text)

In [7]:
from sklearn.model_selection import train_test_split
X = df['clean_text'].values
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.20, stratify=y_train, random_state=42
)

print("Train / val / Test sizes:", len(X_train), len(X_val), len(X_test))


Train / val / Test sizes: 896000 224000 480000


In [8]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
vectorizer = TextVectorization(
    max_tokens=10000,
    output_mode="multi_hot",
    ngrams=1,
    sparse=True
)
vectorizer.adapt(tf.data.Dataset.from_tensor_slices(X_train).batch(256))

In [9]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(256)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(256)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(256)

In [10]:
train_ds = train_ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [11]:
from tensorflow import keras
from tensorflow.keras import layers

inputs = keras.Input(shape=(10000,), sparse=True, dtype=tf.float32)
# convert sparse to dense (this op is supported and avoids SparseFillEmptyRows in matmul path)
x = layers.Lambda(lambda s: tf.sparse.to_dense(s, default_value=0.0), output_shape=(10000,))(inputs)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [12]:
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [13]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5
)

Epoch 1/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m771s[0m 220ms/step - accuracy: 0.7675 - loss: 0.4904 - val_accuracy: 0.7990 - val_loss: 0.4406
Epoch 2/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.7988 - loss: 0.4411 - val_accuracy: 0.8025 - val_loss: 0.4344
Epoch 3/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6ms/step - accuracy: 0.8039 - loss: 0.4328 - val_accuracy: 0.8041 - val_loss: 0.4320
Epoch 4/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4ms/step - accuracy: 0.8074 - loss: 0.4282 - val_accuracy: 0.8048 - val_loss: 0.4305
Epoch 5/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4ms/step - accuracy: 0.8097 - loss: 0.4243 - val_accuracy: 0.8051 - val_loss: 0.4297


In [14]:
print(f"Test acc: {model.evaluate(test_ds)[1]:.3f}")

[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 48ms/step
0.8059125


In [15]:
X2 = df['clean_text'].values
y2 = df['target'].values

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.30, stratify=y2, random_state=42
)

X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_train2, y_train2, test_size=0.20, stratify=y_train2, random_state=42
)


In [16]:
vectorizer2 = TextVectorization(
    max_tokens=10000,
    output_mode="multi_hot",
    ngrams=2,
    sparse=True
)

vectorizer2.adapt(tf.data.Dataset.from_tensor_slices(X_train2).batch(256))

In [17]:
train_ds2 = tf.data.Dataset.from_tensor_slices((X_train2, y_train2)).batch(256)
val_ds2 = tf.data.Dataset.from_tensor_slices((X_val2, y_val2)).batch(256)
test_ds2 = tf.data.Dataset.from_tensor_slices((X_test2, y_test2)).batch(256)

In [18]:
train_ds2 = train_ds2.map(lambda x, y: (vectorizer2(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_ds2 = val_ds2.map(lambda x, y: (vectorizer2(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_ds2 = test_ds2.map(lambda x, y: (vectorizer2(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [19]:
model2 = keras.Model(inputs=inputs, outputs=outputs)
model2.summary()

In [20]:
model2.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [21]:
model2.fit(
    train_ds2,
    validation_data=val_ds2,
    epochs=5
)

Epoch 1/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m994s[0m 284ms/step - accuracy: 0.6997 - loss: 0.5680 - val_accuracy: 0.7978 - val_loss: 0.4440
Epoch 2/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.7951 - loss: 0.4472 - val_accuracy: 0.8029 - val_loss: 0.4361
Epoch 3/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5ms/step - accuracy: 0.8030 - loss: 0.4339 - val_accuracy: 0.8047 - val_loss: 0.4322
Epoch 4/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 5ms/step - accuracy: 0.8070 - loss: 0.4276 - val_accuracy: 0.8053 - val_loss: 0.4309
Epoch 5/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - accuracy: 0.8100 - loss: 0.4227 - val_accuracy: 0.8063 - val_loss: 0.4292


<keras.src.callbacks.history.History at 0x7dd4300e0560>

In [22]:
print(f"Test acc: {model2.evaluate(test_ds2)[1]:.3f}")

[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 60ms/step
0.8068854166666667


In [23]:
X3 = df['clean_text'].values
y3 = df['target'].values

X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, y3, test_size=0.30, stratify=y3, random_state=42
)

X_train3, X_val3, y_train3, y_val3 = train_test_split(
    X_train3, y_train3, test_size=0.20, stratify=y_train3, random_state=42
)


In [24]:
vectorizer3 = TextVectorization(
    max_tokens=10000,
    output_mode="tf_idf",
    ngrams=2,
)

vectorizer3.adapt(tf.data.Dataset.from_tensor_slices(X_train3).batch(256))

In [25]:
train_ds3 = tf.data.Dataset.from_tensor_slices((X_train3, y_train3)).batch(256)
val_ds3 = tf.data.Dataset.from_tensor_slices((X_val3, y_val3)).batch(256)
test_ds3 = tf.data.Dataset.from_tensor_slices((X_test3, y_test3)).batch(256)

In [26]:
train_ds3 = train_ds3.map(lambda x, y: (vectorizer3(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_ds3 = val_ds3.map(lambda x, y: (vectorizer3(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_ds3 = test_ds3.map(lambda x, y: (vectorizer3(x), y), num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [27]:
from tensorflow import keras
from tensorflow.keras import layers

inputs = keras.Input(shape=(10000,), dtype=tf.float32)
x = layers.Dense(64, activation="relu")(inputs)
x = layers.Dropout(0.5)(x)
x = layers.Dense(32, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model3 = keras.Model(inputs=inputs, outputs=outputs)
model3.summary()

In [28]:
model3.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [None]:
model3.fit(
    train_ds3,
    validation_data=val_ds3,
    epochs=5
)

Epoch 1/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 19ms/step - accuracy: 0.7670 - loss: 0.4906 - val_accuracy: 0.8039 - val_loss: 0.4351
Epoch 2/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 20ms/step - accuracy: 0.8022 - loss: 0.4352 - val_accuracy: 0.8058 - val_loss: 0.4311
Epoch 3/5
[1m   9/3500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m37:13[0m 640ms/step - accuracy: 0.7825 - loss: 0.4707

In [None]:
print(f"Test acc: {model3.evaluate(test_ds3)[1]:.3f}")

In [1]:
max_features = 10000
max_len = 140
embedding_dim = 100

In [None]:
cols = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(path, encoding="latin-1", names=cols)

df = df[df['target'].isin([0,4])].copy()
df['target'] = df['target'].map({0:0, 4:1})

df = df.sample(n=100000, random_state=42).reset_index(drop=True)
print("Sampled dataset size:", len(df))
print(df['target'].value_counts())

In [None]:
df['clean_text'] = df['text'].map(clean_text)

In [9]:
from sklearn.model_selection import train_test_split
from collections import Counter
X4 = df['clean_text'].values
y4 = df['target'].values

X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X4, y4, test_size=0.30, stratify=y4, random_state=42
)

X_train4, X_val4, y_train4, y_val4 = train_test_split(
    X_train4, y_train4, test_size=0.20, stratify=y_train4, random_state=42
)

print("Train / val / Test sizes:", len(X_train4), len(X_val4), len(X_test4))


print("y_train4 distribution:", Counter(y_train4))
print("y_val4 distribution:", Counter(y_val4))
print("y_test4 distribution:", Counter(y_test4))


Train / val / Test sizes: 896000 224000 480000
y_train4 distribution: Counter({np.int64(1): 448000, np.int64(0): 448000})
y_val4 distribution: Counter({np.int64(1): 112000, np.int64(0): 112000})
y_test4 distribution: Counter({np.int64(0): 240000, np.int64(1): 240000})


In [11]:
import tensorflow as tf
train_ds4 = tf.data.Dataset.from_tensor_slices((X_train4, y_train4)).batch(256)
val_ds4 = tf.data.Dataset.from_tensor_slices((X_val4, y_val4)).batch(256)
test_ds4 = tf.data.Dataset.from_tensor_slices((X_test4, y_test4)).batch(256)

In [12]:
from tensorflow.keras.layers import TextVectorization
simple_vectorizer = TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=max_len,
)

In [14]:
simple_vectorizer.adapt(tf.data.Dataset.from_tensor_slices(X_train4).batch(256))

In [15]:
train_ds4 = train_ds4.map(lambda x, y: (simple_vectorizer(x), y), num_parallel_calls=4)
val_ds4 = val_ds4.map(lambda x, y: (simple_vectorizer(x), y), num_parallel_calls=4)
test_ds4 = test_ds4.map(lambda x, y: (simple_vectorizer(x), y), num_parallel_calls=4)

In [18]:
from tensorflow import keras
from tensorflow.keras import layers

def get_lstm_model(max_tokens=max_features, embedding_dim=embedding_dim):
    inputs = keras.Input(shape=(max_len,), dtype="int64")
    x = layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim, input_length=max_len)(inputs)
    x = layers.LSTM(32)(x)
    x = layers.Dense(32, activation="relu")(x)
    x = layers.Dense(16, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [19]:
model4 = get_lstm_model()
model4.summary()
model4.fit(train_ds4.cache(), validation_data=val_ds4.cache(), epochs=5)



Epoch 1/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 14ms/step - accuracy: 0.5014 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 2/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 12ms/step - accuracy: 0.5006 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 3/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.5006 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 4/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 12ms/step - accuracy: 0.5006 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 5/5
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 14ms/step - accuracy: 0.5006 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6932


<keras.src.callbacks.history.History at 0x7e4648594920>

In [20]:
print(f"Test acc: {model4.evaluate(test_ds4)[1]:.3f}")

[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.4997 - loss: 0.6932
Test acc: 0.500


In [21]:
glove_path = "/content/drive/MyDrive/embeddings/glove.6B.100d.txt"
glove_tw_path = "/content/drive/MyDrive/embeddings/glove.twitter.27B.100d.txt"
fasttext_path = "/content/drive/MyDrive/embeddings/wiki-news-300d-1M.vec"

In [22]:
import numpy as np
embeddings_index = {}
with open(glove_path) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [23]:
print(embeddings_index["politics"])

[-0.54286    0.45469    0.64719   -0.22052    0.091599   0.81324
 -0.3554    -0.92615    0.06243    0.36073   -0.49732    0.043933
  0.23045    0.69455   -0.57958   -0.5164    -0.035935  -0.11529
 -0.035968  -0.2969     0.29382    0.41963    0.87894   -1.0599
 -0.60182    0.063635   0.13589   -0.77977    0.56723   -0.16312
 -0.16661    1.0178    -0.19692   -0.31214   -0.66151    0.055427
 -0.26475   -0.13708   -0.8516     0.14803   -1.0331    -0.66836
 -0.33211   -0.34915   -1.1274    -0.82394    0.36979    0.22956
 -0.064556  -0.84839    0.22205   -0.0028985  0.12552    1.145
  0.036419  -2.0007     0.2888     0.2233     1.1626     0.51087
 -0.46021    0.28644   -0.35541   -0.66242    1.2545     0.40071
  0.61735   -0.2475     0.4744     0.58048    0.078492  -0.011721
 -0.17681    0.41396   -0.76944   -0.56667   -0.15648    0.047751
 -0.86167   -0.24345   -0.26801    0.42682    0.41777   -0.013506
 -0.59554    0.12376   -0.95499    0.10357   -0.14985   -1.5794
  0.22291    0.13318   -

In [24]:
vocabulary = simple_vectorizer.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [25]:
embedding_layer = layers.Embedding(
    max_features,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [26]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.LSTM(32)(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model6 = keras.Model(inputs, outputs)
model6.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model6.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras",
                                    save_best_only=True)
]
model6.fit(train_ds4, validation_data=val_ds4, epochs=10, callbacks=callbacks)
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test acc: {model.evaluate(test_ds4)[1]:.3f}")

Epoch 1/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 10ms/step - accuracy: 0.7133 - loss: 0.5559 - val_accuracy: 0.7765 - val_loss: 0.4708
Epoch 2/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - accuracy: 0.7754 - loss: 0.4748 - val_accuracy: 0.7906 - val_loss: 0.4475
Epoch 3/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 10ms/step - accuracy: 0.7856 - loss: 0.4575 - val_accuracy: 0.7966 - val_loss: 0.4369
Epoch 4/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 9ms/step - accuracy: 0.7914 - loss: 0.4483 - val_accuracy: 0.8001 - val_loss: 0.4318
Epoch 5/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 9ms/step - accuracy: 0.7952 - loss: 0.4421 - val_accuracy: 0.8032 - val_loss: 0.4264
Epoch 6/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 9ms/step - accuracy: 0.7983 - loss: 0.4370 - val_accuracy: 0.8056 - val_loss: 0.4230
Epoch 7/

In [28]:
import numpy as np
embeddings_index = {}
with open(glove_tw_path) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 1193514 word vectors.


In [30]:
print(embeddings_index["politics"])

[ 3.9138e-01 -5.7569e-01 -4.0371e-02  8.9331e-02 -1.5830e-01  5.8045e-01
 -4.6133e-02 -1.9539e-01  5.2693e-01  7.0310e-02  1.1452e-01 -1.1493e+00
 -3.3141e+00 -8.1970e-02  6.0928e-01 -5.4354e-01 -1.0348e+00 -5.5647e-01
 -9.1685e-02  5.3608e-01  4.8422e-01 -1.6884e-02 -1.4106e-01  6.4015e-01
 -1.9088e-01  6.4131e-01  2.9661e-01  1.8009e-01  5.9771e-01  2.4903e-01
  4.1498e-01 -1.9357e-01 -4.2837e-01  6.5782e-01  3.2177e-01 -6.8327e-01
 -4.2335e-02  1.2098e+00  4.0060e-01 -5.2182e-01  3.5069e-01  1.1047e-01
  5.6895e-01 -6.4784e-01  4.8661e-01  1.2065e-01 -1.1636e-01  9.0751e-01
 -3.9169e-01 -2.9615e-01 -2.4831e-01 -1.0978e-01  1.7916e-01  1.3615e-01
 -4.5893e-01  5.4925e-04 -5.0649e-02 -9.4077e-01 -5.9727e-01 -9.4947e-01
 -1.4206e-01 -1.1460e-02 -1.7034e-01  3.1252e-02 -2.3492e-01  3.3504e-02
  2.4929e-01  9.6771e-01  3.1744e-01  2.0787e-01 -7.2851e-01 -1.9214e-01
  2.4217e-01 -6.7458e-02  3.9778e-01  2.5059e-01  1.8463e-01  6.5781e-01
 -3.3154e-01 -2.0674e-01  4.3531e-01  3.8255e-01  1

In [29]:
vocabulary = simple_vectorizer.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [31]:
embedding_layer = layers.Embedding(
    max_features,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [32]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.LSTM(32)(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model7 = keras.Model(inputs, outputs)
model7.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model7.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("glove_tw_embeddings_sequence_model.keras",
                                    save_best_only=True)
]
model7.fit(train_ds4, validation_data=val_ds4, epochs=10, callbacks=callbacks)
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test acc: {model.evaluate(test_ds4)[1]:.3f}")

Epoch 1/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.7483 - loss: 0.5107 - val_accuracy: 0.7955 - val_loss: 0.4379
Epoch 2/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 11ms/step - accuracy: 0.7930 - loss: 0.4452 - val_accuracy: 0.8044 - val_loss: 0.4226
Epoch 3/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 11ms/step - accuracy: 0.8008 - loss: 0.4325 - val_accuracy: 0.8082 - val_loss: 0.4154
Epoch 4/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - accuracy: 0.8050 - loss: 0.4250 - val_accuracy: 0.8110 - val_loss: 0.4105
Epoch 5/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - accuracy: 0.8080 - loss: 0.4203 - val_accuracy: 0.8130 - val_loss: 0.4076
Epoch 6/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 10ms/step - accuracy: 0.8102 - loss: 0.4167 - val_accuracy: 0.8148 - val_loss: 0.4052
Epoch 

In [41]:
import numpy as np
embeddings_index = {}
with open(fasttext_path, encoding="utf-8", errors="ignore") as f:
    next(f)
    for line in f:
        try:
            values = line.rstrip().split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
        except ValueError:
            # Skip lines that don't have the expected format
            continue

print(f"Found {len(embeddings_index)} word vectors.")

Found 3400609 word vectors.


In [42]:
vocabulary = simple_vectorizer.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

In [45]:
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        # Add a check for the correct shape of the embedding vector
        if embedding_vector is not None and embedding_vector.shape == (embedding_dim,):
            embedding_matrix[i] = embedding_vector

In [46]:
embedding_layer = layers.Embedding(
    input_dim=max_features,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [47]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.LSTM(32)(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model_ft = keras.Model(inputs, outputs)
model_ft.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model_ft.summary()

# -------------------------------
# 6. Train with callbacks
# -------------------------------
callbacks = [
    keras.callbacks.ModelCheckpoint("fasttext_sequence_model.keras", save_best_only=True)
]

model_ft.fit(train_ds4, validation_data=val_ds4, epochs=10, callbacks=callbacks)
model = keras.models.load_model("fasttext_sequence_model.keras")
print(f"Test acc: {model.evaluate(test_ds4)[1]:.3f}")

Epoch 1/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 9ms/step - accuracy: 0.5132 - loss: 0.6925 - val_accuracy: 0.5208 - val_loss: 0.6916
Epoch 2/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - accuracy: 0.5180 - loss: 0.6920 - val_accuracy: 0.5208 - val_loss: 0.6916
Epoch 3/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - accuracy: 0.5179 - loss: 0.6918 - val_accuracy: 0.5202 - val_loss: 0.6915
Epoch 4/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 9ms/step - accuracy: 0.5183 - loss: 0.6917 - val_accuracy: 0.5208 - val_loss: 0.6915
Epoch 5/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 9ms/step - accuracy: 0.5181 - loss: 0.6917 - val_accuracy: 0.5208 - val_loss: 0.6915
Epoch 6/10
[1m3500/3500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 9ms/step - accuracy: 0.5187 - loss: 0.6916 - val_accuracy: 0.5202 - val_loss: 0.6915
Epoch 7/10