In [30]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

In [31]:
import pandas as pd
import numpy as np
import keras
import matplotlib.pyplot as plt
keras.utils.set_random_seed(42)
from sklearn.model_selection import train_test_split

In [32]:
train_df = pd.read_csv("nlp-getting-started/train.csv")
test_df = pd.read_csv("nlp-getting-started/test.csv")
print("train_df size: ", train_df.shape)
print("test_df size: ", test_df.shape)
print(train_df.head())
# handling missing data 
train_df['text'] = train_df['text'].fillna('')
test_df['text'] = test_df['text'].fillna('')

train_df size:  (7613, 5)
test_df size:  (3263, 4)
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [33]:
# split for validation data set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].values, 
    train_df['target'].values,
    test_size = 0.2,  # 0.2 but can change
    random_state = 42,
    stratify=train_df['target'].values  # keeps class distribution balanced
)

In [34]:
# generic NLP model mostly following class's colabs
# bag of words
max_tokens = 5000
text_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="multi_hot"
)

text_vectorization.adapt(train_texts)

# vectorize tweets
X_train = text_vectorization(train_texts)
X_val = text_vectorization(val_texts)
X_test = text_vectorization(test_df['text'].values)
# defining targets
y_train = train_labels
y_val = val_labels

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (6090, 5000)
X_val shape: (1523, 5000)
X_test shape: (3263, 5000)


In [35]:
# build model
inputs = keras.layers.Input(shape=(max_tokens,))
x = keras.layers.Dense(8, activation="relu")(inputs)
# since binary classification, can use sigmoid 
outputs = keras.layers.Dense(1, activation="sigmoid")(x) 

model = keras.Model(inputs, outputs)
model.summary()

model.compile(
    optimizer="adam",
    loss="binary_crossentropy", # since two classes 
    metrics=["accuracy"]
)

history = model.fit(
    x = X_train, 
    y = y_train,
    validation_data = (X_val, y_val),
    epochs = 10,
    batch_size = 32
)

# evaluate on validation set since test set no label
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"validation accuracy: {val_accuracy:.4f}")

Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7335 - loss: 0.6146 - val_accuracy: 0.8030 - val_loss: 0.5215
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8309 - loss: 0.4523 - val_accuracy: 0.8155 - val_loss: 0.4546
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 898us/step - accuracy: 0.8624 - loss: 0.3670 - val_accuracy: 0.8109 - val_loss: 0.4407
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 879us/step - accuracy: 0.8849 - loss: 0.3137 - val_accuracy: 0.8017 - val_loss: 0.4440
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8995 - loss: 0.2746 - val_accuracy: 0.7997 - val_loss: 0.4548
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 849us/step - accuracy: 0.9136 - loss: 0.2436 - val_accuracy: 0.7945 - val_loss: 0.4698
Epoch 7/10
[1m191/191

In [36]:
# bag of words but but with bigrams
max_tokens = 5000
text_vectorization = keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="multi_hot",
    ngrams=2  # both unigrams and bigrams 
)

text_vectorization.adapt(train_texts)

# vectorize all splits
X_train = text_vectorization(train_texts)
X_val = text_vectorization(val_texts)
X_test = text_vectorization(test_df['text'].values)
y_train = train_labels
y_val = val_labels

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")

# build model
inputs = keras.layers.Input(shape=(max_tokens,))
x = keras.layers.Dense(8, activation="relu")(inputs)
outputs = keras.layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.summary()

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(
    x=X_train, 
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32
)

# evaluate on validation set
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"\nValidation accuracy: {val_accuracy:.4f}")

X_train shape: (6090, 5000)
X_val shape: (1523, 5000)
X_test shape: (3263, 5000)


Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7128 - loss: 0.6299 - val_accuracy: 0.7932 - val_loss: 0.5395
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 983us/step - accuracy: 0.8307 - loss: 0.4696 - val_accuracy: 0.8116 - val_loss: 0.4546
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 945us/step - accuracy: 0.8603 - loss: 0.3793 - val_accuracy: 0.8083 - val_loss: 0.4357
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 882us/step - accuracy: 0.8824 - loss: 0.3249 - val_accuracy: 0.8030 - val_loss: 0.4373
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 963us/step - accuracy: 0.8997 - loss: 0.2854 - val_accuracy: 0.8017 - val_loss: 0.4452
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 919us/step - accuracy: 0.9120 - loss: 0.2541 - val_accuracy: 0.7971 - val_loss: 0.4574
Epoch 7/10
[1m191