In [1]:
# importing the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim
import gensim.downloader
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, SimpleRNN, Flatten
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.initializers import Constant

### Preprocessing the data 

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

train_df = train_df.drop(columns=['label-fine'])
test_df = test_df.drop(columns=['label-fine'])

# Creating developmental set of 500 rows from train set
train_df, dev_df = train_test_split(train_df, test_size=(500/len(train_df)), random_state=42)

counts = train_df['label-coarse'].value_counts()

# Find the labels with the lowest frequencies
lowest_frequency = [counts.index[-1], counts.index[-2]]
print("Labels being merged to new category: ", lowest_frequency)
print("Merged into new category: ", lowest_frequency[0])

# Replace these labels with new label: 6
train_df.loc[(train_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = lowest_frequency[0]
dev_df.loc[(dev_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = lowest_frequency[0]
test_df.loc[(test_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = lowest_frequency[0]

train_df.reset_index(drop=True, inplace=True)
dev_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

Labels being merged to new category:  [2, 5]
Merged into new category:  2


In [3]:
train_df

Unnamed: 0,label-coarse,text
0,2,What is Mikhail Gorbachev 's middle initial ?
1,0,How does the tail affect the flight of a kite ?
2,2,What were the first three cities to have a pop...
3,1,What is the movie Jonathan Livingstone Seagull ?
4,1,What is a fear of home surroundings ?
...,...,...
4947,4,How much Coca Cola is drunk in one day in the ...
4948,2,What cathedral was Thomas Becket murdered in ?
4949,3,What character in The Beverly Hillbillies has ...
4950,2,What does the River Seine empty into ?


In [4]:
# Takes about ~30 mins for first run, ~30 seconds afterwards
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [5]:
train_df['text'][0]

"What is Mikhail Gorbachev 's middle initial ?"

In [6]:
# Tokenize the text using word_tokenize
train_df['text'] = train_df['text'].apply(word_tokenize)
dev_df['text'] = dev_df['text'].apply(word_tokenize)
test_df['text'] = test_df['text'].apply(word_tokenize)

In [7]:
train_df['text'][0]

['What', 'is', 'Mikhail', 'Gorbachev', "'s", 'middle', 'initial', '?']

In [8]:
# Count total number of unique tokens in the training data 
unique_tokens = set()
train_df['text'].apply(unique_tokens.update)
# Remove words from the set which are not in the word2vec model
unique_tokens = unique_tokens.intersection(set(word2vec.key_to_index.keys()))
len(unique_tokens)

8170

In [9]:
tok = Tokenizer()
tok.fit_on_texts(train_df['text'])
train_df['text'] = tok.texts_to_sequences(train_df['text'])
dev_df['text'] = tok.texts_to_sequences(dev_df['text'])
test_df['text'] = tok.texts_to_sequences(test_df['text'])

In [10]:
train_df['text'][0]

[3, 4, 1435, 1091, 10, 461, 3214, 1]

In [11]:
X_train_sequences = train_df['text']
X_dev_sequences = dev_df['text']
X_test_sequences = test_df['text']

max_len = max([len(sentence) for sentence in train_df['text']])

# Pad sequences to a fixed length (if needed)
X_train_padded = pad_sequences(X_train_sequences, maxlen= max_len, padding='post', truncating='post')
X_dev_padded = pad_sequences(X_dev_sequences, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post', truncating='post')

# Get the y labels
y_train = train_df['label-coarse']
y_dev = dev_df['label-coarse']
y_test = test_df['label-coarse']

# Convert the labels to categorical
y_train = to_categorical(y_train)
y_dev = to_categorical(y_dev)
y_test = to_categorical(y_test)

In [12]:
X_train_padded[0], y_train[0]

(array([   3,    4, 1435, 1091,   10,  461, 3214,    1,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0], dtype=int32),
 array([0., 0., 1., 0., 0.], dtype=float32))

In [13]:
embedding_dim = len(word2vec['apple']) # Any word vector dimensionality

In [14]:
# Create a dictionary with the unqiue_tokens with the tokens as keys and interger values as values
word_index = {}
for i, token in enumerate(unique_tokens):
    word_index[token] = i

In [15]:
vocab_size = len(word_index) + 1

In [16]:
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))

for word, i in word_index.items():
    try:
        embedding_vector = word2vec[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        pass

In [17]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, embeddings_initializer=Constant(embedding_matrix), trainable=False),
    LSTM(512, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(5, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 37, 300)           2451300   
                                                                 
 lstm (LSTM)                 (None, 37, 512)           1665024   
                                                                 
 dropout (Dropout)           (None, 37, 512)           0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               328192    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 5)                 645       
                                                                 
Total params: 4445161 (16.96 MB)
Trainable params: 19938

In [18]:
num_epochs = 20
batch_size = 64

In [19]:
model.fit(X_train_padded, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_dev_padded, y_dev))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x157f510c0>

In [24]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test_padded, y_test)

# Print the test accuracy
print(f"Test accuracy: {accuracy}")

Test accuracy: 0.8059999942779541


In [25]:
# Predict on the first 5 test samples
predictions = model.predict(X_test_padded[:5])



In [26]:
predictions

array([[1.0612907e-03, 5.8901549e-04, 1.4004789e-04, 4.7731010e-04,
        9.9773228e-01],
       [5.5304214e-02, 7.6328820e-01, 8.1286289e-02, 6.9917008e-02,
        3.0204363e-02],
       [1.0437068e-03, 1.4343505e-02, 2.0909689e-03, 9.8083234e-01,
        1.6894542e-03],
       [9.7675872e-01, 5.9067165e-03, 1.5905067e-02, 7.3162297e-04,
        6.9796975e-04],
       [1.1127209e-03, 7.0711452e-04, 1.5907611e-04, 5.7052320e-04,
        9.9745053e-01]], dtype=float32)

In [27]:
y_test[:5]

array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)