In [1]:
# importing the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim
import gensim.downloader
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout


### Preprocessing the data 

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

train_df = train_df.drop(columns=['label-fine'])
test_df = test_df.drop(columns=['label-fine'])

# Creating developmental set of 500 rows from train set
train_df, dev_df = train_test_split(train_df, test_size=(500/len(train_df)), random_state=42)

counts = train_df['label-coarse'].value_counts()

# Find the labels with the lowest frequencies
lowest_frequency = [counts.index[-1], counts.index[-2]]
print("Labels being merged to new category: ", lowest_frequency)

# Replace these labels with new label: 6
train_df.loc[(train_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = 6
dev_df.loc[(dev_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = 6
test_df.loc[(test_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = 6

# Convert the labels to one-hot encoding
train_df = pd.get_dummies(train_df, columns=['label-coarse']).replace(True, 1).replace(False, 0)
dev_df = pd.get_dummies(dev_df, columns=['label-coarse']).replace(True, 1).replace(False, 0)
test_df = pd.get_dummies(test_df, columns=['label-coarse']).replace(True, 1).replace(False, 0)

Labels being merged to new category:  [2, 5]


In [3]:
train_df

Unnamed: 0,text,label-coarse_0,label-coarse_1,label-coarse_3,label-coarse_4,label-coarse_6
4943,What is Mikhail Gorbachev 's middle initial ?,0,0,0,0,1
2346,How does the tail affect the flight of a kite ?,1,0,0,0,0
1835,What were the first three cities to have a pop...,0,0,0,0,1
4047,What is the movie Jonathan Livingstone Seagull ?,0,1,0,0,0
5097,What is a fear of home surroundings ?,0,1,0,0,0
...,...,...,...,...,...,...
3772,How much Coca Cola is drunk in one day in the ...,0,0,0,1,0
5191,What cathedral was Thomas Becket murdered in ?,0,0,0,0,1
5226,What character in The Beverly Hillbillies has ...,0,0,1,0,0
5390,What does the River Seine empty into ?,0,0,0,0,1


In [4]:
# Takes about ~30 mins for first run, ~30 seconds afterwards
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [5]:
# Tokenize the sentences in all dataframes using nltk word_tokenize
train_df['tokenized'] = train_df['text'].apply(word_tokenize)
dev_df['tokenized'] = dev_df['text'].apply(word_tokenize)
test_df['tokenized'] = test_df['text'].apply(word_tokenize)

# Declaring '<pad>' as the padding token
word2vec['<pad>'] = np.zeros(300)

# Pad the tokenized sentences to make them all the same length = max length of all sentences
max_len = max(train_df['tokenized'].apply(len).max(), dev_df['tokenized'].apply(len).max(), test_df['tokenized'].apply(len).max())
train_df['tokenized'] = train_df['tokenized'].apply(lambda x: x + ['<pad>'] * (max_len - len(x)))
dev_df['tokenized'] = dev_df['tokenized'].apply(lambda x: x + ['<pad>'] * (max_len - len(x)))
test_df['tokenized'] = test_df['tokenized'].apply(lambda x: x + ['<pad>'] * (max_len - len(x)))

In [6]:
def create_sentence_vectors(sentence):
    sentence_vector = []
    for word in sentence:
        # if word is in word2vec, append the word vector to the sentence vector
        if word in word2vec:
            sentence_vector.append(word2vec[word])
        else:
            sentence_vector.append(word2vec['<pad>'])
    return sentence_vector

# Create sentence vectors for all sentences in all dataframes
train_df['sentence_vectors'] = train_df['tokenized'].apply(create_sentence_vectors)
dev_df['sentence_vectors'] = dev_df['tokenized'].apply(create_sentence_vectors)
test_df['sentence_vectors'] = test_df['tokenized'].apply(create_sentence_vectors)

In [7]:
train_X = np.array([np.array(sentence) for sentence in train_df['sentence_vectors']])
train_y = np.array([np.array(row[1].tolist()) for row in train_df[['label-coarse_0', 'label-coarse_1', 'label-coarse_3', 'label-coarse_4','label-coarse_6']].iterrows()])

dev_X = np.array([np.array(sentence) for sentence in dev_df['sentence_vectors']])
dev_y = np.array([np.array(row[1].tolist()) for row in dev_df[['label-coarse_0', 'label-coarse_1', 'label-coarse_3', 'label-coarse_4','label-coarse_6']].iterrows()])

test_X = np.array([np.array(sentence) for sentence in test_df['sentence_vectors']])
test_y = np.array([np.array(row[1].tolist()) for row in test_df[['label-coarse_0', 'label-coarse_1', 'label-coarse_3', 'label-coarse_4','label-coarse_6']].iterrows()])

In [8]:
train_X.shape, train_y.shape, dev_X.shape, dev_y.shape, test_X.shape, test_y.shape

((4952, 37, 300),
 (4952, 5),
 (500, 37, 300),
 (500, 5),
 (500, 37, 300),
 (500, 5))

In [9]:
model = Sequential([
    LSTM(256, input_shape= (37, 300) ,return_sequences=True),
    LSTM(128, return_sequences=False),
    Dense(5, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 37, 256)           570368    
                                                                 
 lstm_1 (LSTM)               (None, 128)               197120    
                                                                 
 dense (Dense)               (None, 5)                 645       
                                                                 
Total params: 768133 (2.93 MB)
Trainable params: 768133 (2.93 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
batch_size = 128
epochs = 20

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    for i in range(0, len(train_X), batch_size):
        # Select a batch of data
        X_batch = train_X[i:i + batch_size]
        y_batch = train_y[i:i + batch_size]

        # Train the model on the current batch
        model.train_on_batch(X_batch, y_batch)
    
    # Evaluate the model or perform other tasks at the end of each epoch
    loss, accuracy = model.evaluate(dev_X, dev_y)
    print(f'Dev loss: {loss}, Dev accuracy: {accuracy}')

Epoch 1/20
Dev loss: 0.9344730377197266, Dev accuracy: 0.6359999775886536
Epoch 2/20
Dev loss: 0.8438646197319031, Dev accuracy: 0.6620000004768372
Epoch 3/20
Dev loss: 0.7122973203659058, Dev accuracy: 0.7480000257492065
Epoch 4/20
Dev loss: 0.7720882892608643, Dev accuracy: 0.7620000243186951
Epoch 5/20
Dev loss: 0.6451711058616638, Dev accuracy: 0.7680000066757202
Epoch 6/20
Dev loss: 0.6587889194488525, Dev accuracy: 0.8119999766349792
Epoch 7/20
Dev loss: 0.5814089179039001, Dev accuracy: 0.8299999833106995
Epoch 8/20
Dev loss: 0.5878423452377319, Dev accuracy: 0.8299999833106995
Epoch 9/20
Dev loss: 0.5998224020004272, Dev accuracy: 0.7960000038146973
Epoch 10/20
Dev loss: 0.6923705339431763, Dev accuracy: 0.7839999794960022
Epoch 11/20
Dev loss: 0.5092801451683044, Dev accuracy: 0.8519999980926514
Epoch 12/20
Dev loss: 0.48796334862709045, Dev accuracy: 0.8619999885559082
Epoch 13/20
Dev loss: 0.5100526213645935, Dev accuracy: 0.8519999980926514
Epoch 14/20
Dev loss: 0.519173502

In [11]:
# Evaluate the model on the test data using Accuracy
loss, accuracy = model.evaluate(test_X, test_y)



In [12]:
loss, accuracy

(0.5390982031822205, 0.878000020980835)