In [1]:
# importing the dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim
import gensim.downloader
from nltk.tokenize import word_tokenize
import tensorflow as tf

### Preprocessing the data 

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

train_df = train_df.drop(columns=['label-fine'])
test_df = test_df.drop(columns=['label-fine'])

# Creating developmental set of 500 rows from train set
train_df, dev_df = train_test_split(train_df, test_size=(500/len(train_df)), random_state=42)

counts = train_df['label-coarse'].value_counts()

# Find the labels with the lowest frequencies
lowest_frequency = [counts.index[-1], counts.index[-2]]

# Replace these labels with new label: 6
train_df.loc[(train_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = 6
dev_df.loc[(dev_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = 6
test_df.loc[(test_df['label-coarse'].isin(lowest_frequency)), 'label-coarse'] = 6

counts_new = train_df['label-coarse'].value_counts()

In [3]:
counts, counts_new

(label-coarse
 1    1136
 3    1126
 0    1053
 4     807
 5     754
 2      76
 Name: count, dtype: int64,
 label-coarse
 1    1136
 3    1126
 0    1053
 6     830
 4     807
 Name: count, dtype: int64)

In [4]:
# Takes about ~30 mins for first run, ~30 seconds afterwards
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [5]:
# Tokenize the sentences in all dataframes using nltk word_tokenize
train_df['tokenized'] = train_df['text'].apply(word_tokenize)
dev_df['tokenized'] = dev_df['text'].apply(word_tokenize)
test_df['tokenized'] = test_df['text'].apply(word_tokenize)

# Declaring '<pad>' as the padding token
word2vec['<pad>'] = np.zeros(300)

# Pad the tokenized sentences to make them all the same length = max length of all sentences
max_len = max(train_df['tokenized'].apply(len).max(), dev_df['tokenized'].apply(len).max(), test_df['tokenized'].apply(len).max())
train_df['tokenized'] = train_df['tokenized'].apply(lambda x: x + ['<pad>'] * (max_len - len(x)))
dev_df['tokenized'] = dev_df['tokenized'].apply(lambda x: x + ['<pad>'] * (max_len - len(x)))
test_df['tokenized'] = test_df['tokenized'].apply(lambda x: x + ['<pad>'] * (max_len - len(x)))

In [6]:
def create_sentence_vectors(sentence):
    sentence_vector = []
    for word in sentence:
        # if word is in word2vec, append the word vector to the sentence vector
        if word in word2vec:
            sentence_vector.append(word2vec[word])
        else:
            sentence_vector.append(word2vec['<pad>'])
    return sentence_vector

# Create sentence vectors for all sentences in all dataframes
train_df['sentence_vectors'] = train_df['tokenized'].apply(create_sentence_vectors)
dev_df['sentence_vectors'] = dev_df['tokenized'].apply(create_sentence_vectors)
test_df['sentence_vectors'] = test_df['tokenized'].apply(create_sentence_vectors)

In [7]:
train_X = np.array([np.array(sentence) for sentence in train_df['sentence_vectors']])
train_y = np.array([[num] for num in train_df['label-coarse']])

dev_X = np.array([np.array(sentence) for sentence in dev_df['sentence_vectors']])
dev_y = np.array([[num] for num in dev_df['label-coarse']])

test_X = np.array([np.array(sentence) for sentence in test_df['sentence_vectors']])
test_y = np.array([[num] for num in test_df['label-coarse']])

In [8]:
train_X.shape, train_y.shape, dev_X.shape, dev_y.shape, test_X.shape, test_y.shape

((4952, 37, 300),
 (4952, 1),
 (500, 37, 300),
 (500, 1),
 (500, 37, 300),
 (500, 1))

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(5000, 300, input_length=max_len),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Dense(5, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 37, 300)           1500000   
                                                                 
 lstm_6 (LSTM)               (None, 37, 128)           219648    
                                                                 
 lstm_7 (LSTM)               (None, 37, 128)           131584    
                                                                 
 dense_3 (Dense)             (None, 37, 5)             645       
                                                                 
Total params: 1851877 (7.06 MB)
Trainable params: 1851877 (7.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
batch_size = 1024
epochs = 20

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    for i in range(0, len(train_X), batch_size):
        # Select a batch of data
        X_batch = train_X[i:i + batch_size]
        y_batch = train_y[i:i + batch_size]

        # Train the model on the current batch
        model.train_on_batch(X_batch, y_batch)
    
    # Evaluate the model or perform other tasks at the end of each epoch
    loss, accuracy = model.evaluate(dev_X, dev_y)
    print(f'Dev loss: {loss}, Dev accuracy: {accuracy}')

Epoch 1/20


ValueError: in user code:

    File "/opt/miniconda3/envs/nlpa1/lib/python3.10/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/opt/miniconda3/envs/nlpa1/lib/python3.10/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/miniconda3/envs/nlpa1/lib/python3.10/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/opt/miniconda3/envs/nlpa1/lib/python3.10/site-packages/keras/src/engine/training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "/opt/miniconda3/envs/nlpa1/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/miniconda3/envs/nlpa1/lib/python3.10/site-packages/keras/src/engine/input_spec.py", line 235, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_3' (type Sequential).
    
    Input 0 of layer "lstm_6" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (1024, 37, 300, 300)
    
    Call arguments received by layer 'sequential_3' (type Sequential):
      • inputs=tf.Tensor(shape=(1024, 37, 300), dtype=float32)
      • training=True
      • mask=None
