In [1]:
# Import necessary libraries (pandas, numpy, re, tensorflow, and keras)

import pandas as pd
import numpy as np
import re
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from keras.utils import to_categorical

In [2]:
# Load the train and test CSV files with pandas, then stack both frames
# row-wise into a single combined frame `df`.

train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv"))
df = pd.concat(objs=[train_df, test_df])

In [3]:
# Remove the 'id' column from the combined frame and renumber its rows from 0
df = df.drop(columns=['id']).reset_index(drop=True)

In [4]:
# Remove the 'id' column from the train and test frames and renumber their rows from 0

train_df = train_df.drop(columns=['id']).reset_index(drop=True)
test_df = test_df.drop(columns=['id']).reset_index(drop=True)

In [5]:
# Remove duplicate (article, highlights) rows from the train, test and
# concatenated frames, reporting how many duplicates each frame contained.
# The counts are printed explicitly: a notebook cell only displays its last
# bare expression, so un-printed `.duplicated(...).sum()` results are lost.

dedup_keys = ['article', 'highlights']

print('Duplicate rows in train_df:', train_df.duplicated(subset=dedup_keys).sum())
train_df = train_df.drop_duplicates(subset=dedup_keys)

print('Duplicate rows in test_df:', test_df.duplicated(subset=dedup_keys).sum())
test_df = test_df.drop_duplicates(subset=dedup_keys)

print('Duplicate rows in df:', df.duplicated(subset=dedup_keys).sum())
df = df.drop_duplicates(subset=dedup_keys)

In [6]:
# Split the combined frame into arrays: X holds every column except the last,
# y holds the column at position 1 reshaped into a single-column 2-D array.
# NOTE(review): X and y are not referenced by the later cells visible here.
X = df.iloc[:, :-1].values
y = df.iloc[:, 1].values.reshape(-1, 1)
print(df.shape)

(128, 2)



# Inspect missing values per column and the structure of the train / combined frames.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print(df.isna().sum())
train_df.info()
df.info()


In [8]:
# Turn the article text into integer token sequences with a Keras Tokenizer
# whose `num_words` is set to `vocab_size`.
# NOTE(review): the tokenizer is fit on `df`, which was built by concatenating
# the train and test frames, so test articles contribute to the vocabulary.

vocab_size = 5000
article_texts = df['article']

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(article_texts)
sequences = tokenizer.texts_to_sequences(article_texts)

In [9]:
# Pad every token sequence out to the length of the longest one

max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [10]:
# Prepare the target variable and a one-hot encoded version of it.
# Every distinct 'highlights' value is treated as its own class label.

target = df['highlights']
# dict.fromkeys de-duplicates while keeping first-appearance order, so the
# class -> index mapping is identical on every run. list(set(...)) ordering of
# strings changes between interpreter sessions because of hash randomization,
# which made the label indices (and anything saved from them) irreproducible.
target_classes = list(dict.fromkeys(target))
target_classes_num = {label: index for index, label in enumerate(target_classes)}
target_num = [target_classes_num[t] for t in target]
# Pin the one-hot width to the number of classes rather than letting it be
# inferred from the largest index present.
target_one_hot = to_categorical(target_num, num_classes=len(target_classes))

In [11]:
# Assemble the network as one layer list:
# Embedding -> LSTM (returning sequences) -> LSTM -> Dense(relu) -> Dense(softmax),
# with one softmax unit per entry in `target_classes`.

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length),
    LSTM(units=100, return_sequences=True),
    LSTM(units=100),
    Dense(units=100, activation='relu'),
    Dense(units=len(target_classes), activation='softmax'),
])

# Compile with the 'adam' optimizer, 'categorical_crossentropy' loss and 'accuracy' metric

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Fit the model on the padded article sequences against the one-hot targets
history = model.fit(
    x=padded_sequences,
    y=target_one_hot,
    batch_size=32,
    epochs=25,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [13]:
# Pull the recorded training loss and accuracy out of the fit history
training_log = history.history
loss, acc = training_log['loss'], training_log['accuracy']

# Evaluate the model on the test dataset.
# NOTE(review): the model was fit on `df`, which was built by concatenating the
# train and test frames, so these test rows were also present during training;
# the reported accuracy is not a held-out estimate.
test_sequences = tokenizer.texts_to_sequences(test_df['article'])
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length)
test_target = test_df['highlights']
test_target_num = [target_classes_num[t] for t in test_target]
# Fix the one-hot width to the full class count. Without num_classes the width
# is inferred from the largest label index present in the test split, which can
# be smaller than the model's softmax output and make evaluate() fail on shape.
test_target_one_hot = to_categorical(test_target_num, num_classes=len(target_classes))

test_loss, test_acc = model.evaluate(test_padded_sequences, test_target_one_hot)
print('Accuracy on test dataset:', test_acc)

Accuracy on test dataset: 0.48275861144065857
