# **LSTM**

##### Imports

In [6]:
import os
import re

import numpy as np
import pandas as pd
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


##### Data preprocessing

In [7]:
# Location of the tweets CSV. The default is the path this notebook was
# originally run with; set the TWITTER_DATA_CSV environment variable to point
# at your own copy instead of editing the notebook.
DATA_PATH = os.environ.get("TWITTER_DATA_CSV", r"E:\vs code\Datasets\Twitter_Data.csv")

# Load the raw dataset and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [13]:
# Per-column count of missing values in the raw frame.
data.isna().sum()

clean_text    4
category      7
dtype: int64

In [14]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")

In [15]:
# Sanity check: no column should report missing values after the drop.
data.isna().sum()

clean_text    0
category      0
dtype: int64

In [17]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
data.duplicated(keep="first").sum()

0

In [18]:
# Class distribution of the target column.
data.loc[:, "category"].value_counts()

category
 1.0    72249
 0.0    55211
-1.0    35509
Name: count, dtype: int64

In [22]:
# Characters to strip from the tweets: anything that is not an ASCII letter,
# a digit or whitespace.
#  - The upper-case range must be A-Z: the range A-z also spans the
#    punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so
#    those characters would survive the cleanup.
#  - Raw string, so that \s reaches the regex engine instead of being treated
#    as an (invalid) Python string escape.
NON_ALNUM_PATTERN = r'[^a-zA-Z\s0-9]'

# Lower-case first, then strip. `assign` builds a new frame rather than
# writing into the result of the earlier dropna() in place (which may trigger
# pandas' SettingWithCopyWarning on some versions).
data = data.assign(
    clean_text=(
        data['clean_text']
        .str.lower()
        .str.replace(NON_ALNUM_PATTERN, '', regex=True)
    )
)

In [23]:
data.head()

Unnamed: 0,clean_text,category
0,when modi promised minimum government maximum ...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


##### Modeling

In [24]:
# Vocabulary size handed to the tokenizer (num_words) and, later, to the
# embedding layer.
max_features = 2500

# Build the word index from the cleaned tweets, splitting on single spaces.
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. before the
# train/test split further down — confirm this is intended.
corpus = data['clean_text'].values
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(corpus)

# Encode every tweet as a list of word indices, then pad them into one 2-D
# array (no maxlen is given, so the width is derived from the data).
x = pad_sequences(tokenizer.texts_to_sequences(corpus))

In [25]:
# Layer sizes.
embed_dim = 128   # width of each word embedding vector
lstm_out = 196    # number of LSTM units

# Embedding -> spatial dropout -> LSTM -> 3-way softmax classifier.
# The input length is taken from the padded sequence matrix `x`.
# No random seed is set in this notebook, so weights differ between runs.
model = Sequential([
    Embedding(max_features, embed_dim, input_length=x.shape[1]),
    SpatialDropout1D(0.7),
    LSTM(lstm_out, dropout=0.4),
    Dense(3, activation='softmax'),
])

# One-hot targets, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [26]:
# One-hot encode the target, one column per distinct category value.
# The dtype is pinned to float32: without it the result dtype depends on the
# installed pandas version (bool on 2.x, uint8 on 1.x), whereas the loss
# expects numeric targets.
# NOTE(review): column order presumably follows the sorted category values
# (-1.0, 0.0, 1.0) — check pd.get_dummies(data['category']).columns before
# mapping predictions back to labels.
y = pd.get_dummies(data['category'], dtype='float32').values

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=13
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(114078, 48) (114078, 3)
(48891, 48) (48891, 3)


In [28]:
# Train for 8 epochs. The returned History object is kept so the per-epoch
# loss/accuracy can be inspected or plotted later (history.history), and so
# its repr is not echoed as the cell output.
# Note: validation_data is the same X_test/y_test split created above, so the
# val_* metrics are not an estimate on independently held-out data.
history = model.fit(
    X_train,
    y_train,
    epochs=8,
    batch_size=64,
    verbose=2,  # one summary line per epoch
    validation_data=(X_test, y_test),
)

Epoch 1/8
1783/1783 - 225s - loss: 0.3123 - accuracy: 0.9075 - val_loss: 0.2790 - val_accuracy: 0.9210 - 225s/epoch - 126ms/step
Epoch 2/8
1783/1783 - 237s - loss: 0.2881 - accuracy: 0.9148 - val_loss: 0.2595 - val_accuracy: 0.9273 - 237s/epoch - 133ms/step
Epoch 3/8
1783/1783 - 245s - loss: 0.2772 - accuracy: 0.9191 - val_loss: 0.2552 - val_accuracy: 0.9298 - 245s/epoch - 137ms/step
Epoch 4/8
1783/1783 - 244s - loss: 0.2701 - accuracy: 0.9206 - val_loss: 0.2538 - val_accuracy: 0.9302 - 244s/epoch - 137ms/step
Epoch 5/8
1783/1783 - 246s - loss: 0.2640 - accuracy: 0.9224 - val_loss: 0.2522 - val_accuracy: 0.9308 - 246s/epoch - 138ms/step
Epoch 6/8
1783/1783 - 248s - loss: 0.2585 - accuracy: 0.9236 - val_loss: 0.2529 - val_accuracy: 0.9303 - 248s/epoch - 139ms/step
Epoch 7/8
1783/1783 - 250s - loss: 0.2560 - accuracy: 0.9238 - val_loss: 0.2509 - val_accuracy: 0.9316 - 250s/epoch - 140ms/step
Epoch 8/8
1783/1783 - 246s - loss: 0.2531 - accuracy: 0.9243 - val_loss: 0.2530 - val_accuracy: 0

<keras.src.callbacks.History at 0x276b701a100>