In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_sub = pd.read_csv('sample_submission.csv')

In [34]:
X_train = df_train['text']
y_train = df_train['target']
X_test = df_test['text']

In [35]:
from keras import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.layers import Dense, LSTM, Embedding

In [36]:
token = Tokenizer()

In [37]:
token.fit_on_texts(X_train)

In [38]:
token.word_index

{'t': 1,
 'co': 2,
 'http': 3,
 'the': 4,
 'a': 5,
 'in': 6,
 'to': 7,
 'of': 8,
 'and': 9,
 'i': 10,
 'is': 11,
 'for': 12,
 'on': 13,
 'you': 14,
 'my': 15,
 'with': 16,
 'that': 17,
 'it': 18,
 'at': 19,
 'by': 20,
 'this': 21,
 'from': 22,
 'https': 23,
 'are': 24,
 'be': 25,
 'was': 26,
 'have': 27,
 'like': 28,
 'amp': 29,
 'as': 30,
 'up': 31,
 'me': 32,
 'but': 33,
 'just': 34,
 'so': 35,
 'not': 36,
 'your': 37,
 'out': 38,
 'no': 39,
 'all': 40,
 'after': 41,
 'will': 42,
 'an': 43,
 'has': 44,
 'fire': 45,
 "i'm": 46,
 'when': 47,
 'if': 48,
 'we': 49,
 'get': 50,
 'now': 51,
 'new': 52,
 'via': 53,
 'more': 54,
 '2': 55,
 'about': 56,
 'or': 57,
 'news': 58,
 'people': 59,
 'what': 60,
 'they': 61,
 'one': 62,
 'he': 63,
 'how': 64,
 'been': 65,
 'over': 66,
 'who': 67,
 "it's": 68,
 'into': 69,
 "don't": 70,
 'do': 71,
 'video': 72,
 "'": 73,
 'can': 74,
 'emergency': 75,
 'disaster': 76,
 'there': 77,
 'police': 78,
 'than': 79,
 '3': 80,
 'her': 81,
 'u': 82,
 'would': 8

In [39]:
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [40]:
max_len = max([len(i) for i in X_train])

In [41]:
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')

In [42]:
X_train.shape

(7613, 33)

In [43]:
X_test.shape

(3263, 33)

In [44]:
y_train.shape

(7613,)

In [45]:
y_train

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [46]:
from keras.utils import to_categorical

In [47]:
y_train = to_categorical(y_train, num_classes=2)

In [48]:
y_train.shape

(7613, 2)

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [51]:
from keras.optimizers import Adam

In [52]:
word_count = len(token.word_index) + 1

In [53]:
y_train.shape

(6471, 2)

In [54]:
y_val.shape

(1142, 2)

In [55]:
model = Sequential()

model.add(Embedding(input_dim=word_count, output_dim=20, input_length=max_len))
model.add(LSTM(input_dim=20,units=20, activation='relu'))

model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

  super().__init__(**kwargs)


In [56]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [57]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20)

Epoch 1/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.5684 - loss: 0.6897 - val_accuracy: 0.5701 - val_loss: 0.6870
Epoch 2/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.5704 - loss: 0.6856 - val_accuracy: 0.5701 - val_loss: 0.6846
Epoch 3/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.5704 - loss: 0.6841 - val_accuracy: 0.5701 - val_loss: 0.6837
Epoch 4/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.5712 - loss: 0.7497 - val_accuracy: 0.5701 - val_loss: 0.6833
Epoch 5/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.5704 - loss: 0.6830 - val_accuracy: 0.5701 - val_loss: 0.6830
Epoch 6/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.5705 - loss: 0.6824 - val_accuracy: 0.5701 - val_loss: 0.6826
Epoch 7/20
[1m203/203

<keras.src.callbacks.history.History at 0x16a8be9b0d0>

In [58]:
y_train

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]], shape=(6471, 2))

In [62]:
X_test.shape

(3263, 33)

In [63]:
y_train.shape

(6471, 2)

In [64]:
y_val.shape

(1142, 2)

In [65]:
X_val.shape

(1142, 33)

In [66]:
y_pred = model.predict(X_test)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


In [67]:
y_pred

array([[9.78060067e-01, 2.19399650e-02],
       [9.22700747e-06, 9.99990821e-01],
       [1.03846794e-07, 9.99999881e-01],
       ...,
       [1.32626765e-13, 1.00000000e+00],
       [9.78060067e-01, 2.19399650e-02],
       [1.37969224e-07, 9.99999881e-01]], shape=(3263, 2), dtype=float32)

In [68]:
y_pred = [np.argmax(i) for i in y_pred]

In [74]:
df_sub.to_csv('sub.csv', index=False, index_label=False)

In [73]:
len(y_pred)

3263