# 1- Imports

In [1]:
import numpy as np 
import pandas as pd 
import re
import string
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv('test.csv')

In [6]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train.shape

(7613, 5)

In [8]:
print((train.target == 1).sum()) # Disaster
print((train.target == 0).sum()) # No Disaster

3271
4342


# 2- Preprocessing

In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

In [11]:
#regex pattern to remove links
pattern = re.compile(r"https?://(\S+|www)\.\S+")
#for train
for t in train.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print('After Transformation:')
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
After Transformation:
@bbcmtd Wholesale Markets ablaze 


In [12]:
#for test:
for t in test.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print('After Transformation:')
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
After Transformation:
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market 


In [13]:
#preprocess data frames:
#train
train["text"] = train.text.map(remove_URL) 
train["text"] = train.text.map(remove_punct)
#test
test["text"] = test.text.map(remove_URL) 
test["text"] = test.text.map(remove_punct)

In [14]:
# remove stopwords
nltk.download('stopwords')

stop = set(stopwords.words("english"))

def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [15]:
stop

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
#train
train["text"] = train.text.map(remove_stopwords)
#test
test["text"] = test.text.map(remove_stopwords)

In [17]:
#Check
train.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [18]:
# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count


counter = counter_word(train.text)

In [19]:
len(counter)

17971

In [20]:
# counter

In [21]:
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

In [22]:
num_unique_words = len(counter)
num_unique_words

17971

In [23]:
# Split dataset into training and validation set
X = train.text
y = train.target
train_sentences, val_sentences , train_labels, val_labels = train_test_split(X, y, test_size=0.2)

In [24]:
#train/val
train_sentences = train_sentences.to_numpy()
train_labels = train_labels.to_numpy()
val_sentences = val_sentences.to_numpy()
val_labels = val_labels.to_numpy()

In [25]:
#test
test_sentences = test.text.to_numpy()

In [26]:
train_sentences.shape, val_sentences.shape

((6090,), (1523,))

In [27]:
# Tokenize
# vectorize a text corpus by turning each text into a sequence of integers

tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training

In [28]:
# Now each word has unique index
word_index = tokenizer.word_index
word_index

{'like': 1,
 'amp': 2,
 'im': 3,
 'get': 4,
 'fire': 5,
 'via': 6,
 'new': 7,
 'people': 8,
 'one': 9,
 'news': 10,
 'dont': 11,
 'video': 12,
 '2': 13,
 'us': 14,
 'emergency': 15,
 'disaster': 16,
 'would': 17,
 'still': 18,
 'suicide': 19,
 'police': 20,
 'back': 21,
 'storm': 22,
 'man': 23,
 'body': 24,
 'crash': 25,
 'know': 26,
 'first': 27,
 'day': 28,
 'bomb': 29,
 'got': 30,
 'california': 31,
 'burning': 32,
 'buildings': 33,
 'world': 34,
 'time': 35,
 'rt': 36,
 'youtube': 37,
 'see': 38,
 'fires': 39,
 'dead': 40,
 'going': 41,
 'nuclear': 42,
 'attack': 43,
 '3': 44,
 'love': 45,
 'two': 46,
 'cant': 47,
 'train': 48,
 'think': 49,
 'families': 50,
 'life': 51,
 'killed': 52,
 'go': 53,
 'hiroshima': 54,
 'u': 55,
 'many': 56,
 'good': 57,
 'could': 58,
 'war': 59,
 'may': 60,
 'today': 61,
 'watch': 62,
 'way': 63,
 'car': 64,
 'years': 65,
 'full': 66,
 'accident': 67,
 'say': 68,
 'want': 69,
 'last': 70,
 'work': 71,
 'make': 72,
 'collapse': 73,
 'year': 74,
 'mass'

In [29]:
#apply on train, validation, and test sentences

train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [30]:
#Check
print(train_sentences[10:15])
print(train_sequences[10:15])

['agreed especially automatic weapons theres legitimate reason needing one argus99 heidia1438'
 'think akwa ibom don\x89ûªt come uruan demolish buildings exassembly member warns udom emmanuel'
 'eversafe emergency auto kit weather unsafe hazardous rushhour gridlock jumperca\x89û'
 'goooooooaaaaaal'
 'oil gas exploration takes seismic shift gabon somalia bloomberg somalia']
[[1918, 1662, 1663, 208, 185, 5639, 647, 5640, 9, 5641, 5642], [49, 5643, 5644, 769, 147, 5645, 317, 33, 5646, 2805, 1919, 5647, 5648], [5649, 15, 3707, 1193, 139, 2806, 354, 5650, 3708, 5651], [5652], [119, 833, 1474, 834, 519, 951, 2261, 1664, 3709, 1664]]


In [31]:
# Pad the sequences to have the same length
max_length = 15 #arbitrary number

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post") #post-> 0
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

In [32]:
#Check
train_padded.shape, val_padded.shape

((6090, 15), (1523, 15))

In [33]:
#Check
train_padded[10]

array([1918, 1662, 1663,  208,  185, 5639,  647, 5640,    9, 5641, 5642,
          0,    0,    0,    0])

In [34]:
#Check
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

agreed especially automatic weapons theres legitimate reason needing one argus99 heidia1438
[1918, 1662, 1663, 208, 185, 5639, 647, 5640, 9, 5641, 5642]
[1918 1662 1663  208  185 5639  647 5640    9 5641 5642    0    0    0
    0]


In [35]:
# flip (key, value)
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [36]:
#Check
reverse_word_index

{1: 'like',
 2: 'amp',
 3: 'im',
 4: 'get',
 5: 'fire',
 6: 'via',
 7: 'new',
 8: 'people',
 9: 'one',
 10: 'news',
 11: 'dont',
 12: 'video',
 13: '2',
 14: 'us',
 15: 'emergency',
 16: 'disaster',
 17: 'would',
 18: 'still',
 19: 'suicide',
 20: 'police',
 21: 'back',
 22: 'storm',
 23: 'man',
 24: 'body',
 25: 'crash',
 26: 'know',
 27: 'first',
 28: 'day',
 29: 'bomb',
 30: 'got',
 31: 'california',
 32: 'burning',
 33: 'buildings',
 34: 'world',
 35: 'time',
 36: 'rt',
 37: 'youtube',
 38: 'see',
 39: 'fires',
 40: 'dead',
 41: 'going',
 42: 'nuclear',
 43: 'attack',
 44: '3',
 45: 'love',
 46: 'two',
 47: 'cant',
 48: 'train',
 49: 'think',
 50: 'families',
 51: 'life',
 52: 'killed',
 53: 'go',
 54: 'hiroshima',
 55: 'u',
 56: 'many',
 57: 'good',
 58: 'could',
 59: 'war',
 60: 'may',
 61: 'today',
 62: 'watch',
 63: 'way',
 64: 'car',
 65: 'years',
 66: 'full',
 67: 'accident',
 68: 'say',
 69: 'want',
 70: 'last',
 71: 'work',
 72: 'make',
 73: 'collapse',
 74: 'year',
 75: 'm

In [37]:
#decoding
def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

In [38]:
decoded_text = decode(train_sequences[10])
#Check
print(train_sequences[10])
print(decoded_text)

[1918, 1662, 1663, 208, 185, 5639, 647, 5640, 9, 5641, 5642]
agreed especially automatic weapons theres legitimate reason needing one argus99 heidia1438


# 3- Modeling

In [39]:
# Create LSTM model

# Embedding: Turns positive integers (indexes) into dense vectors of fixed size.

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 100, input_length=max_length))

model.add(layers.LSTM(32, dropout=0.25))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 100)           1797100   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                17024     
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 1,814,157
Trainable params: 1,814,157
Non-trainable params: 0
_________________________________________________________________


In [40]:
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [41]:
model.fit(train_padded, train_labels, epochs=25, validation_data=(val_padded, val_labels), verbose=2)

Epoch 1/25
191/191 - 8s - loss: 0.5397 - accuracy: 0.7189 - val_loss: 0.4470 - val_accuracy: 0.8043
Epoch 2/25
191/191 - 5s - loss: 0.2877 - accuracy: 0.8895 - val_loss: 0.5125 - val_accuracy: 0.7840
Epoch 3/25
191/191 - 5s - loss: 0.1534 - accuracy: 0.9501 - val_loss: 0.5904 - val_accuracy: 0.7761
Epoch 4/25
191/191 - 5s - loss: 0.0998 - accuracy: 0.9663 - val_loss: 0.6103 - val_accuracy: 0.7768
Epoch 5/25
191/191 - 5s - loss: 0.0689 - accuracy: 0.9765 - val_loss: 0.6808 - val_accuracy: 0.7807
Epoch 6/25
191/191 - 6s - loss: 0.0538 - accuracy: 0.9808 - val_loss: 0.7980 - val_accuracy: 0.7623
Epoch 7/25
191/191 - 5s - loss: 0.0409 - accuracy: 0.9836 - val_loss: 0.9175 - val_accuracy: 0.7728
Epoch 8/25
191/191 - 5s - loss: 0.0376 - accuracy: 0.9834 - val_loss: 1.1326 - val_accuracy: 0.7439
Epoch 9/25
191/191 - 5s - loss: 0.0418 - accuracy: 0.9831 - val_loss: 1.0029 - val_accuracy: 0.7603
Epoch 10/25
191/191 - 5s - loss: 0.0339 - accuracy: 0.9859 - val_loss: 1.3126 - val_accuracy: 0.7669

<tensorflow.python.keras.callbacks.History at 0x1c5b3c2e220>

In [42]:
predictions = model.predict(test_padded)
predictions = [1 if p > 0.5 else 0 for p in predictions]

In [45]:
submission = pd.DataFrame({'id':test[['id','text']].values.tolist(),'target':predictions})

In [51]:
submission.sample(10)

Unnamed: 0,id,target
1007,"[3316, got wisdom teeth yesterday demolished w...",0
1814,"[6136, apc needs watch pdp defecting apc patie...",1
681,"[2208, something catastrophic coming tune via ...",1
2615,"[8731, heart sinking like sunset]",1
2094,"[7028, tonight going mayhem 4playthursdays eve...",0
266,"[862, fedex longer transport bioterror germs v...",0
2544,"[8485, cokeboys yo best screamed watched snap ...",1
107,"[362, new souls punished annihilation]",0
75,"[250, twelve feared killed pakistani air ambul...",1
2259,"[7511, california spring oil spill estimate gr...",1
