In [2]:
!pip install numpy



In [17]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
Collecting python-slugify
  Downloading python_slugify-5.0.2-py2.py3-none-any.whl (6.7 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73053 sha256=7103b618771a92db388b7c27961077ddd91af5853d910f02c472be81d2a72695
  Stored in directory: c:\users\jaysi\appdata\local\pip\cache\wheels\29\da\11\144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.5.12 python-slugify-5.0.2 text-unidecode-1.3


In [3]:
!pip install tensorflow



In [4]:
!pip install pandas



In [5]:
!pip install matplotlib



In [25]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import requests

In [26]:
df = pd.read_csv("train.csv")

In [27]:
df.shape

(7613, 5)

In [28]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [29]:
print((df.target == 1).sum()) #Disaster
print((df.target == 0).sum()) #No disaster

3271
4342


In [30]:
# Preprocessing
import re
import string

def remove_URL(text):
    url = re.compile(r"https?//\S+|www\.S+")
    return url.sub(r"", text)

def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [31]:
pattern = re.compile(r"https?://(S+|www)\.\S+")
for t in df.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

Dozens Die As two Trains Derail Into A River In Indiahttp://www.informationng.com/?p=309943
Dozens Die As two Trains Derail Into A River In India


In [32]:
df['text'] = df.text.map(remove_URL)
df['text'] = df.text.map(remove_punct)

In [33]:
!pip install nltk



In [34]:
# Remove stopwords
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jaysi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
df['text'] = df.text.map(remove_stopwords)

In [36]:
df.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610    m194 0104 utc5km volcano hawaii httptcozdtoyd8ebj
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [37]:
from collections import Counter

# Count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count
        
counter = counter_word(df.text)

In [38]:
len(counter)

22564

In [39]:
counter

Counter({'deeds': 2,
         'reason': 20,
         'earthquake': 50,
         'may': 88,
         'allah': 9,
         'forgive': 2,
         'us': 164,
         'forest': 65,
         'fire': 250,
         'near': 54,
         'la': 25,
         'ronge': 1,
         'sask': 1,
         'canada': 11,
         'residents': 8,
         'asked': 9,
         'shelter': 6,
         'place': 26,
         'notified': 1,
         'officers': 8,
         'evacuation': 50,
         'orders': 11,
         'expected': 15,
         '13000': 4,
         'people': 196,
         'receive': 2,
         'wildfires': 11,
         'california': 117,
         'got': 112,
         'sent': 13,
         'photo': 41,
         'ruby': 1,
         'alaska': 6,
         'smoke': 48,
         'pours': 1,
         'school': 66,
         'rockyfire': 4,
         'update': 37,
         'hwy': 9,
         '20': 26,
         'closed': 20,
         'directions': 1,
         'due': 31,
         'lake': 14,
         'co

In [40]:
counter.most_common(5)

[('like', 345), ('im', 299), ('amp', 298), ('fire', 250), ('get', 229)]

In [41]:
num_unique_words = len(counter)

In [42]:
num_unique_words

22564

In [48]:
# split the dataset into training and validation set
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
val_df = df[train_size:]

#split text and labels
train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = val_df.text.to_numpy()
val_labels = val_df.target.to_numpy()



In [49]:
train_sentences.shape, val_sentences.shape

((6090,), (1523,))

In [52]:
# Tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

#vectorize a text corpus by turning each text into a sequence of integers
tokenizer = Tokenizer(num_words = num_unique_words)
tokenizer.fit_on_texts(train_sentences) # fit only to training


In [53]:
# each word has unique index
word_index = tokenizer.word_index

In [54]:
word_index

{'like': 1,
 'amp': 2,
 'fire': 3,
 'im': 4,
 'get': 5,
 'via': 6,
 'new': 7,
 'people': 8,
 'news': 9,
 'dont': 10,
 'emergency': 11,
 'one': 12,
 '2': 13,
 'us': 14,
 'video': 15,
 'disaster': 16,
 'burning': 17,
 'body': 18,
 'would': 19,
 'buildings': 20,
 'police': 21,
 'crash': 22,
 'first': 23,
 'california': 24,
 'still': 25,
 'man': 26,
 'got': 27,
 'know': 28,
 'back': 29,
 'day': 30,
 'going': 31,
 'two': 32,
 'time': 33,
 'full': 34,
 'accident': 35,
 'see': 36,
 'world': 37,
 'attack': 38,
 'nuclear': 39,
 'youtube': 40,
 'may': 41,
 'love': 42,
 'go': 43,
 'rt': 44,
 'many': 45,
 'cant': 46,
 '3': 47,
 'watch': 48,
 'collapse': 49,
 'dead': 50,
 'today': 51,
 'car': 52,
 'mass': 53,
 'want': 54,
 'years': 55,
 'work': 56,
 'train': 57,
 'last': 58,
 'good': 59,
 'think': 60,
 'families': 61,
 'hiroshima': 62,
 'life': 63,
 'fires': 64,
 'best': 65,
 'could': 66,
 'say': 67,
 'u': 68,
 'death': 69,
 'hot': 70,
 'forest': 71,
 'way': 72,
 'killed': 73,
 'need': 74,
 'legion

In [55]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

In [56]:
print(train_sentences[10:15])
print(train_sequences[10:15])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking']
[[520, 8, 395, 156, 297, 411], [748, 470, 2251, 138, 2252, 2825, 521, 611, 188, 470, 2251, 189, 189, 5735, 117], [2826, 117, 1886, 5736, 2251, 1284, 1450, 522, 255, 644, 2827], [99, 3759, 612, 1451, 3759], [111, 91, 336, 3760, 3761, 52, 22, 312]]


In [59]:
# Pad the sequenes to have the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Max number of words in a sequence
max_length = 20

train_padded = pad_sequences(train_sequences, maxlen = max_length, padding = 'post', truncating = 'post')
val_padded = pad_sequences(val_sequences, maxlen = max_length, padding = "post", truncating = 'post')

train_padded.shape, val_padded.shape


((6090, 20), (1523, 20))

In [60]:
train_padded[10]

array([520,   8, 395, 156, 297, 411,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

In [62]:
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

three people died heat wave far
[520, 8, 395, 156, 297, 411]
[520   8 395 156 297 411   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


In [63]:
# Check reversing the indices
# flip (key, value)

reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

In [64]:
reverse_word_index

{1: 'like',
 2: 'amp',
 3: 'fire',
 4: 'im',
 5: 'get',
 6: 'via',
 7: 'new',
 8: 'people',
 9: 'news',
 10: 'dont',
 11: 'emergency',
 12: 'one',
 13: '2',
 14: 'us',
 15: 'video',
 16: 'disaster',
 17: 'burning',
 18: 'body',
 19: 'would',
 20: 'buildings',
 21: 'police',
 22: 'crash',
 23: 'first',
 24: 'california',
 25: 'still',
 26: 'man',
 27: 'got',
 28: 'know',
 29: 'back',
 30: 'day',
 31: 'going',
 32: 'two',
 33: 'time',
 34: 'full',
 35: 'accident',
 36: 'see',
 37: 'world',
 38: 'attack',
 39: 'nuclear',
 40: 'youtube',
 41: 'may',
 42: 'love',
 43: 'go',
 44: 'rt',
 45: 'many',
 46: 'cant',
 47: '3',
 48: 'watch',
 49: 'collapse',
 50: 'dead',
 51: 'today',
 52: 'car',
 53: 'mass',
 54: 'want',
 55: 'years',
 56: 'work',
 57: 'train',
 58: 'last',
 59: 'good',
 60: 'think',
 61: 'families',
 62: 'hiroshima',
 63: 'life',
 64: 'fires',
 65: 'best',
 66: 'could',
 67: 'say',
 68: 'u',
 69: 'death',
 70: 'hot',
 71: 'forest',
 72: 'way',
 73: 'killed',
 74: 'need',
 75: 'le

In [66]:
def decode(sequence):
    return " ".join(reverse_word_index.get(idx, "?") for idx in sequence)

In [67]:
# checking if decoding is correct
decoded_text = decode(train_sequences[10])

print(train_sequences[10])
print(decoded_text)

[520, 8, 395, 156, 297, 411]
three people died heat wave far


In [101]:
# Create LSTM model
from tensorflow.keras import layers

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length = max_length))

model.add(layers.LSTM(12, dropout = 0.4))
model.add(layers.Dense(1, activation = 'sigmoid'))

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 20, 32)            722048    
_________________________________________________________________
lstm_8 (LSTM)                (None, 12)                2160      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 13        
Total params: 724,221
Trainable params: 724,221
Non-trainable params: 0
_________________________________________________________________


In [104]:
loss = keras.losses.BinaryCrossentropy(from_logits = False)
optim = keras.optimizers.Adam(lr = 0.1)
metrics = ['accuracy']

model.compile(loss = loss, optimizer = optim, metrics = metrics)

In [105]:
model.fit(train_padded, train_labels, epochs = 5, validation_data = (val_padded, val_labels), verbose = 2)

Epoch 1/5
191/191 - 4s - loss: 0.1800 - accuracy: 0.9353 - val_loss: 0.8658 - val_accuracy: 0.6743
Epoch 2/5
191/191 - 2s - loss: 0.2992 - accuracy: 0.8773 - val_loss: 0.7691 - val_accuracy: 0.6428
Epoch 3/5
191/191 - 2s - loss: 0.3140 - accuracy: 0.8686 - val_loss: 0.9137 - val_accuracy: 0.6546
Epoch 4/5
191/191 - 2s - loss: 0.2893 - accuracy: 0.8798 - val_loss: 0.8236 - val_accuracy: 0.6592
Epoch 5/5
191/191 - 2s - loss: 0.2874 - accuracy: 0.8837 - val_loss: 0.7531 - val_accuracy: 0.6625


<tensorflow.python.keras.callbacks.History at 0x1ed277d2d30>

In [106]:
prediction = model.predict(train_padded)
prediction = [1 if p > 0.5 else 0 for p in prediction]

In [107]:
print(train_sentences[10:20])
print(train_labels[10:20])
print(prediction[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking' 'whats man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[1 1 1 1 1 0 0 0 0 0]
[1, 1, 1, 1, 1, 0, 0, 0, 1, 0]
