In [6]:
##  Classify disaster tweets
# Source : https://www.kaggle.com/competitions/nlp-getting-started/data

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

plt.style.use(style='seaborn')
%matplotlib inline

In [7]:
## Load the data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

In [8]:
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,4,5,6,7
keyword,,,,,
location,,,,,
text,Our Deeds are the Reason of this #earthquake M...,Forest fire near La Ronge Sask. Canada,All residents asked to 'shelter in place' are ...,"13,000 people receive #wildfires evacuation or...",Just got sent this photo from Ruby #Alaska as ...
target,1,1,1,1,1


In [9]:
test.head().T

Unnamed: 0,0,1,2,3,4
id,0,2,3,9,11
keyword,,,,,
location,,,,,
text,Just happened a terrible car crash,"Heard about #earthquake is different cities, s...","there is a forest fire at spot pond, geese are...",Apocalypse lighting. #Spokane #wildfires,Typhoon Soudelor kills 28 in China and Taiwan


In [11]:
## Remove URLs and HTML
import re

def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

In [12]:
def remove_emoji(string):
    emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
    )
    return emoji_pattern.sub(r"", string)

In [13]:
# Remove puntuation
import string

def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

In [15]:
train["text"] = train.text.map(lambda x: remove_URL(x))
train["text"] = train.text.map(lambda x: remove_html(x))
train["text"] = train.text.map(lambda x: remove_emoji(x))
train["text"] = train.text.map(lambda x: remove_punct(x))

In [22]:
# import nltk
# nltk.download('stopwords')

In [24]:
# Remove stopwords
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    
    return " ".join(text)

In [25]:
train["text"] = train["text"].map(remove_stopwords)

In [26]:
train.text # cleaning is done!

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [31]:
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eraym\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [32]:
## Embeddings
# Using GloVe

from nltk.tokenize import word_tokenize
import nltk

def create_corpus_tk(df):
    corpus = []
    for text in train["text"]:
        words = [word.lower() for word in word_tokenize(text)]
        corpus.append(words)
    return corpus

In [33]:
corpus = create_corpus_tk(train)

In [34]:
num_words = len(corpus)
print(num_words)

7613


In [35]:
corpus[0]

['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']

In [36]:
# Train/test split
train_size = int(train.shape[0] * 0.8)

train_sentences = train.text[:train_size]
train_labels = train.target[:train_size]

test_sentences = train.text[:train_size]
test_labels = train.target[:train_size]

In [44]:
# !pip install keras
# !pip install tensorflow

In [43]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

max_len = 50

In [45]:
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_sentences)

In [53]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [54]:
train_padded = pad_sequences(
    train_sequences, maxlen=max_len, truncating="post", padding="post")

In [55]:
train_padded

array([[3739,  696,  235, ...,    0,    0,    0],
       [  71,    3,  129, ...,    0,    0,    0],
       [1448, 1186, 1882, ...,    0,    0,    0],
       ...,
       [ 151,    1, 1256, ...,    0,    0,    0],
       [1256,  448,   15, ...,    0,    0,    0],
       [ 151,  204,  539, ...,    0,    0,    0]])

In [56]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded   = pad_sequences(
    test_sequences, maxlen=max_len, truncating="post", padding="post")

In [57]:
test_padded

array([[3739,  696,  235, ...,    0,    0,    0],
       [  71,    3,  129, ...,    0,    0,    0],
       [1448, 1186, 1882, ...,    0,    0,    0],
       ...,
       [ 151,    1, 1256, ...,    0,    0,    0],
       [1256,  448,   15, ...,    0,    0,    0],
       [ 151,  204,  539, ...,    0,    0,    0]])

In [58]:
print(train.text[0])
print(train_sequences[0])

deeds reason earthquake may allah forgive us
[3739, 696, 235, 41, 1282, 3740, 14]


In [59]:
word_index = tokenizer.word_index
print("Number of unique words:", len(word_index))

Number of unique words: 15470


In [62]:
word_index["turkish"]

1554

In [63]:
print(test_sequences[0])

[3739, 696, 235, 41, 1282, 3740, 14]


In [65]:
print(train.text[train_size])

nowplaying sinking fast never north east unsigned radio listen


In [84]:
# Create the embedding dictionary
# Source: https://nlp.stanford.edu/projects/glove/
embedding_dict = {}
with open("glove.twitter.27B/glove.twitter.27B.100d.txt","r", encoding="utf8") as f:
    for line in f:
        values  = line.split()
        word    = values[0]
        vectors = np.asarray(values[1:], "float32")
        embedding_dict[word] = vectors
f.close()

In [85]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 100))

for word, i in word_index.items():
    if i<num_words:
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i] = emb_vec

In [86]:
## Baseline Model with GloVe
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

model = Sequential()

model.add(
    Embedding(
        num_words,
        100,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=max_len,
        trainable=False,
    )
)
model.add(LSTM(100, dropout=0.1))
model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=3e-4)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [87]:
history = model.fit(
    train_padded,
    train_labels,
    epochs=20,
    validation_data=(test_padded,test_labels),
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [88]:
sequences = tokenizer.texts_to_sequences(test.text)
padded    = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

In [89]:
pred     = model.predict(padded)
pred_int = pred.round().astype('int')



In [90]:
pred

array([[0.86226964],
       [0.9131665 ],
       [0.9808945 ],
       ...,
       [0.986975  ],
       [0.72680193],
       [0.6390544 ]], dtype=float32)

In [91]:
padded[5]

array([235,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [92]:
model.predict(padded[5].reshape(1,-1))



array([[0.9538511]], dtype=float32)

In [93]:
# Check inverse
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])

In [94]:
def decode(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [95]:
decode(sequences[5])

'earthquake'