<a href="https://colab.research.google.com/github/KimaniKibuthu/Disaster-Tweets/blob/main/Natural_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Disaster Tweets

Know which tweets are of disaster and which ones aren't

# Methodology
1. Data Obtainance
2. Data Preparation
3. Modelling

# Libraries and Variables

In [22]:
# Libraries

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Embedding, Dense

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Data Obtainance


In [2]:
# Obtain data
def setup(kaggle_name, kaggle_key):
  # Setup the username and ID
  os.environ["KAGGLE_USERNAME"] = kaggle_name
  os.environ["KAGGLE_KEY"] = kaggle_key

  print('Done')

In [3]:
# Get data
setup('kimanikibuthu', 'f85c0bb5d43058fddcce7902e1325677')

!kaggle competitions download -c nlp-getting-started

Done
Downloading train.csv to /content
  0% 0.00/965k [00:00<?, ?B/s]
100% 965k/965k [00:00<00:00, 63.4MB/s]
Downloading test.csv to /content
  0% 0.00/411k [00:00<?, ?B/s]
100% 411k/411k [00:00<00:00, 130MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/22.2k [00:00<?, ?B/s]
100% 22.2k/22.2k [00:00<00:00, 20.2MB/s]


In [4]:
# Load into variables
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
submission = pd.read_csv('/content/sample_submission.csv')

# Data Preparation

**General Inspection**

In [5]:
# View the data
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# View test data
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# Check for missing values
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [8]:
# Get the text and target column
train_subset = train[['text', 'target']]

train_subset.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
# View target distribution
train_subset.target.value_counts(normalize=True)*100

0    57.034021
1    42.965979
Name: target, dtype: float64

**Text Preprocessing**

In [10]:
# text & target values
texts = train_subset['text'].values
target = train_subset['target'].values



In [11]:
# View texts to see preprocessing steps
for text in texts:
  print(text)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Our doctors and nurses in the new Pediatric Emergency Department are all specialized in child services! http://t.co/k1TMLWvjmJ
#MissionHills CA #Nursing : Registered Nurse - Emergency Department ( Full Time... at Providence Health &amp; Services http://t.co/Z5grLREy6V
Just saw a car on the I-77 Fully engulfed in flames hahah
Men escape car engulfed in flames in Parley's Canyon crews investigating cause - http://t.co/P6cyLz5lpt http://t.co/Jpu9gIps9f
Men escape car engulfed in flames in Parley's Canyon crews investigating cause - http://t.co/CYzlshlQhG http://t.co/nDiS8f1vzt
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam
#TRAFFICALERT  Eastbound 210 Freeway at Citrus Ave in Azusa. Two motorcycles involved in accident with one fully engulfed in flames in lanes
Men escape car engulfed in flames in Parley's Canyon crews investigating cause - http://t

In [12]:
# Define Tokenizer

def tokenizer(texts):
  # tokenize text
  for text in texts:
    tokens = word_tokenize(text)

    lemmatizer = WordNetLemmatizer()
    # lemmatize and remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]
    text = ' '.join(tokens)
  
  tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
  tokenizer.fit_on_texts(texts)
  #Get the word index
  word_index = tokenizer.word_index
  words = len(word_index) + 1

  # Create padded sequences
  sequences = tokenizer.texts_to_sequences(texts)
  seq_length = max([len(x) for x in sequences])
  padded_sentences = pad_sequences(sequences, maxlen=seq_length, padding='post')

  return padded_sentences, words, word_index




In [13]:
# Preprocess text

def preprocess_text(texts):
  for text in texts:
    # Make lower
    text = text.lower()

    # Remove url
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove placeholders
    text = re.sub(r"\[video\]", '', text)
    text = re.sub(r'{link}', '', text)

    # HTML and non letter characters
    text = re.sub(r'&[a-z]+;', '', text)
    text = re.sub(r"[^a-z\s\(\-:\)\\\/\];='#]", '', text)

    # Remove mentions
    text = re.sub('@[\w]+','', text)

  # Tokenize texts
  padded_sentences, word_length, word_index = tokenizer(texts)

  return padded_sentences, word_length, word_index






In [14]:
# Prepare the texts for modelling
padded_texts, word_length, word_index = preprocess_text(texts)

In [15]:
# View shape
print(padded_texts.shape)
print(target.shape)


(7613, 33)
(7613,)


In [16]:
# Split data
x_train, x_prel, y_train, y_prel = train_test_split(padded_texts,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=42)

x_val, x_test, y_val, y_test = train_test_split(x_prel, y_prel,
                                                test_size= 0.5,
                                                random_state=42)

# Modelling
Two methods will be approached:
1. Using glove as an embedding layer and normal LSTM
2. Using BERT

## Glove & LSTM

In [17]:
# Get the embedding layer 
!wget --no-check-certificate \
    http://nlp.stanford.edu/data/glove.twitter.27B.zip \
    -o \content\glove.zip

In [18]:
#Unzip the data
!unzip /content/glove.twitter.27B.zip


Archive:  /content/glove.twitter.27B.zip
  inflating: glove.twitter.27B.25d.txt  
  inflating: glove.twitter.27B.50d.txt  
  inflating: glove.twitter.27B.100d.txt  
  inflating: glove.twitter.27B.200d.txt  


In [19]:
embeddings_index = {}
with open('/content/glove.twitter.27B.100d.txt', 'r') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
  
  f.close()



In [20]:
# Create embeddings matrix

embedding_matrix = np.zeros((word_length, 100))
for word , i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [23]:
model = Sequential([
                        Embedding(word_length, 100, input_length=33, weights=[embedding_matrix], trainable=False),
                        Bidirectional(LSTM(100, return_sequences=True)),
                        Bidirectional(LSTM(100)),
                        Dense(16, activation='relu'),
                        Dense(16, activation='relu'),
                        Dense(1, activation='sigmoid')
])

model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])


In [24]:
history = model.fit(x_train, 
              y_train,
              batch_size = 32,
              epochs=50,
              validation_data = (x_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
