# **libraries**

In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
from nltk.corpus import stopwords
nltk.download('punkt') 
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,LSTM, Embedding, Dropout
from tensorflow.keras.models import Sequential
import os

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Exploaring the dataset**

In [2]:
df_train_data = pd.read_csv('/content/train.csv')
df_test_data = pd.read_csv('/content/test.csv')

In [3]:
df_train_data.head()

Unnamed: 0,ids,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
df_train_data.isna().sum() * 100 / len(df_train_data)

ids          0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64

In [6]:
df_test_data.isna().sum() * 100 / len(df_test_data)

id           0.000000
keyword      0.796813
location    33.864542
text         0.000000
dtype: float64

In [7]:
df_train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

# **Data Cleaning**

In [8]:
CLEANR = re.compile('<.*?>') 

In [9]:
# Remove Tags
def remove_tags(text):
  cleantext = re.sub(CLEANR, '', text)
  return cleantext

In [10]:
df_train_data['clean_text']= df_train_data['text'].apply(lambda x:remove_tags(x))
df_test_data['clean_text']= df_test_data['text'].apply(lambda x:remove_tags(x))

In [11]:
 # Remove Punctuation
def remove_punctuation(text):
  text = text.replace('"', "")
  text = text.replace("'", "")
  stop_chars = string.punctuation
  punctuationfree="".join([i for i in text if i not in stop_chars])
  return punctuationfree

In [12]:
df_train_data['clean_text']= df_train_data['clean_text'].apply(lambda x:remove_punctuation(x))
df_test_data['clean_text']= df_test_data['clean_text'].apply(lambda x:remove_punctuation(x))

In [13]:
# Remove_links
df_train_data['clean_text'] = df_train_data['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
df_test_data['clean_text'] = df_train_data['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

In [14]:
# lowering the text
df_train_data['clean_text']= df_train_data['clean_text'].apply(lambda x: x.lower())
df_test_data['clean_text']= df_test_data['clean_text'].apply(lambda x: x.lower())

In [15]:
def tokenization(text):
  tokens = re.split('\W+',text)
  return tokens

In [16]:
df_train_data['clean_text']= df_train_data['clean_text'].apply(lambda x:tokenization(x))
df_test_data['clean_text']= df_test_data['clean_text'].apply(lambda x:tokenization(x))

In [17]:
# Remove Stop Words
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [18]:
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [19]:
df_train_data['clean_text']= df_train_data['clean_text'].apply(lambda x:remove_stopwords(x))
df_test_data['clean_text']= df_test_data['clean_text'].apply(lambda x:remove_stopwords(x))

In [20]:
df_train_data.head(5)

Unnamed: 0,ids,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfires, evacuati..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [21]:
# Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
#defining a function for stemming
def lemmatizer(text):
  lmem_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lmem_text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [22]:
df_train_data['clean_text']=df_train_data['clean_text'].apply(lambda x:lemmatizer(x))
df_test_data['clean_text']=df_test_data['clean_text'].apply(lambda x:lemmatizer(x))

In [23]:
df_train_data

Unnamed: 0,ids,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquake, may, allah, forgive..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resident, asked, shelter, place, notified, of..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13, 000, people, receive, wildfire, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,"[two, giant, crane, holding, bridge, collapse,..."
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,"[, aria_ahrary, thetawniest, control, wild, fi..."
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[m1, 94, 01, 04, utc, 5km, volcano, hawaii, ]"
7611,10872,,,Police investigating after an e-bike collided ...,1,"[police, investigating, e, bike, collided, car..."


In [24]:
df_test_data

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,"[deed, reason, earthquake, may, allah, forgive..."
1,2,,,"Heard about #earthquake is different cities, s...","[forest, fire, near, la, ronge, sask, canada]"
2,3,,,"there is a forest fire at spot pond, geese are...","[resident, asked, shelter, place, notified, of..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,"[13, 000, people, receive, wildfire, evacuatio..."
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[got, sent, photo, ruby, alaska, smoke, wildfi..."
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,"[feel, engulfed, low, self, image, take, quiz, ]"
3259,10865,,,Storm in RI worse than last hurricane. My city...,"[man, equally, incapable, seeing, nothingness,..."
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,"[fully, engulfed, garage, fire, propane, tank,..."
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,"[came, land, engulfed, tribal, war, turned, la..."


# **Counter**

In [25]:
stop_words  = set(stopwords)
text_to_str = ' '.join(df_train_data['text'])
txt_tokenization = (nltk.word_tokenize(text_to_str))
txt= [word for word in txt_tokenization if word.isalnum()]
print(txt)



In [26]:
stop_word=[]
for i in stop_words:
   for j in txt:
    if i == j:
        stop_word.append(i)
#print(stop_word)
Counters_found = Counter(stop_word)
most_occur = Counters_found.most_common(15)
print ("most common stop words is : ", most_occur) 

most common stop words is :  [('the', 2591), ('a', 1872), ('to', 1814), ('in', 1777), ('of', 1729), ('and', 1313), ('is', 840), ('for', 826), ('on', 789), ('you', 740), ('it', 605), ('that', 551), ('my', 550), ('with', 517), ('at', 491)]


In [27]:
word=[]
for w in txt_tokenization:
    if w not in stop_words and w.isalnum():
      word.append(w)
Counters_Found = Counter(word)
most_Occur = Counters_Found.most_common(15)
print ("most common words is : ", most_Occur)

most common words is :  [('http', 4307), ('I', 1546), ('The', 557), ('https', 409), ('amp', 344), ('like', 321), ('A', 300), ('via', 212), ('get', 184), ('fire', 175), ('people', 166), ('2', 162), ('In', 159), ('one', 156), ('would', 128)]


In [28]:
df_train_data['keyword'].value_counts(15)

fatalities               0.005959
deluge                   0.005561
armageddon               0.005561
sinking                  0.005429
damage                   0.005429
                           ...   
forest%20fire            0.002516
epicentre                0.001589
threat                   0.001457
inundation               0.001324
radiation%20emergency    0.001192
Name: keyword, Length: 221, dtype: float64

In [29]:
df_test_data['keyword'].value_counts(15)

deluged               0.007105
demolished            0.006796
rubble                0.006796
first%20responders    0.006487
seismic               0.006487
                        ...   
threat                0.001545
fatalities            0.001545
forest%20fire         0.001545
inundation            0.001236
epicentre             0.000309
Name: keyword, Length: 221, dtype: float64

In [30]:
df_train_data['location'].value_counts(15)

USA                    0.020472
New York               0.013976
United States          0.009843
London                 0.008858
Canada                 0.005709
                         ...   
MontrÌ©al, QuÌ©bec     0.000197
Montreal               0.000197
ÌÏT: 6.4682,3.18287    0.000197
Live4Heed??            0.000197
Lincoln                0.000197
Name: location, Length: 3341, dtype: float64

In [31]:
df_test_data['location'].value_counts(15)

New York                  0.017609
USA                       0.017146
Worldwide                 0.007414
United States             0.006951
London                    0.006024
                            ...   
Medford, NJ               0.000463
Quezon City               0.000463
LanÌ¼s                    0.000463
USA,Washington,Seattle    0.000463
Brussels, Belgium         0.000463
Name: location, Length: 1602, dtype: float64

#**LSTM Model**

In [32]:
df_train_data.drop(['ids', 'keyword', 'location', 'text'], axis=1, inplace=True)
df_test_data.drop(['id', 'keyword', 'location', 'text'], axis=1, inplace=True)

In [33]:
train_text = df_train_data['clean_text']
y = df_train_data['target']

In [34]:
max_len = 200
max_words = 20000

In [35]:
tokenizer = Tokenizer(num_words=max_words)

In [36]:
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index
len(word_index)


15696

In [37]:
from numpy.ma.core import shape
sequences = tokenizer.texts_to_sequences(train_text)
X = pad_sequences(sequences, maxlen=max_len)


In [38]:
test_data = tokenizer.texts_to_sequences(df_test_data['clean_text'])
test_data = pad_sequences(test_data, maxlen=max_len)
test_data.shape

(3263, 200)

In [39]:
y = np.array(y).reshape((-1,1))
y.shape

(7613, 1)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17, test_size=0.15)

In [41]:
model = Sequential()
model.add(Embedding(max_words, 50))
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(LSTM(20, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 50)          1000000   
                                                                 
 lstm (LSTM)                 (None, None, 50)          20200     
                                                                 
 lstm_1 (LSTM)               (None, 20)                5680      
                                                                 
 dropout (Dropout)           (None, 20)                0         
                                                                 
 dense (Dense)               (None, 10)                210       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,026,101
Trainable params: 1,026,101
Non-

In [42]:
epochs = 4
batch_size = 32
history = model.fit(X,y,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,
          shuffle=True)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [43]:
checkpoint_path = "/content/sample_data/trained"
checkpoint_dir = os.path.dirname(checkpoint_path)
the_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
