In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

### Load the dataset

In [2]:
data=pd.read_csv('train.csv')

### Check head and info of the data

In [3]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


### Checking for duplicates

In [5]:
 data.duplicated().sum()

0

### Is there a missing data [how many and the precentage if there]?

In [6]:
data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
missing_data_percentage_percolumn= data.isna().sum() * 100 / len(data)
print('Percentage of missing data is: ',missing_data_percentage_percolumn[1] + missing_data_percentage_percolumn[2],'%')

Percentage of missing data is:  34.073295678444765 %


### How many data in each class?

In [8]:
data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

##### There are 3271 disaster tweets and 4342 that are not

### Get the top 15 locations of the data

In [229]:
data['location'].value_counts().head(15)  

USA                104
New York            71
United States       50
London              45
Canada              29
Nigeria             28
UK                  27
Los Angeles, CA     26
India               24
Mumbai              22
Washington, DC      21
Kenya               20
Worldwide           19
Australia           18
Chicago, IL         18
Name: location, dtype: int64

### Get the top 15 keywords of the data

In [230]:
data['keyword'].value_counts().head(15) 

collision           36
whirlwind           33
armageddon          32
fatalities          32
flames              31
emergency%20plan    31
derailed            31
outbreak            31
sandstorm           31
danger              30
inundated           30
harm                30
damage              30
desolation          30
upheaval            30
Name: keyword, dtype: int64

## Preprocessing

In [9]:
data=data.dropna()

In [10]:
data.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5080 entries, 31 to 7581
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5080 non-null   int64 
 1   keyword   5080 non-null   object
 2   location  5080 non-null   object
 3   text      5080 non-null   object
 4   target    5080 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 238.1+ KB


## Preprocessing the tweets

#### Downloading the english stopwords

In [12]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lapcell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### What are the most common stop words?

In [13]:
def get_stopwords(tweets):
    stopwords=[]
    for tweet in tweets:
        for word in tweet.split():
            if word.lower() in stop_words:
                stopwords.append(word.lower())
    return stopwords

In [14]:
common_stopwords=get_stopwords(data['text'])

#### Applying the lemmatizer

In [15]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lapcell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
from nltk.stem import WordNetLemmatizer
le=WordNetLemmatizer()

In [254]:
def clean_tweet(tweets):
    clean_tweets=[]
    for tweet in tweets:
         #Removing mentions and tags
        tweet=re.sub(r'(#|@)[a-zA-Z0-9_]+','',tweet)
         #Removing links
        tweet=re.sub(r'https?:\/\/\S+','',tweet)
         #Removing punctuations
        tweet=re.sub('\W',' ',tweet)
         #Removing digits
        tweet=re.sub(r'\s[0-9]+\s','',tweet)
         #Removing stand alone characters
        tweet=re.sub('\s[a-zA-Z]\s','',tweet)
         #Removing spaces at the beginning
        tweet=re.sub('^\s+','',tweet)
         #Removing spaces at the end
        tweet=re.sub('\s+$','',tweet)
        
        clean_tweet=[le.lemmatize(word.lower()) for word in tweet.split()  if word.lower() not in stop_words]
        clean_tweets.append(clean_tweet)
    return clean_tweets

## Building the frequency table

In [18]:
def build_freq_dict(tweets,target):
    freq={}
    #key=word
    for i in range(len(tweets)):
        for word in tweets[i]:
            key=word
            if key not in freq.keys():
                if target[i]==1:        #disasterous tweet
                    freq[key]=[1,0]
                else:                   #non-disasterous tweet
                    freq[key]=[0,1]
            else:
                if target[i]==1:
                    freq[key][0]+=1
                else:
                    freq[key][1]+=1
    return freq

In [19]:
text=clean_tweet(data['text'].values)

In [20]:
tweets=[]
target=[]
for i in range(len(data['text'])):
    tweets.append(text[i])
    target.append(data['target'].values[i])

In [21]:
frequency_dict=build_freq_dict(tweets,target)

## Building features

### Applying the tokenizer to convert each word to a unique number

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [23]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(tweets)
sequence=tokenizer.texts_to_sequences(tweets)
#print("Before Tokenization: \n",tweets)
#print("After Tokenization: \n",sequence)
#len(sequence)

### Making all inputs have the same size

In [24]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
padded_sequence=pad_sequences(sequence,maxlen=100)

## Splitting the data into train and test data

In [26]:
from sklearn.model_selection import train_test_split
train_X,test_X,train_y,test_y=train_test_split(np.array(padded_sequence),np.array(target),test_size=0.2,random_state=0)

In [27]:
print('train_x shape: ',train_X.shape)
print('train_y shape: ',train_y.shape)
print('test_x shape: ',test_X.shape)
print('test_x shape: ',test_y.shape)

train_x shape:  (4064, 100)
train_y shape:  (4064,)
test_x shape:  (1016, 100)
test_x shape:  (1016,)


### Reshaping the data for the LSTM Model

In [28]:
train_X=train_X.reshape(train_X.shape[0],1,train_X.shape[1])
test_X=test_X.reshape(test_X.shape[0],1,test_X.shape[1])
train_y=train_y.reshape(-1,1)
test_y=test_y.reshape(-1,1)

In [29]:
print('train_x shape: ',train_X.shape)
print('train_y shape: ',train_y.shape)
print('test_x shape: ',test_X.shape)
print('test_x shape: ',test_y.shape)

train_x shape:  (4064, 1, 100)
train_y shape:  (4064, 1)
test_x shape:  (1016, 1, 100)
test_x shape:  (1016, 1)


## LSTM Model

In [128]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout

In [188]:
model=Sequential()
model.add(LSTM(50,activation='relu',input_shape=(train_X.shape[1],train_X.shape[2])))  
model.add(Dropout(0.3))
model.add(Dense(50,activation='relu'))
model.add(Dense(250,activation='relu'))
model.add(Dense(250,activation='relu'))
model.add(Dense(350,activation='relu'))
model.add(Dense(250,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(150,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))


model.add(Dense(150,activation='relu'))

model.add(Dense(100,activation='relu'))
model.add(Dense(150,activation='relu'))
model.add(Dense(100,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(150,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(100,activation='relu'))
model.add(Dense(150,activation='relu'))
model.add(Dense(100,activation='relu'))
model.add(Dense(150,activation='relu'))
model.add(Dense(100,activation='relu'))
model.add(Dense(100,activation='relu'))


model.add(Dense(1,activation='sigmoid'))
model.summary()

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_20 (LSTM)              (None, 50)                30200     
                                                                 
 dropout_20 (Dropout)        (None, 50)                0         
                                                                 
 dense_610 (Dense)           (None, 50)                2550      
                                                                 
 dense_611 (Dense)           (None, 250)               12750     
                                                                 
 dense_612 (Dense)           (None, 250)               62750     
                                                                 
 dense_613 (Dense)           (None, 350)               87850     
                                                                 
 dense_614 (Dense)           (None, 250)             

In [189]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [190]:
model.fit(train_X,train_y,epochs=70)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x22b3ae5cf40>

In [191]:
y_pred=model.predict(test_X)




#### test_y and y_pred are not of same type
#### test_y : binary
#### y_pred: continous

In [192]:
y_predict=[]
for i in y_pred:
    if i>=0.5:
        y_predict.append(1)
    else:
        y_predict.append(0)

In [193]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(test_y,y_predict)
print('Model accuracy is: ',accuracy*100,'%')

Model accuracy is:  60.03937007874016 %


### Saving the model

In [246]:
model.save('model.h5')

In [248]:
from keras.models import load_model
model=load_model('model.h5')

### Testing the model 

In [249]:
test_data=pd.read_csv('test.csv')
test_data

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [250]:
test_data=test_data.dropna()
test_data

Unnamed: 0,id,keyword,location,text
15,46,ablaze,London,Birmingham Wholesale Market is ablaze BBC News...
16,47,ablaze,Niall's place | SAF 12 SQUAD |,@sunkxssedharry will you wear shorts for race ...
17,51,ablaze,NIGERIA,#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriag...
18,58,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...
19,60,ablaze,"Los Angeles, Califnordia",PSA: IÛªm splitting my personalities.\n\n?? t...
...,...,...,...,...
3246,10804,wrecked,Love Reiss,@yakubOObs think he deactivated because his no...
3247,10806,wrecked,Seattle Washington,RT CNBC '3 words from Disney CEO Bob Iger wrec...
3248,10807,wrecked,Acey mountain islanddåÇTorontoåÈ,Smackdown tyme this should put me in a good mo...
3249,10816,wrecked,los angeles,@thrillhho jsyk I haven't stopped thinking abt...


In [255]:
test=['What a nice hat?']
x=tokenizer.texts_to_sequences(clean_tweet(test))
padded_sequence=pad_sequences(x,maxlen=100)
print('Padded Sequence: ',padded_sequence)
padded_sequence=padded_sequence.reshape(padded_sequence.shape[0],1,padded_sequence.shape[1])

Padded Sequence:  [[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0 452]]


In [256]:
result=np.argmax(model.predict(np.array(padded_sequence)))
print(result)

0


###### Non-disasterous tweet detected