In [162]:
import pandas as pd
import numpy as np

import string
import re

import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import optimizers
from tensorflow.keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuanz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yuanz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yuanz\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [191]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

(3263, 4)


In [164]:
train_data.iloc[0:100]

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
95,137,accident,Charlotte,9 Mile backup on I-77 South...accident blockin...,1
96,138,accident,"Baton Rouge, LA",Has an accident changed your life? We will hel...,0
97,139,accident,"Hagerstown, MD",#BREAKING: there was a deadly motorcycle car a...,1
98,141,accident,"Gloucestershire , UK",@flowri were you marinading it or was it an ac...,0


In [165]:
def getCleanText(text):
    text = [c for c in text if c not in string.punctuation]
    text = ''.join(text)
    return text

In [166]:
train_data["clean"] = train_data["text"].apply(getCleanText)

In [167]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


In [168]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'EMOJI', text)

def remove_not_ASCII(text):
    text = ''.join([word for word in text if word in string.printable])
    return text

def transcription_sad(text):
    eyes = "[8:=;]"
    nose = "['`\-]"
    smiley = re.compile(r'[8:=;][\'\-]?[(\\/]')
    return smiley.sub(r'SADFACE', text)

def transcription_smile(text):
    eyes = "[8:=;]"
    nose = "['`\-]"
    smiley = re.compile(r'[8:=;][\'\-]?[)dDp]')
    return smiley.sub(r'SMILE', text)

def transcription_heart(text):
    heart = re.compile(r'<3')
    return heart.sub(r'HEART', text)

In [169]:
def clean_tweet(text):
    text = remove_not_ASCII(text)
    text = remove_emoji(text)
    text = transcription_sad(text)
    text = transcription_smile(text)
    text = transcription_heart(text)
  
    return text

In [170]:
train_data["clean"] = train_data["clean"].apply(clean_tweet)

In [171]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...


In [172]:
def getTokens(text):
    tokenizer = RegexpTokenizer(r'[^\W_]+')
    words = tokenizer.tokenize(text)

    words = [word for word in words if word not in stopwords.words('english')]

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word.lower()) for word in words]
    
    return words

In [173]:
train_data["clean"] = train_data["clean"].apply(getTokens)

In [174]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[our, deed, reason, earthquake, may, allah, fo..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[all, resident, asked, shelter, place, notifie..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfire, evacuation,..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, sent, photo, ruby, alaska, smoke, ..."


In [175]:
numWords = 3000
tokenizer=Tokenizer(num_words=numWords,split=' ')
tokenizer.fit_on_texts(train_data['clean'].values)
X = tokenizer.texts_to_sequences(train_data['clean'].values)
X = pad_sequences(X, maxlen=35)

In [176]:
X[:100], X.shape

(array([[   0,    0,    0, ...,   85, 1507,    9],
        [   0,    0,    0, ...,  180,  554, 1155],
        [   0,    0,    0, ...,  506,  411, 1035],
        ...,
        [   0,    0,    0, ...,  334, 2023,  132],
        [   0,    0,    0, ...,    0,    0,   75],
        [   0,    0,    0, ...,   57,  255,  770]]),
 (7613, 35))

In [177]:
y = train_data['target']

In [178]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [179]:
X_train.shape

(6090, 35)

In [180]:
model = keras.Sequential([
    keras.layers.Embedding(input_dim=3000, output_dim=32, input_length=35),
    keras.layers.Dropout(0.2), 
    keras.layers.LSTM(64,
                    dropout=0.2, 
                    return_sequences=True, 
                    recurrent_initializer='glorot_uniform'), 
    keras.layers.Dropout(0.2),
    keras.layers.LSTM(64,
                    dropout=0.2, 
                    recurrent_initializer='glorot_uniform'), 
    keras.layers.Dropout(0.2), 
    keras.layers.Dense(1, activation='sigmoid')
    ])

In [181]:
model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 35, 32)            96000     
                                                                 
 dropout_25 (Dropout)        (None, 35, 32)            0         
                                                                 
 lstm_21 (LSTM)              (None, 35, 64)            24832     
                                                                 
 dropout_26 (Dropout)        (None, 35, 64)            0         
                                                                 
 lstm_22 (LSTM)              (None, 64)                33024     
                                                                 
 dropout_27 (Dropout)        (None, 64)                0         
                                                                 
 dense_18 (Dense)            (None, 1)               

In [182]:
adam = optimizers.Adam(learning_rate=0.002)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [183]:
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [198]:
test_data["clean"] = test_data["text"].apply(getCleanText)
test_data["clean"] = test_data["clean"].apply(clean_tweet)
test_data["clean"] = test_data["clean"].apply(getTokens)

In [199]:
test_data.head()

Unnamed: 0,id,keyword,location,text,clean
0,0,,,Just happened a terrible car crash,"[just, happened, terrible, car, crash]"
1,2,,,"Heard about #earthquake is different cities, s...","[heard, earthquake, different, city, stay, saf..."
2,3,,,"there is a forest fire at spot pond, geese are...","[forest, fire, spot, pond, goose, fleeing, acr..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,"[apocalypse, lighting, spokane, wildfire]"
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kill, 28, china, taiwan]"


In [202]:
numWords = 3000
tokenizer=Tokenizer(num_words=numWords,split=' ')
tokenizer.fit_on_texts(test_data['clean'].values)
test = tokenizer.texts_to_sequences(test_data['clean'].values)
test = pad_sequences(test, maxlen=35)

In [207]:
prediction = model.predict(test).round()
submission = pd.read_csv("./sample_submission.csv")
submission['target'] = np.round(prediction).astype('int')
submission.to_csv('submission.csv', index=False)
submission.describe()



Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.391358
std,3146.427221,0.488129
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,1.0
max,10875.0,1.0
