In [10]:
# Basic Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set Style of Viz
sns.set_style("darkgrid")
# sns.set_palette(palette='dark:#5A9_r')

# Magic lines
%matplotlib inline
%load_ext autoreload
%autoreload 2

# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras import layers

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

#sk
from sklearn.model_selection import train_test_split


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# import data
data =pd.read_csv('/Users/mathildedeschaud/code/Madeschaud/911-tweet/Data/clean_data.csv')
data.head()


Unnamed: 0.1,Unnamed: 0,tweet_id,tweet_text,class_label,disaster_type,disaster_year,country,tweet_clean,words_per_tweet,actionable
0,0,1107353602434183169,Maybe armchair critics like @lilomatic can app...,caution_and_advice,cyclone,2019,"Madagascar, Mozambique, Malawi, Zimbabwe",maybe armchair critic like appreciate real sit...,25,0
1,1,1108735335733825536,RT @povozim: Total Zimbabwe on the fuel situat...,other_relevant_information,cyclone,2019,"Madagascar, Mozambique, Malawi, Zimbabwe",total zimbabwe fuel situation country followin...,9,0
2,2,1107658470508191744,.@Emuia briefed Mr Cho Jaichel on the coordina...,sympathy_and_support,cyclone,2019,"Madagascar, Mozambique, Malawi, Zimbabwe",briefed mr cho jaichel coordinated effort agen...,20,0
3,3,1113073066131800064,Disease outbreaks: INGC maps potential risk ar...,caution_and_advice,cyclone,2019,"Madagascar, Mozambique, Malawi, Zimbabwe",disease outbreak ingc map potential risk area ...,9,0
4,4,1108234590068637696,Goodmorning to you all a smile will brighten s...,sympathy_and_support,cyclone,2019,"Madagascar, Mozambique, Malawi, Zimbabwe",goodmorning smile brighten someone day giving ...,19,0


In [12]:
# set params
max_features =10000
max_len=300
embedding_dim=50

In [13]:
#identify X,y
X = data['tweet_clean']
y = data.actionable

#split data
X_train, X_test,y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=42)

In [14]:
# tokenize
tk =Tokenizer()
tk.fit_on_texts(X_train)
vocab_size = len(tk.word_index)
print(f'There are {vocab_size} different words in your corpus')
X_train_token = tk.texts_to_sequences(X_train)

There are 73693 different words in your corpus


In [15]:
# Pad the inputs
X_pad = pad_sequences(X_train_token, dtype='float32', padding='pre')

In [19]:
X_pad


array([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.3000e+01, 1.7400e+02,
        6.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4000e+01, 8.0000e+00,
        2.0563e+04],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0500e+02, 5.6000e+01,
        9.1000e+01],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9320e+03, 4.0300e+02,
        7.3692e+04],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.1000e+03, 1.5200e+02,
        8.2300e+02],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2000e+01, 7.8000e+02,
        6.0600e+02]], dtype=float32)

In [22]:
# build model
model = Sequential()
#embedding
model.add(Embedding(input_dim=vocab_size+1,output_dim=2, mask_zero=True))

#lstm
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))

#compile
model.compile(loss='binary_crossentropy', optimizer= 'rmsprop', metrics=['accuracy'])

In [23]:
# Train the model
model.fit(X_pad, y_train, batch_size=128, epochs=5, validation_split=0.2)
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')

Epoch 1/5


In [None]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 500, 2)            81134     
                                                                 
 lstm_5 (LSTM)               (None, 500, 64)           17152     
                                                                 
 lstm_6 (LSTM)               (None, 500, 32)           12416     
                                                                 
 lstm_7 (LSTM)               (None, 16)                3136      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 113855 (444.75 KB)
Trainable params: 113855 (444.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
