> Project summary

# EDA & Preprocessing
 - EDA
 - PREPROCESSING
# Build models
 - Model 1 | 5 layers
 - Model 2 | 5 layers & dropouts

> Project summary

# EDA & Preprocessing
 - EDA
 - PREPROCESSING
# Build models
 - Model 1 | 5 layers
 - Model 2 | 5 layers & dropouts

In [1]:
# Import libraries
import pandas as pd 
import numpy as np
import os

from sklearn.model_selection import train_test_split

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D

import plotly.express as px
import matplotlib.pyplot as plt

# --------- EDA & Preprocessing ------------

In [2]:
# Import Dataset
df= pd.read_csv('train.csv')

## ---------EDA------------

In [3]:
# Diplay informations for the train set
display(df.head())
print()
print(df.info())
print()
display(df.describe(include='all'))

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None



Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


In [6]:
fig = px.histogram(df, 
                   y='keyword', 
                   color = 'target',
                   title="Keywords count for the target"
                  )
fig.update_layout(width=800, 
                  title_x = 0.5,
                  height=3000,
                  margin=dict(l=50,r=50,b=50,t=50,pad=4),
                  template = 'plotly_dark',
                  barmode="relative",
                  yaxis={'categoryorder':'total ascending'}
                 )
fig.update_yaxes(tickfont_size=10)
fig.show()

In [7]:
# Create vairable to display target ratio
target_ratio = (df['target'].value_counts(normalize=True)*100).rename_axis('target').reset_index(name='counts')

In [8]:
# Display disater tweet ratio in the train set
fig = px.pie(target_ratio,
             values='counts',
             names='target', 
             width= 1000,
             title='Proportion of disaster tweets'
             )
fig.update_traces(textposition = 'outside', textfont_size =20)             
fig.update_layout(title_x = 0.5, 
                  margin=dict(l=50,r=50,b=50,t=50,pad=4), 
                  template = 'plotly_dark'
                  )    
fig.show()

> The EDA reveal heterogeneity in the dataset

> Lot of keyword does not correspond to a disaster tweet

> The train set is unbalanced for the target

## ----------PREPROCESSING-----------

In [None]:
# Creation of a clean text column 
# Import the english language model
nlp = spacy.load("en_core_web_sm")
# Delete non int or spaces
df["text_clean"] = df["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch == " "))
# Lower string and delete start/end spaces
df["text_clean"] = df["text_clean"].apply(lambda x: x.lower().strip())
# delete urls
df["text_clean"] = df['text_clean'].str.replace(r"http.*","")
# Lematization 
df["text_clean"] = df["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) and (token.text not in STOP_WORDS)]))

In [None]:
# Check if there is any non-string in text_clean
df.text_clean.value_counts()

In [None]:
# Clean it
mask = df.text_clean.apply(lambda x: type(x)==str)
# Check it
mask.value_counts()

In [None]:
# Filter dataset
df = df.loc[mask,:]

In [None]:
# Instanciate the tokenizer and set it up to keep only the 1000 most common words
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = 1000)
# Fit on text_clean
tokenizer.fit_on_texts(df.text_clean)
# Create new column
df["txt_encoded"] = tokenizer.texts_to_sequences(df.text_clean)
df.head()

In [None]:
# Store encoded texts into single numpy array of same length by adding zero padding at the end
text_pad = tf.keras.preprocessing.sequence.pad_sequences(df.txt_encoded, padding = "post")

In [None]:
# Train Test Split
xtrain, xval, ytrain, yval = train_test_split(text_pad,df.target, test_size = 0.3, random_state = 0)

In [None]:
# Create the tensor dataset for the training, and validation set
train = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
val = tf.data.Dataset.from_tensor_slices((xval, yval))

In [None]:
# Shuffle data and create batch on both set
train_batch = train.shuffle(len(train)).batch(64)
val_batch = val.shuffle(len(val)).batch(64)

In [None]:
# Look at a batch of data
for tweet, meaning in train_batch.take(1):
  print(tweet, meaning)

# --------- Build models -----------

In [None]:
# Define input dimension for the embedding layer
vocab_size = tokenizer.num_words + 1
# Set the optimizer 
optimizer= tf.keras.optimizers.Adam(0.0001)
# Define scheduler function for decrease learning rate
def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)
# Define callbacks from the scheduler function
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

## Model 1

> 3 Dense layers

In [None]:
# Build the model
model = tf.keras.Sequential([Embedding(vocab_size, 64, input_shape=[df.shape[1]], name="embedding"),
                             GlobalAveragePooling1D(),
                             Dense(32,activation="relu"),
                             Dense(16, activation="relu"),
                             Dense(1, activation="sigmoid")
                            ])
# Display model structure
model.summary()

In [None]:
# Compile model with the optimizer, the loss fonction and the metrics
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()]
             )
''' 
Binary Cross Entropy is the negative average of the log of corrected predicted probabilities.
Cross-entropy will calculate a score that summarizes the average difference between 
the actual and predicted probability distributions for predicting class 1. 
The score is minimized and a perfect cross-entropy value is 0.
'''

In [None]:
# Fit the model to the train set
history = model.fit(train_batch, 
                    epochs=100,
                    callbacks=[callback],
                    validation_data=val_batch
                   )

In [None]:
# Visualization of the training process for the loss
plt.plot(history.history["loss"], color="b")
plt.plot(history.history["val_loss"], color="r")
plt.ylabel("loss")
plt.xlabel("Epochs")
plt.show()

In [None]:
# Visualization of the training process for the accuracy
plt.plot(history.history["binary_accuracy"], color="b")
plt.plot(history.history["val_binary_accuracy"], color="r")
plt.ylabel("acurracy")
plt.xlabel("Epochs")
plt.show()

Scores for 100 epochs: 

> loss: 0.4088 | val_loss: 0.4574 

> binary_accuracy: 0.8264 | val_binary_accuracy: 0.7942

## Model 2 

> 4 Dense Layers

> 3 Dropout Layers

In [None]:
# Build model
model2 = tf.keras.Sequential([Embedding(vocab_size, 64, input_shape=[df.shape[1]], name="embedding"),
                             GlobalAveragePooling1D(),
                             Dense(32,activation="relu"),
                             Dropout(0.2),
                             Dense(16, activation="relu"),
                             Dropout(0.2),
                             Dense(8, activation="relu"),
                             Dropout(0.2),
                             Dense(1, activation="sigmoid")
                            ])
'''During training, some number of layer outputs are randomly ignored or “dropped out.” 
This has the effect of making the layer look-like and be treated-like a layer with a different number of nodes and connectivity to the prior layer. 
In effect, each update to a layer during training is performed with a different “view” of the configured layer.
'''
# Display model structure
model2.summary()

In [None]:
# Compile model with the optimizer, the loss fonction and the metrics
model2.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()]
             )

In [None]:
# Fit the second model to the train set
history2 = model2.fit(train_batch,
                      epochs=100,
                      callbacks=[callback],
                      validation_data=val_batch
                      )

In [None]:
# Visualization of the training process for the loss
plt.plot(history2.history["loss"], color="b")
plt.plot(history2.history["val_loss"], color="r")
plt.ylabel("loss")
plt.xlabel("Epochs")
plt.show()

In [None]:
# Visualization of the training process for the accuracy
plt.plot(history2.history["binary_accuracy"], color="b")
plt.plot(history2.history["val_binary_accuracy"], color="r")
plt.ylabel("acurracy")
plt.xlabel("Epochs")
plt.show()

Scores for 100 epochs: 
> loss: 0.5550 | val_loss: 0.5552

> binary_accuracy: 0.7902  | val_binary_accuracy: 0.7771

Conclusion: 

> The models results are noticeably the same for the accuracy but the second model have more loss

> The First model with fewer layers is better to classify tweets.