In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Import Tensorflow & Pathlib librairies
import pandas as pd 
import numpy as np
import os

from sklearn.model_selection import train_test_split

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D

import plotly.express as px

In [2]:
# Import Dataset
df= pd.read_csv('../input/nlp-getting-started/train.csv')

In [3]:
# Diplay informations for the train set
display(df.head())
print()
print(df.info())
print()
display(df.describe(include='all'))

In [None]:
# Diplay informations for the test set
display(df_test.head())
print()
print(df_test.info())
print()
display(df_test.describe(include='all'))

#                                 EDA

In [14]:
fig = px.histogram(df, 
                   y='keyword', 
                   color = 'target',
                   title="Keywords count for the target"
                  )
fig.update_layout(
    autosize=False,
    width=800,
    height=1500,
    margin=dict(l=50,r=50,b=50,t=50,pad=4),
    paper_bgcolor="LightSteelBlue",
    barmode="relative",
    yaxis={'categoryorder':'total ascending'}
)
fig.update_yaxes(tickfont_size=10)
fig.show()

In [13]:
fig2 = px.histogram(df, 
                   x='target',
                   title="Target count in the training set"
                  )
fig2.update_layout(
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=50,r=50,b=50,t=50,pad=4),
    paper_bgcolor="LightSteelBlue"
)
fig2.show()       

#                             PREPROCESSING

In [15]:
# Import the english language model
nlp = spacy.load("en_core_web_sm")
# Delete non int or spaces
df["text_clean"] = df["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
# Lower string and strip
df["text_clean"] = df["text_clean"].apply(lambda x: x.lower().strip())
# delete urls
df["text_clean"] = df['text_clean'].str.replace(r"http.*","")
# Lematization
df["text_clean"] = df["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) and (token.text not in STOP_WORDS)]))

df.head()

In [16]:
# Check if there is any non-string in text_clean
df.text_clean.value_counts()

In [17]:
# Clean it
mask = df.text_clean.apply(lambda x: type(x)==str)
# Check it
mask.value_counts()

In [19]:
# Filter dataset
df = df.loc[mask,:]

In [20]:
# instanciate the tokenizer and set it up to keep only the 1000 most common words
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
# Fit on text_clean
tokenizer.fit_on_texts(df.text_clean)
# Create new column
df["txt_encoded"] = tokenizer.texts_to_sequences(df.text_clean)
df.head()

In [21]:
# Store encoded texts into single numpy array of same length by adding zero padding at the end
text_pad = tf.keras.preprocessing.sequence.pad_sequences(df.txt_encoded, padding="post")

In [23]:
# Train Test Split
xtrain, xval, ytrain, yval = train_test_split(text_pad,df.target, test_size=0.3)

In [24]:
# create the tensor dataset for the training, and validation set
train = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
val = tf.data.Dataset.from_tensor_slices((xval, yval))

In [25]:
# Shuffle data and create batch on both set
train_batch = train.shuffle(len(train)).batch(64)
val_batch = val.shuffle(len(val)).batch(64)

In [26]:
# Look at a batch of data
for tweet, meaning in train_batch.take(1):
  print(tweet, meaning)

#                         MODELING

In [27]:
vocab_size = tokenizer.num_words
vocab_size

In [28]:
[df.shape]

In [98]:
# Build the model
model = tf.keras.Sequential([Embedding(vocab_size+1, 64, input_shape=[df.shape[1]], name="embedding"),
                             GlobalAveragePooling1D(),
                             Dense(32,activation="relu"),
                             Dense(16, activation="relu"),
                             Dense(1, activation="sigmoid")
                            ])
model.summary()

In [None]:
# Ce modèle donne avec opti (0.00005) + L2(0.001): 
# loss: 0.4019 - binary_accuracy: 0.8499 - val_loss: 0.5406 - val_binary_accuracy: 0.7776
model = tf.keras.Sequential([Embedding(vocab_size+1, 64, input_shape=[df.shape[1]], name="embedding"),
                             GlobalAveragePooling1D(),
                             Dense(32,activation="relu",kernel_regularizer=l2(0.001)),
                             Dense(16, activation="relu",kernel_regularizer=l2(0.001)),
                             Dense(1, activation="sigmoid")
                            ])
model.summary()

In [99]:
# Set the optimizer 
optimizer= tf.keras.optimizers.Adam(0.00005)

# Compile model with the optimizer, the loss fonction and the metrics
# Binary Cross Entropy is the negative average of the log of corrected predicted probabilities.
# Cross-entropy will calculate a score that summarizes the average difference between 
# the actual and predicted probability distributions for predicting class 1. 
# The score is minimized and a perfect cross-entropy value is 0.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()]
             )

In [100]:
history = model.fit(train_batch, 
                    epochs=100, 
                    validation_data=val_batch
                   )

In [101]:
import matplotlib.pyplot as plt

# Visualization of the training process on the loss function 
plt.plot(history.history["loss"], color="b")
plt.plot(history.history["val_loss"], color="r")
plt.ylabel("loss")
plt.xlabel("Epochs")
plt.show()

In [102]:
plt.plot(history.history["binary_accuracy"], color="b")
plt.plot(history.history["val_binary_accuracy"], color="r")
plt.ylabel("acurracy")
plt.xlabel("Epochs")
plt.show()

# learning rate (0.00005) with Adam optimizer :
# 55 epochs: 
loss: 0.3928 - binary_accuracy: 0.8364 - val_loss: 0.4609 - val_binary_accuracy: 0.7960
> Test new model with dropout layers and more dense layer

In [92]:
# Ce modèle donne avec opti (0.00005): 
# 50 epochs: loss: 0.4260 - binary_accuracy: 0.8296 - val_loss: 0.4655 - val_binary_accuracy: 0.7938
model2 = tf.keras.Sequential([Embedding(vocab_size+1, 64, input_shape=[df.shape[1]], name="embedding"),
                             GlobalAveragePooling1D(),
                             Dense(32,activation="relu"),
                             Dropout(0.2),
                             Dense(16, activation="relu"),
                             Dropout(0.2),
                             Dense(8, activation="relu"),
                             Dropout(0.2),
                             Dense(1, activation="sigmoid")
                            ])
model2.summary()

In [94]:
model2.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()]
             )

In [95]:
history2 = model2.fit(train_batch, 
                    epochs=100, 
                    validation_data=val_batch
                   )

In [96]:
plt.plot(history2.history["loss"], color="b")
plt.plot(history2.history["val_loss"], color="r")
plt.ylabel("loss")
plt.xlabel("Epochs")
plt.show()

In [97]:
plt.plot(history2.history["binary_accuracy"], color="b")
plt.plot(history2.history["val_binary_accuracy"], color="r")
plt.ylabel("acurracy")
plt.xlabel("Epochs")
plt.show()

# learning rate (0.00005) with Adam optimizer :
# 39 epochs: 
loss: 0.4331 - binary_accuracy: 0.8283 - val_loss: 0.4596 - val_binary_accuracy: 0.7929

> Second model is equivalent to the first