# Tuning des hyperparamètres du CNN V1 – Numéro 1

### Travail effectué
* CNN Initial
* CNN avec 5 couches
* Meilleurs paramètres à la fin

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

from Fonctions_utils import f1_m

In [None]:
# Load the three dataset splits; the training split comes from the
# undersampled CSV.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data_models/df_train_undersampling.csv",
        "../../data_models/df_val.csv",
        "../../data_models/df_test.csv",
    )
)

In [None]:
# Model inputs: the cleaned text of each split, cast to str so the tokenizer
# always receives strings.
sentences_train, sentences_val, sentences_test = (
    frame["text_clean"].values.astype(str)
    for frame in (df_train, df_val, df_test)
)

# Targets: the "Insult" column of each split.
y_train, y_val, y_test = (
    frame["Insult"].values for frame in (df_train, df_val, df_test)
)

In [None]:
# Fit the word index on the training sentences only, then reuse the same
# tokenizer to encode every split.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (sentences_train, sentences_val, sentences_test)
)

# +1 because index 0 is reserved and never assigned to a word.
# NOTE(review): this counts every entry of word_index, not only the
# num_words most frequent ones - confirm the larger embedding size is wanted.
vocab_size = len(tokenizer.word_index) + 1

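The indices are ordered by word frequency: the most common word in the training text gets index 1 (in ordinary English text this would typically be "the"), the second most common gets index 2, and so on. It is important to note that index 0 is reserved and is never assigned to any word; it is used for padding, which is introduced in the next step. Also note that `num_words=5000` only limits the indices produced by `texts_to_sequences`; `word_index` still contains every word seen during fitting, which is why `vocab_size` (computed from `word_index`) can be larger than 5000.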

One problem is that the text sequences usually have different lengths. To counter this, you can use `pad_sequences()`, which pads each sequence with zeros up to `maxlen` (and truncates sequences that are longer than `maxlen`). By default it prepends zeros, but here we want to append them, hence `padding='post'`. Typically it does not matter whether you prepend or append zeros.

In [None]:
maxlen = 100

# Bring every encoded split to a common length of `maxlen`, appending the
# padding zeros after the tokens ('post').
X_train, X_val, X_test = (
    pad_sequences(encoded, padding='post', maxlen=maxlen)
    for encoded in (X_train, X_val, X_test)
)

In [None]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    """Build and compile the 1D-CNN binary text classifier.

    The network stacks an embedding, one 1D convolution, global max pooling,
    a 10-unit ReLU dense layer and a single sigmoid output unit.

    Args:
        num_filters: Number of filters of the Conv1D layer.
        kernel_size: Kernel size of the Conv1D layer.
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        maxlen: ``input_length`` of the embedding layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, ``f1_m`` and accuracy as metrics).
    """
    embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=maxlen,
    )
    network = Sequential([
        embedding,
        layers.Conv1D(num_filters, kernel_size, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(10, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[f1_m, 'accuracy'],
    )
    return network

In [None]:
epochs = 30

# Search space handed to the randomized search. Only num_filters and
# kernel_size offer several candidates; the other entries are single-valued
# lists that simply forward fixed arguments to create_model.
param_grid = {
    "num_filters": [32, 64, 128],
    "kernel_size": [3, 5, 7, 9],
    "vocab_size": [vocab_size],
    "embedding_dim": [50],
    "maxlen": [maxlen],
}

In [None]:
# Wrap the Keras builder so that scikit-learn can drive it as an estimator.
model = KerasClassifier(
    build_fn=create_model,
    epochs=epochs,
    batch_size=10,
    verbose=False,
)

# Try 4 sampled parameter combinations, each with 4-fold cross-validation.
grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=4,
    cv=4,
    verbose=2,
)
grid_result = grid.fit(X_train, y_train)

# Score the fitted search object on the validation split, then the test split.
val_accuracy, test_accuracy = (
    grid.score(features, targets)
    for features, targets in ((X_val, y_val), (X_test, y_test))
)

In [None]:
# Report the search outcome followed by the scores on the held-out splits.
for _label, _value in (
    ("Best score : ", grid_result.best_score_),
    ("Best params", grid_result.best_params_),
    ("Validation Accuracy : ", val_accuracy),
    ("Test Accuracy : ", test_accuracy),
):
    print(_label, _value)

## Résultat :
* Best score :  0.6386612057685852
* Best params {'vocab_size': 6326, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 9, 'embedding_dim': 50}
* Validation Accuracy :  0.76371306
* Test Accuracy :  0.7548168