# Convolutional Neural Network with own Embedding and fined tuning

### Travail effectué
* xx

### Conclusion
* xxx
* xxx

In [1]:
import pandas as pd
from matplotlib import pyplot as plt

from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [2]:
df_train = pd.read_csv("../../data_models/df_train_undersampling.csv")
df_val = pd.read_csv("../../data_models/df_val.csv")
df_test = pd.read_csv("../../data_models/df_test.csv")

In [3]:
sentences_train = df_train["text_clean"].values.astype(str)
sentences_val = df_val["text_clean"].values.astype(str)
sentences_test = df_test["text_clean"].values.astype(str)

y_train = df_train["Insult"].values
y_val = df_val["Insult"].values
y_test = df_test["Insult"].values

In [4]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_val = tokenizer.texts_to_sequences(sentences_val)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1

The indexing is ordered after the most common words in the text, which you can see by the word the having the index 1. It is important to note that the index 0 is reserved and is not assigned to any word. This zero index is used for padding, which I’ll introduce in a moment

One problem that we have is that each text sequence has in most cases different length of words. To counter this, you can use pad_sequence() which simply pads the sequence of words with zeros. By default, it prepends zeros but we want to append them. Typically it does not matter whether you prepend or append zeros.

In [5]:
maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [6]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [7]:
epochs = 20

In [8]:
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7, 9],
                  vocab_size=[vocab_size], 
                  embedding_dim=[50],
                  maxlen=[maxlen])

In [9]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

In [10]:
model = KerasClassifier(build_fn=create_model, epochs=epochs, batch_size=10, verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=2, n_iter=4)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
val_accuracy = grid.score(X_val, y_val)
test_accuracy = grid.score(X_test, y_test)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


2023-06-06 10:45:11.600055: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-06 10:45:11.601765: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=32, vocab_size=6326; total time=  36.2s
[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=32, vocab_size=6326; total time=  36.3s
[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=32, vocab_size=6326; total time=  41.0s
[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=32, vocab_size=6326; total time=  39.7s
[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=128, vocab_size=6326; total time=  55.9s
[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=128, vocab_size=6326; total time=  58.0s
[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=128, vocab_size=6326; total time= 1.2min
[CV] END embedding_dim=50, kernel_size=7, maxlen=200, num_filters=128, vocab_size=6326; total time= 1.5min
[CV] END embedding_dim=50, kernel_size=5, maxlen=200, num_filters=32, vocab_size=6326; total time= 1.2min
[CV] END embedding_dim=50, kernel_size=5, 

In [11]:
print("Best score : ",grid_result.best_score_)
print("Best params", grid_result.best_params_)
print("Validation Accuracy : ",val_accuracy)
print("Test Accuracy : ",test_accuracy)

Best score :  0.6810109168291092
Best params {'vocab_size': 6326, 'num_filters': 32, 'maxlen': 200, 'kernel_size': 5, 'embedding_dim': 50}
Validation Accuracy :  0.8109705
Test Accuracy :  0.79977334



Best score :  0.6810109168291092
Best params {'vocab_size': 6326, 'num_filters': 32, 'maxlen': 200, 'kernel_size': 5, 'embedding_dim': 50}
Validation Accuracy :  0.8109705
Test Accuracy :  0.79977334