In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer

# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import accuracy_score
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
import mlflow
import mlflow.tensorflow
import mlflow.sklearn
import warnings
from tensorflow.keras.layers import TextVectorization
import matplotlib.pyplot as plt
from tensorflow.keras import backend as K
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
import fasttext

2024-11-09 17:36:18.623416: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-09 17:36:19.281102: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdirectml.d6f03b303ac3c4f2eeb8ca631688c9757b361310.so
2024-11-09 17:36:19.281154: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdxcore.so
2024-11-09 17:36:19.287287: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libd3d12.so
Dropped Escape call with ulEscapeCode : 0x03007703
Dropped Escape call with ulEscapeCode : 0x03007703
2024-11-09 17:36:19.418654: I tensorflow/c/logging.cc:34] DirectML device enumeration: found 1 compatible adapters.


ModuleNotFoundError: No module named 'fasttext'

In [4]:
# Silence FutureWarning messages so they do not clutter the cell outputs
warnings.filterwarnings("ignore", category=FutureWarning)

# Enable tqdm's pandas integration
tqdm.pandas()

# Seed the random number generators used in this notebook.
# NumPy drives the sampling / embedding-matrix initialisation; TensorFlow
# drives the Keras weight initialisers, so it has to be seeded as well for the
# neural-network runs to be repeatable.
SEED = 314
np.random.seed(SEED)
tf.random.set_seed(SEED)
print("Random seed set to", SEED)

Random seed set to 314


In [5]:
# List the GPU devices TensorFlow can see; the list is empty (falsy) when none is
# registered. This does not check for CUDA specifically.
gpu = tf.config.list_physical_devices("GPU")
print("Tensorflow framework: GPU is", "available" if gpu else "NOT AVAILABLE")

Tensorflow framework: GPU is available


**COMMENTS**:
- Chargement des données du parquet en entier
- Ou supprimer cette section et passer le chargement du parquet dans séparation des données - split data
- Import de token_params pour les paramètres de tokenisation

In [6]:
# Load the pickled column labels of the preprocessed dataset.
# NOTE(review): pickle.load can execute arbitrary code stored in the file —
# only load artefacts produced by this project.
with open("../data/processed/columns.pkl", "rb") as f:
    cols = pickle.load(f)

In [7]:
# Put "hour", "target" and "text" first, followed by the labels from position 3 onwards.
# NOTE: as the next cell's output shows, the result is a (new_index, indexer)
# tuple, so after this cell the labels live in cols[0]. Re-running this cell on
# that tuple fails; reload `cols` from the pickle first.
cols = cols.reindex(["hour", "target", "text", *cols[3:]])

In [8]:
cols

(Index(['hour', 'target', 'text', 'tokenizer with lowercase',
        'tokenizer with lowercase, handle stripping, and length reduction',
        'tokenizer with lowercase and alpha',
        'tokenizer with lowercase, alpha and emoji',
        'tokenizer with lowercase, alpha, and no stop words',
        'tokenizer with lowercase, alpha and emoji, and no stop words'],
       dtype='object'),
 array([2, 0, 1, 3, 4, 5, 6, 7, 8]))

In [9]:
# Location of the preprocessed corpus
PATH = "../data/processed/df_preprocessed.parquet"
# Only the raw text and the label are needed at this stage.
# `use_nullable_dtypes=False` was dropped: the keyword is deprecated since
# pandas 2.0 and False is already the default behaviour.
df = pd.read_parquet(
    PATH,
    columns=["text", "target"],
    engine="pyarrow",
)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1596630 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1596630 non-null  object
 1   target  1596630 non-null  int8  
dtypes: int8(1), object(1)
memory usage: 25.9+ MB


# **Séparation des données**

**COMMENTS**:
- Charger le parquet dans la fonction si possible en fonction de la liste token_params
- Mettre un argument pour la liste des colonnes à charger sinon

In [11]:
def split_data(df, test_split=0.2, sampling=True, proportion=0.01, stratify=True):
    """
    Optionally down-sample a dataframe, then split it into train and test sets.

    The first column is used as the features and the last column as the target.

    :param df: Dataframe whose first column holds the features and last column the target
    :param test_split: Proportion of the (sampled) data to include in the test split
    :param sampling: Whether to draw a random sample of the rows before splitting
    :param proportion: Fraction of the rows kept when sampling
    :param stratify: Whether to stratify the split on the target column
    :return: X_train, X_test, y_train, y_test
    """
    # Work on a seeded random subset when requested, on the full frame otherwise
    data = df.sample(frac=proportion, random_state=SEED) if sampling else df
    features = data.iloc[:, 0]
    target = data.iloc[:, -1]
    # train_test_split yields X_train, X_test, y_train, y_test in that order
    return tuple(
        train_test_split(
            features,
            target,
            test_size=test_split,
            stratify=target if stratify else None,
            random_state=SEED,
        )
    )

In [12]:
def load_splits_from_parquet(X_train, X_test, y_train, y_test, col_name, path=PATH):
    """
    Rebuild existing train/test splits from another column of the parquet corpus.

    Only the row labels (index) of the incoming splits are used: the same rows,
    in the same order, are fetched from `col_name` (features) and from the
    "target" column (labels) of the parquet file.

    :param X_train: Training features whose index identifies the training rows
    :param X_test: Test features whose index identifies the test rows
    :param y_train: Training target
    :param y_test: Test target
    :param col_name: Name of the corpus column to load as features
    :param path: Path of the parquet file
    :return: X_train, X_test, y_train, y_test taken from the parquet file
    """
    # `use_nullable_dtypes=False` was dropped: the keyword is deprecated since
    # pandas 2.0 and False is already the default behaviour.
    corpus = pd.read_parquet(
        path,
        columns=[col_name, "target"],
        engine="pyarrow",
    )

    def _realign(source, split, order):
        # Keep the rows shared with `split`, then restore the requested row order
        aligned, _ = source.align(split, join="inner", axis=0)
        return aligned.reindex(order)

    # Row order of the original splits
    train_index, test_index = X_train.index, X_test.index
    # Exact column selection: the former filter(like=col_name) matched on
    # substrings and could therefore select more than the requested column.
    features = corpus[[col_name]]
    target = corpus["target"]

    X_train_col = _realign(features, X_train, train_index)
    X_test_col = _realign(features, X_test, test_index)
    y_train_col = _realign(target, y_train, train_index)
    y_test_col = _realign(target, y_test, test_index)
    # Squeeze the single-column frames so the features come back as Series
    return X_train_col.squeeze(), X_test_col.squeeze(), y_train_col, y_test_col

In [13]:
def to_tensorflow_dataset(X_train, X_test, y_train, y_test, col_name, path, validation_split=0.2, batch_size=32):
    """
    Turn existing splits into batched TensorFlow datasets for one corpus column.

    The splits are first re-read from the parquet file for `col_name`, then the
    training part is divided again (stratified on the target) to obtain a
    validation set.

    :param X_train: Training features (their index identifies the rows)
    :param X_test: Test features (their index identifies the rows)
    :param y_train: Training target
    :param y_test: Test target
    :param col_name: Name of the corpus column to use as features
    :param path: Path of the parquet file
    :param validation_split: Proportion of the training data held out for validation
    :param batch_size: Number of elements per batch
    :return: train_ds, val_ds, test_ds
    """
    # Fetch the same rows from the requested column of the corpus
    X_train, X_test, y_train, y_test = load_splits_from_parquet(
        X_train, X_test, y_train, y_test, col_name=col_name, path=path
    )
    # Carve the validation set out of the training data
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        test_size=validation_split,
        stratify=y_train,
        random_state=SEED,
    )

    def _batched(features, labels):
        # Pair features with labels and group them into batches
        return tf.data.Dataset.from_tensor_slices((features, labels)).batch(batch_size)

    return _batched(X_fit, y_fit), _batched(X_val, y_val), _batched(X_test, y_test)

In [14]:
# Split settings
test_split = 0.2  # share of the sampled rows held out for testing
sampling = True  # work on a random sample of the corpus
proportion = 0.01  # fraction of the rows kept when sampling

# Arguments are passed in the order of the split_data signature
X_train, X_test, y_train, y_test = split_data(df, test_split, sampling, proportion)

In [15]:
# Check the size of X_train and X_test
X_train.shape, X_test.shape

((12772,), (3194,))

**COMMENTS**:
- Création du pipeline modulable
- Grille de paramètres pour le vectorizer
- Grille de paramètres pour les modèles (LG, MNB)

## **TF-IDF**

In [15]:
# Set to True to log a TF-IDF + logistic regression baseline run to MLflow
experiment = False
if experiment:
    with mlflow.start_run():
        # X_train / X_test are Series of raw text (see split_data), so the
        # string accessor is used on them directly; indexing them with
        # ["text"] raised a KeyError.
        train_text = X_train.str.lower()
        test_text = X_test.str.lower()
        # Fit and transform the tf-idf vectorizer on the training text
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=1000,
            strip_accents="unicode",
        )
        X_embed = tfidf.fit_transform(train_text)
        # Initialize the model
        model = LogisticRegression(max_iter=1000)
        # Input examples stored alongside the logged models
        input_logit = X_embed[0]
        input_tfidf = train_text.values[0]
        # Cross-validate on the training set.
        # NOTE(review): the vectorizer was fitted on the whole training set
        # before the folds are made, so validation folds are not fully unseen.
        scores = cross_validate(
            model, X_embed, y_train, cv=5, scoring=["accuracy", "f1"]
        )
        scores = pd.DataFrame(scores).mean()
        # Final fit on the whole training set, then evaluation on the test set
        model.fit(X_embed, y_train)
        y_pred = model.predict(tfidf.transform(test_text))
        acc_score = accuracy_score(y_test, y_pred)
        # Log metrics, parameters and both fitted objects
        for metric in scores.keys():
            mlflow.log_metric(f"val_{metric}", scores[metric])
        mlflow.log_metric("test_accuracy", acc_score)
        mlflow.log_param("Dimension", X_embed.shape[1])
        mlflow.log_param("Tf-Idf params", tfidf.get_params())
        mlflow.log_param("Logistic Regression params", model.get_params())
        mlflow.sklearn.log_model(
            model, "logistic_regression", input_example=input_logit
        )
        mlflow.sklearn.log_model(
            tfidf, "tfidf_vectorizer", input_example=input_tfidf, signature=False
        )

In [16]:
from sklearn.pipeline import Pipeline

In [17]:
def cross_score(X_train, y_train, model, col, cv=10):
    """
    Cross-validate a model and summarise the outcome as one named Series.

    :param X_train: Training data
    :param y_train: Training target
    :param model: Estimator (or pipeline) to evaluate
    :param col: Name given to the returned Series
    :param cv: Number of folds
    :return: Series named `col` holding the mean over folds of every entry
        returned by cross_validate (accuracy and f1 are requested as scorers)
    """
    fold_results = cross_validate(
        model,
        X_train,
        y_train,
        scoring=["accuracy", "f1"],
        cv=cv,
    )
    # One row per fold -> average over the folds
    summary = pd.DataFrame(fold_results).mean()
    return summary.rename(col)

In [18]:
# Pipeline: TF-IDF features (1- to 3-grams, at most 1000 features) -> logistic regression
tfidf_step = TfidfVectorizer(
    strip_accents="unicode",
    ngram_range=(1, 3),
    max_features=1000,
)
logistic_step = LogisticRegression(max_iter=1000)
model = Pipeline(steps=[("tfidf", tfidf_step), ("logistic", logistic_step)])

In [19]:
# Cross-validate the pipeline on the raw text and on every tokenised variant
scores_list = []
for col in tqdm(cols[0][2:]):
    if col == "text":
        # The current splits already hold the raw text
        X_col, y_col = X_train, y_train
    else:
        # Fetch the same rows from the requested column. The helper defined in
        # this notebook replaces `splitter.align_splits_from_df`, which is
        # neither defined nor imported here. Loop-local names keep the original
        # X_train / y_train splits intact for the following cells.
        X_col, _, y_col, _ = load_splits_from_parquet(
            X_train, X_test, y_train, y_test, col
        )
    # Cross-validate on this column and keep the averaged scores
    scores = cross_score(X_col, y_col, model, col, cv=10)
    scores_list.append(scores)
# One column of scores per text variant
scores_df = pd.concat(scores_list, axis=1)
# Display the scores
scores_df

  0%|          | 0/7 [00:01<?, ?it/s]


KeyboardInterrupt: 

Le retrait de certains stop words affecte probablement la capacité du modèle à prédire correctement le sentiment.<br>
Par exemple, le mot "not" est un stop word et est donc retiré des mots analysés, alors qu'il inverse souvent le sens d'une phrase ("not good") : sa suppression peut fausser la prédiction du sentiment.

In [None]:
# Pipeline: TF-IDF features (1- to 3-grams, at most 1000 features) -> multinomial naive Bayes
tfidf_step = TfidfVectorizer(
    strip_accents="unicode",
    ngram_range=(1, 3),
    max_features=1000,
)
naive_bayes_step = MultinomialNB()
model = Pipeline(steps=[("tfidf", tfidf_step), ("MultinomialNB", naive_bayes_step)])

In [None]:
# Cross-validate the pipeline on every text variant
scores_list = []
for col in tqdm(cols[0][2:]):
    # Fetch the same rows from the requested column. The helper defined in this
    # notebook replaces `splitter.align_splits_from_df`, which is neither
    # defined nor imported here. Loop-local names keep the original
    # X_train / y_train splits intact for the following cells.
    X_col, _, y_col, _ = load_splits_from_parquet(
        X_train, X_test, y_train, y_test, col
    )
    # Cross-validate on this column and keep the averaged scores
    scores = cross_score(X_col, y_col, model, col, cv=10)
    scores_list.append(scores)
# One column of scores per text variant
scores_df = pd.concat(scores_list, axis=1)
# Display the scores
scores_df

100%|██████████| 7/7 [02:45<00:00, 23.68s/it]


Unnamed: 0,text,tokenizer with lowercase,"tokenizer with lowercase, handle stripping, and length reduction",tokenizer with lowercase and alpha,"tokenizer with lowercase, alpha and emoji","tokenizer with lowercase, alpha, and no stop words","tokenizer with lowercase, alpha and emoji, and no stop words"
fit_time,2.724755,2.84383,2.538568,2.455027,2.245535,1.202892,1.473164
score_time,0.096113,0.09904,0.090046,0.083166,0.083293,0.051234,0.049753
test_accuracy,0.738229,0.739121,0.738229,0.737368,0.737368,0.719173,0.719173
test_f1,0.728693,0.730365,0.728495,0.730633,0.730633,0.710744,0.710744


## **CUSTOM NN**

In [16]:
# Text standardisation used by the vectorisation layer
def custom_standardization(tensor):
    """
    Lowercase a string tensor and strip mentions, URLs, punctuation and extra spaces.

    :param tensor: String tensor to clean
    :return: Cleaned string tensor
    """
    # (pattern, replacement) pairs, applied in this order after lowercasing
    substitutions = (
        (r"@\w+", " "),  # mentions
        (r"http\S+|www\S+", " "),  # urls
        (r"[^\w\s\d]", " "),  # anything that is not a word character, whitespace or digit
        (r"\s{2,}", " "),  # runs of whitespace
    )
    cleaned = tf.strings.lower(tensor)
    for pattern, replacement in substitutions:
        cleaned = tf.strings.regex_replace(cleaned, pattern, replacement)
    # Drop leading and trailing whitespace
    return tf.strings.strip(cleaned)

In [17]:
URI = "http://localhost:5000"

In [18]:
def prepare_tf_dataset(col_name="text", val_split=0.2, batch_size=32):
    """
    Build cached, prefetching train/validation/test datasets for one corpus column.

    Relies on the notebook-level splits (X_train, X_test, y_train, y_test) and PATH.

    :param col_name: Name of the corpus column to use as features
    :param val_split: Proportion of the training data held out for validation
    :param batch_size: Number of elements per batch
    :return: train_ds, val_ds, test_ds
    """
    # Batched datasets built from the current splits
    splits = to_tensorflow_dataset(
        X_train,
        X_test,
        y_train,
        y_test,
        col_name=col_name,
        path=PATH,
        validation_split=val_split,
        batch_size=batch_size,
    )
    # Cache each dataset and let tf.data choose the prefetch buffer size
    train_ds, val_ds, test_ds = [
        ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE) for ds in splits
    ]
    return train_ds, val_ds, test_ds

In [27]:
def create_embedding_matrix(vocab, pretrained_weights, random_weights):
    """
    Build an embedding matrix for a vocabulary from pretrained word vectors.

    Every row is first drawn at random; rows whose token has a pretrained
    vector are then overwritten with it, the others keep their random values.

    :param vocab: Sequence of tokens; row i of the matrix belongs to vocab[i]
    :param pretrained_weights: Object exposing `vector_size` and a `[]` lookup
        that raises KeyError for unknown tokens
    :param random_weights: 'normal' or 'uniform'; any other value falls back to
        the normal initialisation and prints a message
    :return: (embedding_matrix, embedding_dim)
    """
    embedding_dim = pretrained_weights.vector_size
    shape = (len(vocab), embedding_dim)
    # Random initialisation of the whole matrix
    if random_weights == 'uniform':
        embedding_matrix = np.random.uniform(0, 1, size=shape)
    else:
        if random_weights != 'normal':
            print("Uniquement les valeurs normal ou uniform sont acceptées. Les poids ont été initialisés avec la méthode normale.")
        embedding_matrix = np.random.normal(scale=0.6, size=shape)
    # Overwrite the rows of the tokens known to the pretrained vectors.
    # Iterating with enumerate replaces the former word -> index dict and its
    # always-true bound check.
    for row, word in enumerate(vocab):
        try:
            vector = pretrained_weights[word]
        except KeyError:
            # Unknown token: keep the random row
            continue
        if vector is not None:
            embedding_matrix[row] = vector
    return embedding_matrix, embedding_dim

In [28]:
def create_tf_model(max_tokens, seq_length, embedding_dim, additionnal_layers, pretrained_weights=None, random_weights='uniform', adapt_dataset=None):
    """
    Build and compile a binary text classifier:
    text vectorisation -> embedding -> extra layers -> sigmoid output.

    :param max_tokens: Maximum vocabulary size of the vectorisation layer
    :param seq_length: Output sequence length of the vectorisation layer
    :param embedding_dim: Embedding size; replaced by the pretrained vector
        size when `pretrained_weights` is given
    :param additionnal_layers: Iterable of Keras layers inserted between the
        embedding and the output layer
    :param pretrained_weights: Optional pretrained vectors handed to
        create_embedding_matrix to initialise the embedding
    :param random_weights: Initialisation scheme, used both as the embedding
        initializer and for tokens without a pretrained vector
    :param adapt_dataset: Optional dataset of (text, label) elements used to
        learn the vocabulary; defaults to the notebook-level `train_ds`
    :return: Compiled tf.keras.Sequential model
    """
    # Create the text vectorization layer
    vectorize_layer = TextVectorization(
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=seq_length,
        standardize=custom_standardization,
        trainable=False,
        name="vectorization",
    )

    # Learn the vocabulary from the text part of the (text, label) elements.
    # Falling back to the global keeps existing calls working while letting
    # callers pass the dataset explicitly.
    source_ds = train_ds if adapt_dataset is None else adapt_dataset
    vectorize_layer.adapt(source_ds.map(lambda text, label: text))
    # Get the vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab_size = len(vocab)
    print("Vocabulary size: ", vocab_size)

    # Initial embedding weights, only when pretrained vectors are supplied
    embedding_weights = None
    if pretrained_weights is not None:
        embedding_matrix, embedding_dim = create_embedding_matrix(vocab, pretrained_weights, random_weights)
        embedding_weights = [embedding_matrix]

    # Assemble the model
    model = tf.keras.Sequential([
        vectorize_layer,
        tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=embedding_weights,
            input_length=seq_length,
            embeddings_initializer=random_weights,
            trainable=True,
            name="embedding",
        ),
        *additionnal_layers,
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ])

    # Compile the model
    model.compile(
        loss=tf.losses.BinaryCrossentropy(),
        optimizer="adam",
        metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)],
    )

    return model

In [29]:
# Name of the MLflow experiment that groups the runs below
experiment = 'neural_network_scratch_embedding'

# Point the MLflow client at the tracking server
mlflow.set_tracking_uri(URI)
# set_experiment creates the experiment when it does not exist yet, so no
# separate create_experiment call (with a bare `except`) is needed. It is also
# the first call that actually contacts the server: get_tracking_uri only
# returns the configured URI and cannot detect an unreachable server.
try:
    active_experiment = mlflow.set_experiment(experiment)
except Exception:
    print(f"Cannot connect to the server : {URI}. Check the server status.")
    raise
# Display the active experiment
active_experiment

<Experiment: artifact_location='mlflow-artifacts:/951305245308831132', creation_time=1731164356530, experiment_id='951305245308831132', last_update_time=1731164356530, lifecycle_stage='active', name='neural_network_scratch_embedding', tags={}>

In [30]:
# --- Dataset preparation settings ---
col_name = "text"  # corpus column to train on
val_split = 0.2  # share of the training split kept for validation
batch_size = 32
train_ds, val_ds, test_ds = prepare_tf_dataset(
    col_name=col_name, val_split=val_split, batch_size=batch_size
)

# --- Model settings ---
max_tokens = 1000  # vocabulary cap of the vectorisation layer
seq_length = 100
embedding_dim = 16
embedding_trainable = True
epochs = 3
# One run per entry: each tuple holds the layers placed after the embedding
additionnal_layers = [
    (tf.keras.layers.GlobalAveragePooling1D(),),
    (tf.keras.layers.GlobalMaxPooling1D(),),
]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [31]:
# Train and evaluate one model per extra-layer configuration, each in its own MLflow run
for layers, run_name in zip(additionnal_layers, runs):
    # Build and compile a fresh model for this configuration
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers)
    # NOTE(review): create_tf_model has already compiled the model at this
    # point; Keras advises compiling again after changing `trainable` —
    # confirm before setting embedding_trainable to False.
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            verbose=1,
        )
        # Log the settings that tell the runs apart. max_tokens, seq_length and
        # embedding_dim were not recorded before, so runs sharing a name could
        # not be distinguished. Keys follow the trailing-underscore convention
        # already used for batch_size_ / validation_split_.
        run_params = {
            "batch_size_": batch_size,
            "validation_split_": val_split,
            "max_tokens_": max_tokens,
            "seq_length_": seq_length,
            "embedding_dim_": embedding_dim,
        }
        for key, value in run_params.items():
            mlflow.log_param(key, value)

        # Evaluate the model on the test dataset and record the result
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-09 17:27:45.711055: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  1000
Epoch 1/3
  7/320 [..............................] - ETA: 2s - loss: 0.6936 - binary_accuracy: 0.5089  

2024-11-09 17:27:46.667590: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:27:46.713638: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:27:46.713678: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:27:46.716256: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:27:46.716287: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:27:48.994936: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:27:49.011558: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:27:49.011603: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:27:54.325675: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmp8cc6q6d_/model/data/model/assets




Evaluate on test data


2024/11/09 17:27:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/73f80927f1ae4117ab353e77f30de269.
2024/11/09 17:27:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-09 17:27:59.591506: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  1000
Epoch 1/3
 14/320 [>.............................] - ETA: 2s - loss: 0.6926 - binary_accuracy: 0.4888 

2024-11-09 17:28:00.421205: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:28:00.462793: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:28:00.462840: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:28:00.464422: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:28:00.464454: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:28:02.647433: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:28:02.664562: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:28:02.664606: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:28:07.776313: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpyt5pdnh4/model/data/model/assets




Evaluate on test data


2024/11/09 17:28:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalMaxPooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/63957ca1dc064e2a9b4bd32b7b6dabd0.
2024/11/09 17:28:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.


In [32]:
# --- Dataset preparation settings ---
col_name = "text"  # corpus column to train on
val_split = 0.2  # share of the training split kept for validation
batch_size = 32
train_ds, val_ds, test_ds = prepare_tf_dataset(
    col_name=col_name, val_split=val_split, batch_size=batch_size
)

# --- Model settings ---
max_tokens = None  # no vocabulary cap passed to the vectorisation layer
seq_length = 100
embedding_dim = 16
embedding_trainable = True
epochs = 3
# One run per entry: each tuple holds the layers placed after the embedding
additionnal_layers = [
    (tf.keras.layers.GlobalAveragePooling1D(),),
    (tf.keras.layers.GlobalMaxPooling1D(),),
]
runs = (
    "GlobalAveragePooling1D with max tokens up to vocab size",
    "GlobalMaxPooling1D with max tokens up to vocab size",
)

In [33]:
# Train and evaluate one model per extra-layer configuration, each in its own MLflow run
for layers, run_name in zip(additionnal_layers, runs):
    # Build and compile a fresh model for this configuration
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers)
    # NOTE(review): create_tf_model has already compiled the model at this
    # point; Keras advises compiling again after changing `trainable` —
    # confirm before setting embedding_trainable to False.
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            verbose=1,
        )
        # Log the settings that tell the runs apart. max_tokens, seq_length and
        # embedding_dim were not recorded before, so runs sharing a name could
        # not be distinguished. Keys follow the trailing-underscore convention
        # already used for batch_size_ / validation_split_.
        run_params = {
            "batch_size_": batch_size,
            "validation_split_": val_split,
            "max_tokens_": max_tokens,
            "seq_length_": seq_length,
            "embedding_dim_": embedding_dim,
        }
        for key, value in run_params.items():
            mlflow.log_param(key, value)

        # Evaluate the model on the test dataset and record the result
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-09 17:28:59.075788: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  13729
Epoch 1/3
  1/320 [..............................] - ETA: 1:48 - loss: 0.6950 - binary_accuracy: 0.4375

2024-11-09 17:28:59.928870: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:28:59.971806: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:28:59.971855: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:28:59.973167: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:28:59.973206: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:29:07.570870: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:29:07.587736: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:29:07.587782: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:29:26.333448: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmptsb1qdda/model/data/model/assets




Evaluate on test data


2024/11/09 17:29:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D with max tokens up to vocab size at: http://localhost:5000/#/experiments/951305245308831132/runs/9c477df74ee6490094ef19ef2b20d38d.
2024/11/09 17:29:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-09 17:29:29.714557: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  13729
Epoch 1/3


2024-11-09 17:29:30.540994: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:29:30.582925: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:29:30.582963: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:29:30.584100: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:29:30.584131: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:29:38.233578: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:29:38.250506: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:29:38.250541: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:29:53.784869: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpjbuo7wla/model/data/model/assets




Evaluate on test data


2024/11/09 17:29:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalMaxPooling1D with max tokens up to vocab size at: http://localhost:5000/#/experiments/951305245308831132/runs/0aab36560f8e41c281fd1e9a986a9799.
2024/11/09 17:29:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.


In [34]:
# --- Dataset preparation settings ---
col_name = "text"  # corpus column to train on
val_split = 0.2  # share of the training split kept for validation
batch_size = 32
train_ds, val_ds, test_ds = prepare_tf_dataset(
    col_name=col_name, val_split=val_split, batch_size=batch_size
)

# --- Model settings ---
max_tokens = 5000  # vocabulary cap of the vectorisation layer
seq_length = 100
embedding_dim = 16
embedding_trainable = True
epochs = 3
# One run per entry: each tuple holds the layers placed after the embedding
additionnal_layers = [
    (tf.keras.layers.GlobalAveragePooling1D(),),
    (tf.keras.layers.GlobalMaxPooling1D(),),
]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [36]:
# Train and evaluate one model per extra-layer configuration, each in its own MLflow run
for layers, run_name in zip(additionnal_layers, runs):
    # Build and compile a fresh model for this configuration
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers)
    # NOTE(review): create_tf_model has already compiled the model at this
    # point; Keras advises compiling again after changing `trainable` —
    # confirm before setting embedding_trainable to False.
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            verbose=1,
        )
        # Log the settings that tell the runs apart. max_tokens, seq_length and
        # embedding_dim were not recorded before, so runs sharing a name could
        # not be distinguished. Keys follow the trailing-underscore convention
        # already used for batch_size_ / validation_split_.
        run_params = {
            "batch_size_": batch_size,
            "validation_split_": val_split,
            "max_tokens_": max_tokens,
            "seq_length_": seq_length,
            "embedding_dim_": embedding_dim,
        }
        for key, value in run_params.items():
            mlflow.log_param(key, value)

        # Evaluate the model on the test dataset and record the result
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-09 17:30:26.817384: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3
 11/320 [>.............................] - ETA: 3s - loss: 0.6933 - binary_accuracy: 0.4972

2024-11-09 17:30:27.688117: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:27.731275: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:27.731323: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:30:27.732863: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:27.732895: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:30:31.361600: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:31.378384: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:31.378421: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:30:42.458121: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpdc2n8q85/model/data/model/assets




Evaluate on test data


2024/11/09 17:30:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/2e98affd63eb49b08dc85d91082eb816.
2024/11/09 17:30:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-09 17:30:45.843025: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3
 11/320 [>.............................] - ETA: 3s - loss: 0.6938 - binary_accuracy: 0.4943

2024-11-09 17:30:46.683277: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:46.725649: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:46.725690: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:30:46.726960: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:46.726991: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:30:50.986825: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:51.003672: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:51.003715: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:30:59.112651: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpohs4si10/model/data/model/assets




Evaluate on test data


2024/11/09 17:31:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalMaxPooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/f25d1e2563a34c78b0ad86e5a0cb609b.
2024/11/09 17:31:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.


## **GLOVE EMBEDDINGS**

In [38]:
# Name of the MLflow experiment that groups the runs below.
# NOTE(review): this cell sits under the GloVe heading but reuses the
# "scratch embedding" experiment name — confirm that this is intended.
experiment = 'neural_network_scratch_embedding'

# Point the MLflow client at the tracking server
mlflow.set_tracking_uri(URI)

# Fail fast when the server cannot be reached. Reading the tracking URI back
# only returns the configured string and never contacts the server, so issue
# a cheap real request instead.
try:
    mlflow.search_experiments(max_results=1)
except Exception:
    print(f"Cannot connect to the server : {URI}. Check the server status.")
    raise

# set_experiment creates the experiment when it does not exist yet, so no
# separate create_experiment call (and no error-swallowing except) is needed.
mlflow.set_experiment(experiment)

<Experiment: artifact_location='mlflow-artifacts:/951305245308831132', creation_time=1731164356530, experiment_id='951305245308831132', last_update_time=1731164356530, lifecycle_stage='active', name='neural_network_scratch_embedding', tags={}>

In [40]:
# Download the glove-twitter-100 files from the Hugging Face Hub and load them
repo_id = "fse/glove-twitter-100"
glove_files = ("glove-twitter-100.model", "glove-twitter-100.model.vectors.npy")

# Only model_file is passed to KeyedVectors.load below; the .npy file is
# fetched as well, presumably so gensim finds it next to the model file in the
# Hub cache — TODO confirm.
model_file, vector_file = (
    hf_hub_download(repo_id=repo_id, filename=name) for name in glove_files
)

glove = KeyedVectors.load(model_file, mmap="r")

In [41]:
# Dataset arguments
col_name = "text"
val_split = 0.2
batch_size = 32
# Build the train / validation / test datasets from the arguments above
train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)

# Model and training arguments, read by the training loop in the next cell
max_tokens = 5000
seq_length = 100
# NOTE(review): the GloVe model loaded above is "glove-twitter-100"; confirm
# how create_tf_model reconciles embedding_dim = 16 with those vectors.
embedding_dim = 16
embedding_trainable = False  # keep the embedding layer frozen for these runs
epochs = 3

# One tuple of extra layers per run; `runs` gives the matching MLflow run
# name at the same position.
additionnal_layers = [
    (tf.keras.layers.GlobalAveragePooling1D(),),
    (tf.keras.layers.GlobalMaxPooling1D(),),
]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [42]:
# The autologging settings are identical for every run, so enable them once
# instead of on each loop iteration.
mlflow.tensorflow.autolog(checkpoint=False, log_models=True)

for layers, run_name in zip(additionnal_layers, runs):
    # create_tf_model receives this run's extra layers and the GloVe vectors
    model = create_tf_model(
        max_tokens, seq_length, embedding_dim, layers, pretrained_weights=glove
    )
    # Set the embedding layer trainable or not
    model.get_layer("embedding").trainable = embedding_trainable

    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            verbose=1,
        )

        # Notebook-level settings, logged by hand under their own keys
        mlflow.log_param("batch_size_", batch_size)
        mlflow.log_param("validation_split_", val_split)
        # Other cells reuse the same run names inside the same experiment;
        # these two params record which embedding setup produced this run.
        mlflow.log_param("embedding_trainable_", embedding_trainable)
        mlflow.log_param("pretrained_embedding_", repo_id)

        # Evaluate the model on the held-out test set and log the result
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-09 17:32:09.354785: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3
  3/320 [..............................] - ETA: 13s - loss: 0.8653 - binary_accuracy: 0.4687 

2024-11-09 17:32:10.498941: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:32:10.543432: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:32:10.543473: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:32:10.544784: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:32:10.544815: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:32:26.786013: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:32:26.803675: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:32:26.803723: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:33:00.053878: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpfkal15b2/model/data/model/assets




Evaluate on test data


2024/11/09 17:33:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/0b8318ce382349018cb2079fac4f7681.
2024/11/09 17:33:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-09 17:33:03.445239: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3
  3/320 [..............................] - ETA: 12s - loss: 0.9958 - binary_accuracy: 0.5312 

2024-11-09 17:33:04.315455: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:33:04.360632: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:33:04.360679: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:33:04.361854: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:33:04.361886: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:33:17.562021: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:33:17.578892: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:33:17.578938: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3

: 

In [None]:
# Dataset arguments
col_name = "text"
val_split = 0.2
batch_size = 32
# Build the train / validation / test datasets from the arguments above
train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)

# Model and training arguments, read by the training loop in the next cell
max_tokens = 5000
seq_length = 100
# NOTE(review): the GloVe model loaded earlier is "glove-twitter-100"; confirm
# how create_tf_model reconciles embedding_dim = 16 with those vectors.
embedding_dim = 16
embedding_trainable = True  # let the embedding layer be updated during training
epochs = 3

# One tuple of extra layers per run; `runs` gives the matching MLflow run
# name at the same position.
additionnal_layers = [
    (tf.keras.layers.GlobalAveragePooling1D(),),
    (tf.keras.layers.GlobalMaxPooling1D(),),
]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [None]:
# The autologging settings are identical for every run, so enable them once
# instead of on each loop iteration.
mlflow.tensorflow.autolog(checkpoint=False, log_models=True)

for layers, run_name in zip(additionnal_layers, runs):
    # create_tf_model receives this run's extra layers and the GloVe vectors
    model = create_tf_model(
        max_tokens, seq_length, embedding_dim, layers, pretrained_weights=glove
    )
    model.get_layer("embedding").trainable = embedding_trainable

    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=epochs,
            verbose=1,
        )

        # Notebook-level settings, logged by hand under their own keys
        mlflow.log_param("batch_size_", batch_size)
        mlflow.log_param("validation_split_", val_split)
        # Other cells reuse the same run names inside the same experiment;
        # these two params record which embedding setup produced this run.
        mlflow.log_param("embedding_trainable_", embedding_trainable)
        mlflow.log_param("pretrained_embedding_", repo_id)

        # Evaluate the model on the held-out test set and log the result
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-09 16:55:43.967624: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  1000
Epoch 1/3
 11/320 [>.............................] - ETA: 3s - loss: 0.7257 - binary_accuracy: 0.4972

2024-11-09 16:55:45.084136: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 16:55:45.133485: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 16:55:45.133528: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 16:55:45.134877: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 16:55:45.134910: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 16:55:48.802232: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 16:55:48.819865: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 16:55:48.819908: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 16:55:59.869873: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmp3kbr91p3/model/data/model/assets




Evaluate on test data


2024/11/09 16:56:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/ddefa0f6302040e1b317999eb90669a5.
2024/11/09 16:56:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-09 16:56:03.244618: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  1000
Epoch 1/3
 11/320 [>.............................] - ETA: 3s - loss: 0.6937 - binary_accuracy: 0.5170

2024-11-09 16:56:04.079820: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 16:56:04.124221: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 16:56:04.124262: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 16:56:04.125386: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 16:56:04.125417: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 16:56:07.796411: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 16:56:07.813443: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 16:56:07.813477: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 16:56:26.519799: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmp4yh_3x3x/model/data/model/assets




Evaluate on test data


2024/11/09 16:56:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalMaxPooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/cb952c0c63944d219ee441ec9a646dcf.
2024/11/09 16:56:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.


## **FASTTEXT EMBEDDINGS + CUSTOM NN**

In [None]:
# Download the pretrained English fastText binary from the Hugging Face Hub
# and load it.
# NOTE(review): this comment used to say the vectors were trained on Twitter
# data; the "facebook/fasttext-en-vectors" repo appears to hold the general
# English vectors (Common Crawl + Wikipedia) — confirm against the model card.
# NOTE(review): later cells rebind the name `model` to Keras models, while the
# cells that read `model["hello"]` and `model.words` need this fastText object.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-en-vectors", filename="model.bin"
)
model = fasttext.load_model(model_path)

In [None]:
## Experiment with Conv1D layers
# NOTE(review): vocab_size, maxlen, num_filters, kernel_size and activation are
# not assigned anywhere in the visible part of the notebook; define them before
# running this cell. The cell also rebinds `model`.
model = tf.keras.Sequential()
# Frozen embedding layer initialised from embedding_matrix
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                    weights=[embedding_matrix],
                                    input_length=maxlen,
                                    trainable=False))
# `activation` must be passed by keyword: the third positional parameter of
# Conv1D is `strides`, not the activation function.
model.add(tf.keras.layers.Conv1D(num_filters, kernel_size, activation=activation))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dense(10, activation='relu'))
# Single sigmoid unit, paired with the binary cross-entropy loss below
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# LSTM classifier draft, written against the tf.keras API.
# NOTE(review): embed_dim, lstm_out and X are not assigned anywhere in the
# visible part of the notebook; define them before running this cell. The cell
# also rebinds `model`.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(2500, embed_dim, input_length=X.shape[1]))
# The Embedding layer no longer accepts a `dropout` argument; SpatialDropout1D
# is the closest built-in stand-in for dropping embedded features.
model.add(tf.keras.layers.SpatialDropout1D(0.2))
# Keras 1 `dropout_W` / `dropout_U` are now `dropout` / `recurrent_dropout`
model.add(tf.keras.layers.LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two-way softmax output, paired with categorical cross-entropy
model.add(tf.keras.layers.Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints by itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
# Bidirectional LSTM classifier draft.
# Layers are referenced through tf.keras explicitly: the training loops above
# rebind the bare name `layers` to a tuple, and no top-level `keras` import is
# visible in the notebook's import cell.
# NOTE(review): max_features is assigned in a cell further down; run that cell
# first. This cell also rebinds `model`.

# Input for variable-length sequences of integers
inputs = tf.keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = tf.keras.layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs; the first returns the full sequence so that the
# second one can consume it
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
# Add a classifier
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)
model.summary()


In [None]:
# `model` must be the fastText model here, not one of the Keras models that
# other cells bind to the same name.
embedding_dim = model.get_dimension()  # length of every word vector
max_features = len(model.words)  # number of words in the fastText vocabulary

In [None]:
# Show the two sizes derived from the fastText model
print(f"Embedding dimension: {embedding_dim}")
print(f"Max features: {max_features}")

Embedding dimension: 300
Max features: 2000000


In [None]:
# One row per vocabulary entry, one column per embedding dimension.
# float32 halves the memory of NumPy's default float64, which matters for a
# matrix of this size, and loses nothing: fastText vectors are float32 and
# Keras stores embedding weights in float32.
embedding_matrix = np.zeros((max_features, embedding_dim), dtype=np.float32)
embedding_matrix.shape

(2000000, 300)

In [None]:
# Copy each word's vector into the row that matches its position in
# fastText's own word list.
for i, word in enumerate(model.words):
    embedding_matrix[i] = model.get_word_vector(word)

In [None]:
# Build the train / validation / test datasets from the existing split
col_name = "text"
train_ds, val_ds, test_ds = tf_ds_split(X_train, X_test, y_train, y_test, col_name)

2024-11-01 19:27:20.200159: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-01 19:27:20.203769: I tensorflow/c/logging.cc:34] DirectML: creating device on adapter 0 (AMD Radeon RX 6700 XT)
Dropped Escape call with ulEscapeCode : 0x03007703
2024-11-01 19:27:20.700907: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-01 19:27:20.701647: W tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc:28] Overriding allow_growth setting because force_memory_growth was requested by the device.
2024-11-01 19:27:20.702266: I tensorflow/core/c

In [None]:
# Print the first element of the training dataset as a sanity check
first_batch_ds = train_ds.take(1)
for row in first_batch_ds:
    print(row)

(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b"I love asparagus. I just thouht I'd express my vegetable love seeing as Dad's watching about it on The One Show ",
       b'arrrrhhh, i did it again i past 100 and now 200 i was gunna say it was my 200th update but i didnt get to again ',
       b"Losing one's phone in the house while it's set to &quot;silent&quot; is not the best. ",
       b'how do you deal with a long distance relationship help??? ',
       b'BORED of packing ',
       b"@duckierenee I've just read that, you're EVIL and when are you coming to cook for me? ",
       b"@iphone_dev Not having a Mac means I can't use pwnage so  can't get the Egg ",
       b'@wickedgirl24 glad you like it ',
       b'I am feeling like Donald Trump today.  This morning needs to get in line.  ',
       b"@PleaseBiteMe It's totally fair ",
       b"anti-gay protesters dwntwn make mooncheez sad  we support 1st amendmnt rights, but don't agree with the msg. everyone should have rights!",

In [None]:
# Define the parameters (embedding_dim keeps the value set from the fastText model)
sequence_length = 100

# Text vectorizer: integer token ids, padded / truncated to sequence_length
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# Fit the vocabulary on the training texts only (labels dropped)
train_text = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

# Vectorize, cache and prefetch each split.
# The three names are rebound in place: re-running this cell without first
# re-creating the raw datasets applies vectorize_text a second time.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


2024-11-01 19:27:26.075378: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [None]:
# Assemble the classifier layer by layer on the first GPU.
# NOTE(review): the rows of embedding_matrix follow fastText's word order,
# while the token ids fed to this layer presumably come from vectorize_layer's
# own vocabulary — confirm that the two indexings agree, otherwise each token
# looks up an unrelated word's vector.
with tf.device("/GPU:0"):
    model = tf.keras.Sequential()
    # Frozen embedding layer initialised from the fastText matrix
    model.add(
        tf.keras.layers.Embedding(
            input_dim=max_features,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            trainable=False,
            input_length=sequence_length,
        )
    )
    model.add(tf.keras.layers.Dropout(0.2))
    # Average over the sequence axis to get one vector per example
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(tf.keras.layers.Dropout(0.2))
    # Single sigmoid unit for the binary output
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.summary()

: 

In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy computed at a 0.5 decision threshold.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.5)],
)

In [None]:
# Define callbacks
# Stop once val_loss has not improved for 2 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", mode="min", patience=2
)
# Keep only the weights of the epoch with the best val_loss
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=f"../weights/{col_name}_model.h5",
    monitor="val_loss",
    save_best_only=True,
)
callbacks = [early_stopping, best_checkpoint]

In [None]:
# Train for at most `epochs` epochs; the callbacks may stop the run earlier
epochs = 10
history = model.fit(
    train_ds, validation_data=val_ds, epochs=epochs, callbacks=callbacks
)

Epoch 1/10
  19/3194 [..............................] - ETA: 18s - loss: 0.6929 - binary_accuracy: 0.5329

2024-11-01 19:13:25.634692: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-01 19:13:25.677225: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-01 19:13:25.677281: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)




2024-11-01 19:13:43.173326: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-01 19:13:43.187808: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-01 19:13:43.187853: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-01 19:13:44.955777: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1200000000 exceeds 10% of free system memory.


KeyboardInterrupt: 