In [2]:
# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import warnings
import mlflow
import mlflow.tensorflow
import mlflow.keras
import mlflow.sklearn
from tqdm import tqdm
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
from gensim.models import KeyedVectors
from tensorflow.config import list_physical_devices
from utils import (
    split_data,
    load_splits_from_parquet,
    to_tensorflow_dataset,
    create_tf_model,
)

2024-12-29 16:29:36.614368: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-29 16:29:38.085780: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdirectml.d6f03b303ac3c4f2eeb8ca631688c9757b361310.so
2024-12-29 16:29:38.085848: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdxcore.so
2024-12-29 16:29:38.090556: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libd3d12.so
Dropped Escape call with ulEscapeCode : 0x03007703
Dropped Escape call with ulEscapeCode : 0x03007703
2024-12-29 16:29:38.690265: I tensorflow/c/logging.cc:34] DirectML device enumeration: found 1 compatible adapters.


In [3]:
# Empty Keras' global custom-object registry before (re-)registering anything,
# so this notebook's registration cell can be executed again in the same kernel.
# NOTE(review): this drops every registered custom object, not only the ones
# defined in this notebook.
_custom_objects = tf.keras.utils.get_custom_objects()
_custom_objects.clear()

In [4]:
@tf.keras.utils.register_keras_serializable(
    package="custom_text_func", name="custom_standardization"
)
def custom_standardization(tensor):
    """Normalize text with a fixed sequence of TensorFlow string ops.

    The input is lowercased, every match of each pattern below is replaced by
    a single space (in the listed order), and the result is stripped of
    leading and trailing whitespace.

    Args:
        tensor: String input accepted by ``tf.strings.lower``.

    Returns:
        The cleaned string tensor.
    """
    # Order matters: mentions and URLs are removed before punctuation is
    # blanked, otherwise "@" and "://" would already be gone when their
    # patterns run.
    patterns = (
        r"@\w+",  # mentions
        r"http\S+|www\S+",  # urls
        r"[^\w\s\d]",  # punctuation
        r"\s{2,}",  # runs of two or more whitespace characters
    )

    cleaned = tf.strings.lower(tensor)
    for pattern in patterns:
        cleaned = tf.strings.regex_replace(cleaned, pattern, " ")
    return tf.strings.strip(cleaned)

In [5]:
# --- Notebook configuration ---

# Input files: the preprocessed dataset and the pickled column labels.
PATH_PARQUET = "../data/processed/df_preprocessed.parquet"
PATH_COLS = "../data/processed/columns.pkl"

# URI of the MLflow server.
URI = "http://localhost:5000"

# Seed used to initialise random number generators.
SEED = 314

In [6]:
# Silence FutureWarning messages for the rest of the session
warnings.filterwarnings("ignore", category=FutureWarning)

# Register tqdm's pandas integration
tqdm.pandas()

# Seed both NumPy and TensorFlow. SEED is defined once in the configuration
# cell above; redefining it here would create a second source of truth.
np.random.seed(SEED)
tf.random.set_seed(SEED)  # NumPy's seed alone does not cover TensorFlow's RNG
print("Random seed set to", SEED)

# Check whether TensorFlow can see at least one GPU device
gpu = list_physical_devices("GPU")
print("Tensorflow framework: GPU is", "available" if gpu else "NOT AVAILABLE")

Random seed set to 314
Tensorflow framework: GPU is available


In [7]:
# Read the pickled column labels from PATH_COLS.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(PATH_COLS, "rb") as cols_file:
    cols = pickle.load(cols_file)

# Put "hour" first, followed by "target" and "text", then the remaining labels.
# NOTE(review): presumably `cols` is a pandas Index here; Index.reindex returns
# a (new_index, indexer) tuple, so after this line `cols` is that tuple rather
# than an Index (see the printed output). Confirm downstream code expects it.
ordered_labels = ["hour", "target", "text", *cols[3:]]
cols = cols.reindex(ordered_labels)

print(cols)

(Index(['hour', 'target', 'text', 'tokenizer with lowercase',
       'tokenizer with lowercase, handle stripping, and length reduction',
       'tokenizer with lowercase and alpha',
       'tokenizer with lowercase, alpha and emoji',
       'tokenizer with lowercase, alpha, and no stop words',
       'tokenizer with lowercase, alpha and emoji, and no stop words'],
      dtype='object'), array([2, 0, 1, 3, 4, 5, 6, 7, 8]))


In [8]:
# Load only the "text" and "target" columns from the preprocessed parquet file.
# `use_nullable_dtypes` is deprecated since pandas 2.0 and False was its
# default, so it is omitted; the resulting dtypes are unchanged.
df = pd.read_parquet(
    PATH_PARQUET,
    columns=["text", "target"],
    engine="pyarrow",
)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1596630 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1596630 non-null  object
 1   target  1596630 non-null  int8  
dtypes: int8(1), object(1)
memory usage: 25.9+ MB
None


# **Séparation des données**

In [9]:
# Settings forwarded to `split_data`
test_split = 0.2
sampling = True
proportion = 0.25

# Run the split, then unpack the four parts it returns
splits = split_data(
    df,
    test_split=test_split,
    sampling=sampling,
    proportion=proportion,
)
X_train, X_test, y_train, y_test = splits

# Report the shape of each part
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(319326,) (79832,) (319326,) (79832,)
