In [36]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer

# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import accuracy_score
import pickle
import dagshub
import tensorflow as tf
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import mlflow
import warnings
from tensorflow.keras.layers import TextVectorization

In [4]:
# Remove FutureWarning alerts
warnings.filterwarnings("ignore", category=FutureWarning)

# Initialize dagshub repo
try:
    dagshub.init(repo_owner="hedredo", repo_name="dagshub_p7", mlflow=True)
    mlflow.set_experiment("p7")
except Exception as e:
    print("Dagshub repo can't be initialized:", e)

# Initialiser tqdm pour pandas
tqdm.pandas()

# Set a random seed
SEED = 314
np.random.seed(SEED)
print("Random seed set to", SEED)

Random seed set to 314


In [6]:
# Check if GPU and CUDA are available
gpu = tf.config.list_physical_devices("GPU")
cuda = torch.cuda.is_available()
print("Tensorflow framework: GPU is", "available" if gpu else "NOT AVAILABLE")
print("Pytorch framework: CUDA is", "available" if cuda else "NOT AVAILABLE")

Tensorflow framework: GPU is available
Pytorch framework: CUDA is NOT AVAILABLE


# **Chargement des données préparées**

**COMMENTS**:
- Chargement des données du parquet en entier
- Ou supprimer cette section et passer le chargement du parquet dans séparation des données - split data
- Import de token_params pour les paramètres de tokenisation

In [71]:
# Load the pickle file containing the columns
with open("./data/processed/columns.pkl", "rb") as f:
    cols = pickle.load(f)

In [72]:
path = "./data/processed/df_preprocessed.parquet"
df = pd.read_parquet(
    path,
    columns=["text", "target"],
    engine="pyarrow",
    use_nullable_dtypes=False,
)

# **Séparation des données**

**COMMENTS**:
- Charger le parquet dans la fonction si possible en fonction de la liste token_params
- Mettre un argument pour la liste des colonnes à charger sinon

In [73]:
def split_data(df, test_split=0.2, sampling=True, proportion=0.01):
    """
    Split the data into train and test sets
    :param df: Dataframe to split
    :param n_rows: Number of rows in the dataframe
    :return: X_train, X_test, y_train, y_test
    """

    # Split the data with sampling or
    if sampling:
        df = df.sample(frac=proportion, random_state=SEED)

    # Define X and y
    X, y = df.iloc[:, 0], df["target"]

    # Split the data with a 0.2 test size
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split, stratify=y, random_state=SEED
    )

    # Display shape of splits
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)

    # Return the splits
    return X_train, X_test, y_train, y_test

In [74]:
X_train, X_test, y_train, y_test = split_data(
    df, test_split=0.2, sampling=True, proportion=0.01
)

X_train shape: (12772,)
X_test shape: (3194,)
y_train shape: (12772,)
y_test shape: (3194,)


In [99]:
def df_align(col, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    # Load the df with the column name
    df = pd.read_parquet(
        path,
        columns=[col, "target"],
        engine="pyarrow",
        use_nullable_dtypes=False,
    )
    # Get the indexes to keep the same order after alignment
    train_index = X_train.index
    test_index = X_test.index

    # Split the data with alignment and reindexing to keep the same order
    X_train, _ = df.filter(like=col).align(X_train, join="inner", axis=0)
    X_train = X_train.reindex(train_index)
    X_test, _ = df.filter(like=col).align(X_test, join="inner", axis=0)
    X_test = X_test.reindex(test_index)
    y_train, _ = df.target.align(y_train, join="inner", axis=0)
    y_train = y_train.reindex(train_index)
    y_test, _ = df.target.align(y_test, join="inner", axis=0)
    y_test = y_test.reindex(test_index)

    # Return the aligned and sorted new splits
    return X_train, X_test, y_train, y_test

# **Modélisation**

**COMMENTS**:
- Création du pipeline modulable
- Grille de paramètres pour le vectorizer
- Grille de paramètres pour les modèles (LG, MNB)

In [None]:
experiment = False
if experiment:
    with mlflow.start_run():
        # Load the file
        # Fit and transform the tf-idf vectorizer on the text column
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=1000,
            strip_accents="unicode",
        )
        X_embed = tfidf.fit_transform(X_train["text"].str.lower())
        # Initialize the model
        model = LogisticRegression(max_iter=1000)
        # Add the input example
        input_logit = X_embed[0]
        input_tfidf = X_train["text"].str.lower().values[0]
        # Perform a cross-validation
        scores = cross_validate(
            model, X_embed, y_train, cv=5, scoring=["accuracy", "f1"]
        )
        # Your training code here...
        model.fit(X_embed, y_train)
        y_pred = model.predict(tfidf.transform(X_test["text"].str.lower()))
        scores = pd.DataFrame(scores).mean()
        acc_score = accuracy_score(y_test, y_pred)
        for metric in scores.keys():
            mlflow.log_metric(f"val_{metric}", scores[metric])
        mlflow.log_metric("test_accuracy", acc_score)
        mlflow.log_param("Dimension", X_embed.shape[1])
        mlflow.log_param("Tf-Idf params", tfidf.get_params())
        mlflow.log_param("Logistic Regression params", model.get_params())
        mlflow.sklearn.log_model(
            model, "logistic_regression", input_example=input_logit
        )
        mlflow.sklearn.log_model(
            tfidf, "tfidf_vectorizer", input_example=input_tfidf, signature=False
        )

In [None]:
# Fit and transform the tf-idf vectorizer on the text column
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=1000,
    strip_accents="unicode",
)
X_embed = tfidf.fit_transform(X_train["text"].str.lower())



In [None]:
# Initialize the model
model = MultinomialNB()

# Perform a cross-validation
scores = cross_validate(model, X_embed, y_train, cv=5, scoring=["accuracy", "f1"])

# Display the scores
pd.DataFrame(scores).mean()

fit_time         0.003252
score_time       0.002088
test_accuracy    0.722909
test_f1          0.718497
dtype: float64

In [None]:
# Initialize the model
model = LogisticRegression(max_iter=1000)

# Perform a cross-validation
scores = cross_validate(model, X_embed, y_train, cv=5, scoring=["accuracy", "f1"])

# Display the scores
pd.DataFrame(scores).mean()

fit_time         0.023068
score_time       0.002958
test_accuracy    0.735358
test_f1          0.735984
dtype: float64

Comme nous avons énormément de données à analyser, nous allons tester la performance du modèle en utilisant uniquement les features à notre disposition.<br>
Le modèle sera plus rapide s'il reçoit en entrée une sparse matrix plutôt qu'une dense matrix.<br>

# **Options de standardisation des textes**

In [None]:
batch_size = 32
seed = SEED


In [None]:
# Create a text datagenerator for the training set
train_datagen = TextVectorization(
    max_tokens=1000,
    output_sequence_length=100,
    output_mode="int",
)

In [None]:
# Fit the datagenerator on the training set
train_datagen.adapt(texts)

ValueError: in user code:

    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/engine/base_preprocessing_layer.py", line 123, in adapt_step  *
        self.update_state(data)
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/layers/preprocessing/text_vectorization.py", line 470, in update_state  **
        self._lookup_layer.update_state(self._preprocess(data))
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/layers/preprocessing/text_vectorization.py", line 555, in _preprocess
        inputs = self._standardize(inputs)
    File "/tmp/ipykernel_41092/358720669.py", line 8, in spacy_preprocess
        doc = nlp(text)
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/spacy/language.py", line 1040, in __call__
        doc = self._ensure_doc(text)
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/spacy/language.py", line 1134, in _ensure_doc
        raise ValueError(Errors.E1041.format(type=type(doc_like)))

    ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'tensorflow.python.framework.ops.Tensor'>


In [None]:
# Create a text datagenerator for the training set
train_datagen = TextVectorization(
    standardize=custom_standardization,
    max_tokens=1000,
    output_sequence_length=100,
    output_mode="int",
)

In [None]:
# Fit the datagenerator on the training set
train_datagen.adapt(texts)

2024-10-25 18:48:01.481859: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [None]:
# Look at the output of the datagenerator
train_datagen(texts)

<tf.Tensor: shape=(12772, 100), dtype=int64, numpy=
array([[329,  25,  87, ...,   0,   0,   0],
       [449,   1, 468, ...,   0,   0,   0],
       [  1,  62,  19, ...,   0,   0,   0],
       ...,
       [177,  15,   5, ...,   0,   0,   0],
       [ 58,   8, 698, ...,   0,   0,   0],
       [270,   1,   4, ...,   0,   0,   0]])>

In [None]:
# Create a text datagenerator for the training set
train_datagen = TextVectorization(
    standardize=custom_standardization,
    max_tokens=1000,
    output_sequence_length=100,
    output_mode="int",
)

In [None]:
# Fit the datagenerator on the training set
train_datagen.adapt(texts)

2024-10-25 13:28:24.761537: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [None]:
# Look at the output of the datagenerator
train_datagen(texts)

<tf.Tensor: shape=(12772, 100), dtype=int64, numpy=
array([[337,  25,   1, ...,   0,   0,   0],
       [  1,   1,   1, ...,   0,   0,   0],
       [  1,  52,  16, ...,   0,   0,   0],
       ...,
       [176,  14,   5, ...,   0,   0,   0],
       [ 50,   8, 699, ...,   0,   0,   0],
       [361,   1,   4, ...,   0,   0,   0]])>

In [None]:
embedding_dim = 16
max_features = 1000

In [None]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(max_features, embedding_dim),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 16)          16000     
                                                                 
 dropout_3 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_4 (Dropout)         (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 16,017
Trainable params: 16,017
Non-trainable params: 0
__________________________________________________

In [None]:
model.compile(
    loss=tf.losses.BinaryCrossentropy(),
    optimizer="adam",
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)],
)

In [None]:
# compile a train dataset with X_train and y_train to pass it in model.fit
train_dataset = tf.data.Dataset.from_tensor_slices(
    (X_train["text"].values, y_train.values)
)

In [None]:
epochs = 2
history = model.fit(
    train_dataset,
    batch_size=batch_size,
    # validation_data=val_ds,
    epochs=epochs,
)

Epoch 1/2


ValueError: in user code:

    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/engine/training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/hedredo/github_repo/dagshub_p7/.conda/lib/python3.10/site-packages/keras/engine/input_spec.py", line 232, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "sequential_1" "                 f"(type Sequential).
    
    Input 0 of layer "global_average_pooling1d_1" is incompatible with the layer: expected ndim=3, found ndim=1. Full shape received: (16,)
    
    Call arguments received by layer "sequential_1" "                 f"(type Sequential):
      • inputs=tf.Tensor(shape=(), dtype=string)
      • training=True
      • mask=None
