In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer

# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import accuracy_score
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
import mlflow
import mlflow.tensorflow
import mlflow.sklearn
import warnings
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import matplotlib.pyplot as plt
from tensorflow.keras import backend as K
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download

2024-11-22 16:22:16.354540: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-22 16:22:15.359909: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdirectml.d6f03b303ac3c4f2eeb8ca631688c9757b361310.so
2024-11-22 16:22:15.359973: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libdxcore.so
2024-11-22 16:22:15.365150: I tensorflow/c/logging.cc:34] Successfully opened dynamic library libd3d12.so
Dropped Escape call with ulEscapeCode : 0x03007703
Dropped Escape call with ulEscapeCode : 0x03007703
2024-11-22 16:22:15.969853: I tensorflow/c/logging.cc:34] DirectML device enumeration: found 1 compatible adapters.


In [2]:
# Remove FutureWarning alerts
warnings.filterwarnings("ignore", category=FutureWarning)

# Initialiser tqdm pour pandas
tqdm.pandas()

# Set a random seed
SEED = 314
np.random.seed(SEED)
print("Random seed set to", SEED)

Random seed set to 314


In [3]:
# Check if GPU and CUDA are available
gpu = tf.config.list_physical_devices("GPU")
print("Tensorflow framework: GPU is", "available" if gpu else "NOT AVAILABLE")

Tensorflow framework: GPU is available


**COMMENTS**:
- Chargement des données du parquet en entier
- Ou supprimer cette section et passer le chargement du parquet dans séparation des données - split data
- Import de token_params pour les paramètres de tokenisation

In [4]:
# Load the pickle file containing the columns
with open("../data/processed/columns.pkl", "rb") as f:
    cols = pickle.load(f)

In [5]:
# reorder the columns in cols moving the column after hour column
cols = cols.reindex(["hour", "target", "text", *cols[3:]])

In [6]:
cols

(Index(['hour', 'target', 'text', 'tokenizer with lowercase',
        'tokenizer with lowercase, handle stripping, and length reduction',
        'tokenizer with lowercase and alpha',
        'tokenizer with lowercase, alpha and emoji',
        'tokenizer with lowercase, alpha, and no stop words',
        'tokenizer with lowercase, alpha and emoji, and no stop words'],
       dtype='object'),
 array([2, 0, 1, 3, 4, 5, 6, 7, 8]))

In [7]:
path = "../data/processed/df_preprocessed.parquet"
df = pd.read_parquet(
    path,
    columns=["text", "target"],
    engine="pyarrow",
    use_nullable_dtypes=False,
)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1596630 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1596630 non-null  object
 1   target  1596630 non-null  int8  
dtypes: int8(1), object(1)
memory usage: 25.9+ MB


# **Séparation des données**

**COMMENTS**:
- Charger le parquet dans la fonction si possible en fonction de la liste token_params
- Mettre un argument pour la liste des colonnes à charger sinon

In [11]:
from utils import split_data, load_splits_from_parquet, to_tensorflow_dataset

In [12]:
proportion = 0.10
sampling = True
test_split = 0.2

X_train, X_test, y_train, y_test = split_data(
    df,
    test_split=test_split,
    sampling=sampling,
    proportion=proportion,
)

In [9]:
def split_data(df, test_split=0.2, sampling=True, proportion=0.01, stratify=True):
    """
    Split the data into train and test sets
    :param test_split: Proportion of the data to include in the test split
    :param sampling: Whether to sample the data
    :param proportion: Proportion of the data to sample
    :return: X_train, X_test, y_train, y_test
    """
    # Sample the data if needed to reduce the size
    if sampling:
        df_sample = df.sample(frac=proportion, random_state=SEED)
    else:
        df_sample = df
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        df_sample.iloc[:, 0],
        df_sample.iloc[:, -1],
        test_size=test_split,
        stratify=df_sample.iloc[:, -1] if stratify else None,
        random_state=SEED,
    )
    # Delete the sample dataframe to free up memory
    del df_sample
    # Return the train and test sets
    return X_train, X_test, y_train, y_test

In [1]:
def load_splits_from_parquet(X_train, X_test, y_train, y_test, col_name, path):
    """
    Load and align train and test splits from a parquet file.
    This function reads a parquet file containing a corpus and a target column, 
    aligns the dataframes with the provided train and test splits, and reindexes 
    them to match the original indices.
    Parameters:
    X_train (pd.DataFrame): Training features dataframe.
    X_test (pd.DataFrame): Testing features dataframe.
    y_train (pd.Series): Training target series.
    y_test (pd.Series): Testing target series.
    col_name (str): The name of the column to be used as features.
    path (str): The path to the parquet file.
    Returns:
    tuple: A tuple containing the aligned and reindexed training features, 
           testing features, training target, and testing target.
    """
    # Read the parquet file with corpus
    df = pd.read_parquet(
        path,
        columns=[col_name, "target"],
        engine="pyarrow",
        use_nullable_dtypes=False,
    )
    # Keep track of the indices
    train_index, test_index = X_train.index, X_test.index
    # Align the dataframes and reindex in the same order
    X_train, _ = df.filter(like=col_name).align(X_train, join="inner", axis=0)
    X_train = X_train.reindex(train_index)
    X_test, _ = df.filter(like=col_name).align(X_test, join="inner", axis=0)
    X_test = X_test.reindex(test_index)
    y_train, _ = df.target.align(y_train, join="inner", axis=0)
    y_train = y_train.reindex(train_index)
    y_test, _ = df.target.align(y_test, join="inner", axis=0)
    y_test = y_test.reindex(test_index)

    # Delete the dataframe to free up memory
    del df

    # Return the aligned data with features squeezed to remove the extra dimension if necessary
    return X_train.squeeze(), X_test.squeeze(), y_train, y_test

In [2]:
def to_tensorflow_dataset(X_train, X_test, y_train, y_test, col_name, path, validation_split=0.2, batch_size=32):
    """
    Converts training and testing data into TensorFlow datasets.

    Parameters:
    X_train (pd.DataFrame): Training features.
    X_test (pd.DataFrame): Testing features.
    y_train (pd.DataFrame): Training labels.
    y_test (pd.DataFrame): Testing labels.
    col_name (str): Column name to align splits with the corpus.
    path (str): Path to the parquet file.
    validation_split (float, optional): Fraction of the training data to be used as validation data. Default is 0.2.
    batch_size (int, optional): Number of samples per batch. Default is 32.

    Returns:
    tuple: A tuple containing three TensorFlow datasets (train_ds, val_ds, test_ds).
    """
    # Align the splits with the corpus directly from dataframe
    X_train, X_test, y_train, y_test = load_splits_from_parquet(
        X_train,
        X_test,
        y_train,
        y_test,
        col_name=col_name,
        path=path,
    )
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=validation_split, stratify=y_train, random_state=SEED
    )
    # Create the tensorflow datasets for train, val and test
    train_ds = tf.data.Dataset.from_tensor_slices((X_train_split, y_train_split)).batch(batch_size)
    val_ds = tf.data.Dataset.from_tensor_slices((X_val_split, y_val_split)).batch(batch_size)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
    # Return the tensorflow datasets
    return train_ds, val_ds, test_ds

In [12]:
proportion = 0.10
sampling = True
test_split = 0.2

X_train, X_test, y_train, y_test = split_data(
    df,
    test_split=test_split,
    sampling=sampling,
    proportion=proportion,
)

In [13]:
# Check the size of X_train and X_test
X_train.shape, X_test.shape

((127730,), (31933,))

**COMMENTS**:
- Création du pipeline modulable
- Grille de paramètres pour le vectorizer
- Grille de paramètres pour les modèles (LG, MNB)

## **TF-IDF**

In [15]:
experiment = False
if experiment:
    with mlflow.start_run():
        # Load the file
        # Fit and transform the tf-idf vectorizer on the text column
        tfidf = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=1000,
            strip_accents="unicode",
        )
        X_embed = tfidf.fit_transform(X_train["text"].str.lower())
        # Initialize the model
        model = LogisticRegression(max_iter=1000)
        # Add the input example
        input_logit = X_embed[0]
        input_tfidf = X_train["text"].str.lower().values[0]
        # Perform a cross-validation
        scores = cross_validate(
            model, X_embed, y_train, cv=5, scoring=["accuracy", "f1"]
        )
        # Your training code here...
        model.fit(X_embed, y_train)
        y_pred = model.predict(tfidf.transform(X_test["text"].str.lower()))
        scores = pd.DataFrame(scores).mean()
        acc_score = accuracy_score(y_test, y_pred)
        for metric in scores.keys():
            mlflow.log_metric(f"val_{metric}", scores[metric])
        mlflow.log_metric("test_accuracy", acc_score)
        mlflow.log_param("Dimension", X_embed.shape[1])
        mlflow.log_param("Tf-Idf params", tfidf.get_params())
        mlflow.log_param("Logistic Regression params", model.get_params())
        mlflow.sklearn.log_model(
            model, "logistic_regression", input_example=input_logit
        )
        mlflow.sklearn.log_model(
            tfidf, "tfidf_vectorizer", input_example=input_tfidf, signature=False
        )

In [16]:
from sklearn.pipeline import Pipeline

In [17]:
def cross_score(X_train, y_train, model, col, cv=10):
    """
    Perform a cross-validation on the model
    :param X_train: Training data
    :param y_train: Training target
    :param model: Model to train
    :param cv: Number of folds
    :return: scores
    """
    # Perform a cross-validation
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=["accuracy", "f1"])
    # Return the mean scores
    return pd.DataFrame(scores).mean().rename(col)

In [18]:
# create a pipeline with Tf-Idf and Logistic Regression
model = Pipeline(
    [
        (
            "tfidf",
            TfidfVectorizer(
                ngram_range=(1, 3),
                max_features=1000,
                strip_accents="unicode",
            ),
        ),
        ("logistic", LogisticRegression(max_iter=1000)),
    ]
)

In [19]:
scores_list = []
for col in tqdm(cols[0][2:]):
    # Align the data
    if col != "text":
        X_train, X_test, y_train, y_test = splitter.align_splits_from_df(
            X_train, X_test, y_train, y_test, col
        )
    # Execute the cross-score function
    scores = cross_score(X_train, y_train, model, col, cv=10)
    # Display the scores
    scores_list.append(scores)
# Concatenate the scores
scores_df = pd.concat(scores_list, axis=1)
# Display the scores
scores_df

  0%|          | 0/7 [00:01<?, ?it/s]


KeyboardInterrupt: 

Le retrait de certain stop words affecte probablement la capacité du modèle à prédire correctement le sentiment.<br>
Par exemple, le mot "not" est un stop word et est retiré de la liste des mots à analyser. Cela peut affecter la prédiction du sentiment si le mot "not" est important pour la prédiction du sentiment.

In [None]:
# create a pipeline with Tf-Idf and Logistic Regression
model = Pipeline(
    [
        (
            "tfidf",
            TfidfVectorizer(
                ngram_range=(1, 3),
                max_features=1000,
                strip_accents="unicode",
            ),
        ),
        ("MultinomialNB", MultinomialNB()),
    ]
)

In [None]:
scores_list = []
for col in tqdm(cols[0][2:]):
    # Align the data
    X_train, X_test, y_train, y_test = splitter.align_splits_from_df(
        X_train, X_test, y_train, y_test, col
    )
    # Execute the cross-score function
    scores = cross_score(X_train, y_train, model, col, cv=10)
    # Display the scores
    scores_list.append(scores)
# Concatenate the scores
scores_df = pd.concat(scores_list, axis=1)
# Display the scores
scores_df

100%|██████████| 7/7 [02:45<00:00, 23.68s/it]


Unnamed: 0,text,tokenizer with lowercase,"tokenizer with lowercase, handle stripping, and length reduction",tokenizer with lowercase and alpha,"tokenizer with lowercase, alpha and emoji","tokenizer with lowercase, alpha, and no stop words","tokenizer with lowercase, alpha and emoji, and no stop words"
fit_time,2.724755,2.84383,2.538568,2.455027,2.245535,1.202892,1.473164
score_time,0.096113,0.09904,0.090046,0.083166,0.083293,0.051234,0.049753
test_accuracy,0.738229,0.739121,0.738229,0.737368,0.737368,0.719173,0.719173
test_f1,0.728693,0.730365,0.728495,0.730633,0.730633,0.710744,0.710744


## **CUSTOM NN**

In [15]:
URI = "http://localhost:5000"

In [19]:
# define the name of your experiment
experiment = 'neural_network_scratch_embedding'

# Set the tracking URI
mlflow.set_tracking_uri(URI)
    # try to connect to the server
try:
    mlflow.tracking.get_tracking_uri()
except Exception as e:
    print(f"Cannot connect to the server : {URI}. Check the server status.")
    raise e
# Set, and create if necessary, the experiment
try:
    mlflow.create_experiment(experiment)
except:
    pass
mlflow.set_experiment(experiment)

<Experiment: artifact_location='mlflow-artifacts:/951305245308831132', creation_time=1731164356530, experiment_id='951305245308831132', last_update_time=1731164356530, lifecycle_stage='active', name='neural_network_scratch_embedding', tags={}>

In [20]:
# Args for dataset preparation
col_name = "text"
val_split = 0.2
batch_size = 32

# Args for the model
max_tokens = 5000
seq_length = 100
embedding_dim = 16
embedding_trainable = True
epochs = 15
additionnal_layers = [(tf.keras.layers.GlobalAveragePooling1D(),), (tf.keras.layers.GlobalMaxPooling1D(),),]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [21]:
for layers, run_name in zip(additionnal_layers, runs):
    # Create the datasets
    train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)
    # Create the model
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers)
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        with tf.device('/GPU:0'): 
        # Fit the model
            history = model.fit(
                        train_ds,
                        validation_data=val_ds,
                        epochs=epochs,
                        verbose=1,
                    )
            mlflow.log_param("batch_size_", batch_size)
            mlflow.log_param("validation_split_",val_split)

            # Evaluate the model
            print("Evaluate on test data")
            print("==============")
            loss, accuracy = model.evaluate(test_ds)
            mlflow.log_metric("test_loss", loss)
            mlflow.log_metric("test_accuracy", accuracy)

2024-11-22 15:30:02.875620: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-22 15:30:02.876771: I tensorflow/c/logging.cc:34] DirectML: creating device on adapter 0 (AMD Radeon RX 6700 XT)
Dropped Escape call with ulEscapeCode : 0x03007703
2024-11-22 15:30:02.953779: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-22 15:30:02.953811: W tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc:37] Ignoring the value of TF_FORCE_GPU_ALLOW_GROWTH because force_memory_growth was requested by the device.
2024-11-22 15:30:02.953833: I t

Vocabulary size:  5000
Epoch 1/15
   2/3194 [..............................] - ETA: 3:31 - loss: 0.6941 - binary_accuracy: 0.4688 

2024-11-22 15:30:07.464260: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-22 15:30:07.518930: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-22 15:30:07.518979: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25406 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-22 15:30:07.520350: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-22 15:30:07.520398: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-22 15:30:47.720505: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-22 15:30:47.742656: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-22 15:30:47.742707: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25406 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
 652/3194 [=====>........................] - ETA: 1:24 - loss: 0.4570 - binary_accuracy: 0.7982

: 

In [29]:
# Args for dataset preparation
col_name = "text"
val_split = 0.2
batch_size = 32
# Create the datasets
train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)

# Args for the model
max_tokens = None
seq_length = 100
embedding_dim = 16
embedding_trainable = True
epochs = 10
additionnal_layers = [(tf.keras.layers.GlobalAveragePooling1D(),), (tf.keras.layers.GlobalMaxPooling1D(),),]
runs = ("GlobalAveragePooling1D with max tokens up to vocab size", "GlobalMaxPooling1D with max tokens up to vocab size")

In [30]:
for layers, run_name in zip(additionnal_layers, runs):
    # Create the model
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers)
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
                    train_ds,
                    validation_data=val_ds,
                    epochs=epochs,
                    verbose=1,
                )
        mlflow.log_param("batch_size_", batch_size)
        mlflow.log_param("validation_split_",val_split)

        # Evaluate the model
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-22 13:12:41.266907: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  13729
Epoch 1/10
  1/320 [..............................] - ETA: 2:11 - loss: 0.6923 - binary_accuracy: 0.5625

2024-11-22 13:12:42.269346: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-22 13:12:42.320667: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-22 13:12:42.320725: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-22 13:12:42.323086: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-22 13:12:42.323126: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



: 

In [None]:
# Libérer la mémoire GPU et réinitialiser le graphe par défaut
K.clear_session()
tf.compat.v1.reset_default_graph()

In [34]:
# Args for dataset preparation
col_name = "text"
val_split = 0.2
batch_size = 32
# Create the datasets
train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)

# Args for the model
max_tokens = 5000
seq_length = 100
embedding_dim = 16
embedding_trainable = True
epochs = 3
additionnal_layers = [(tf.keras.layers.GlobalAveragePooling1D(),), (tf.keras.layers.GlobalMaxPooling1D(),),]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [36]:
for layers, run_name in zip(additionnal_layers, runs):
    # Create the model
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers)
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
                    train_ds,
                    validation_data=val_ds,
                    epochs=epochs,
                    verbose=1,
                )
        mlflow.log_param("batch_size_", batch_size)
        mlflow.log_param("validation_split_",val_split)

        # Evaluate the model
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-09 17:30:26.817384: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3
 11/320 [>.............................] - ETA: 3s - loss: 0.6933 - binary_accuracy: 0.4972

2024-11-09 17:30:27.688117: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:27.731275: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:27.731323: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:30:27.732863: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:27.732895: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:30:31.361600: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:31.378384: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:31.378421: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:30:42.458121: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpdc2n8q85/model/data/model/assets




Evaluate on test data


2024/11/09 17:30:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/2e98affd63eb49b08dc85d91082eb816.
2024/11/09 17:30:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-09 17:30:45.843025: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3
 11/320 [>.............................] - ETA: 3s - loss: 0.6938 - binary_accuracy: 0.4943

2024-11-09 17:30:46.683277: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:46.725649: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:46.725690: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-09 17:30:46.726960: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:46.726991: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-09 17:30:50.986825: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-09 17:30:51.003672: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-09 17:30:51.003715: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-09 17:30:59.112651: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpohs4si10/model/data/model/assets




Evaluate on test data


2024/11/09 17:31:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalMaxPooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/f25d1e2563a34c78b0ad86e5a0cb609b.
2024/11/09 17:31:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.


## **GLOVE EMBEDDINGS**

In [41]:
# define the name of your experiment
experiment = 'neural_network_scratch_embedding'

# Set the tracking URI
mlflow.set_tracking_uri(URI)
    # try to connect to the server
try:
    mlflow.tracking.get_tracking_uri()
except Exception as e:
    print(f"Cannot connect to the server : {URI}. Check the server status.")
    raise e
# Set, and create if necessary, the experiment
try:
    mlflow.create_experiment(experiment)
except:
    pass
mlflow.set_experiment(experiment)

<Experiment: artifact_location='mlflow-artifacts:/951305245308831132', creation_time=1731164356530, experiment_id='951305245308831132', last_update_time=1731164356530, lifecycle_stage='active', name='neural_network_scratch_embedding', tags={}>

In [42]:
# Load the glove-twitter-100 model
repo_id = "fse/glove-twitter-100"
model_file = hf_hub_download(repo_id=repo_id, filename="glove-twitter-100.model")
vector_file = hf_hub_download(
    repo_id=repo_id, filename="glove-twitter-100.model.vectors.npy"
)
glove = KeyedVectors.load(model_file, mmap="r")

In [43]:
# Args for dataset preparation
col_name = "text"
val_split = 0.2
batch_size = 32
# Create the datasets
train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)

# Args for the model
max_tokens = 5000
seq_length = 100
embedding_dim = 100
embedding_trainable = False
epochs = 3
additionnal_layers = [(tf.keras.layers.GlobalAveragePooling1D(),), (tf.keras.layers.GlobalMaxPooling1D(),),]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [44]:
for layers, run_name in zip(additionnal_layers, runs):
    # Create the model
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers, pretrained_weights=glove)
    # set the embedding layer trainable or not
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
                    train_ds,
                    validation_data=val_ds,
                    epochs=epochs,
                    verbose=1,
                )
        mlflow.log_param("batch_size_", batch_size)
        mlflow.log_param("validation_split_",val_split)

        # Evaluate the model
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-15 08:31:03.348656: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3


2024-11-15 08:31:04.347457: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 08:31:04.393588: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:31:04.393636: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-15 08:31:04.394816: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:31:04.394848: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f







2024-11-15 08:31:20.894841: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 08:31:20.912436: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:31:20.912472: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-15 08:31:49.025465: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpw7glr470/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpw7glr470/model/data/model/assets


Evaluate on test data


2024/11/15 08:31:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/4810a38cb8d44cc1ab89ca5cb195f89b.
2024/11/15 08:31:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-15 08:31:55.343468: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3
  3/320 [..............................] - ETA: 13s - loss: 0.8577 - binary_accuracy: 0.5312 

2024-11-15 08:31:56.183398: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 08:31:56.227973: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:31:56.228022: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-15 08:31:56.229208: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:31:56.229241: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f







2024-11-15 08:32:09.447390: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 08:32:09.464986: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:32:09.465040: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-15 08:32:39.618323: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmpo0bbg_7h/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpo0bbg_7h/model/data/model/assets


Evaluate on test data


2024/11/15 08:32:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalMaxPooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/16ee04974e114b37baf27955c53a7524.
2024/11/15 08:32:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.


## **FASTTEXT EMBEDDINGS + CUSTOM NN**

In [29]:
import fasttext

In [32]:
# Load fasttext embeddings trained on twitter data
model_path = hf_hub_download(
    repo_id="facebook/fasttext-en-vectors", filename="model.bin"
)
fastxt = fasttext.load_model(model_path)

In [None]:
# Args for dataset preparation
col_name = "text"
val_split = 0.2
batch_size = 32
# Create the datasets
train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)

# Args for the model
max_tokens = 5000
seq_length = 100
embedding_dim = 300
embedding_trainable = False
epochs = 3
additionnal_layers = [(tf.keras.layers.GlobalAveragePooling1D(),), (tf.keras.layers.GlobalMaxPooling1D(),),]
runs = ("GlobalAveragePooling1D", "GlobalMaxPooling1D")

In [40]:
for layers, run_name in zip(additionnal_layers, runs):
    # Create the model
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers, pretrained_weights=fastxt)
    # set the embedding layer trainable or not
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
                    train_ds,
                    validation_data=val_ds,
                    epochs=epochs,
                    verbose=1,
                )
        mlflow.log_param("batch_size_", batch_size)
        mlflow.log_param("validation_split_",val_split)

        # Evaluate the model
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-15 08:07:53.902185: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3


2024-11-15 08:07:54.894560: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 08:07:54.963887: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:07:54.964204: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-15 08:07:54.967326: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:07:54.967369: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



2024-11-15 08:08:35.003325: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 08:08:35.022149: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:08:35.022196: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)


Epoch 2/3
Epoch 3/3


2024-11-15 08:11:33.948292: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: /tmp/tmp6qbj5tsq/model/data/model/assets




Evaluate on test data


2024/11/15 08:11:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run GlobalAveragePooling1D at: http://localhost:5000/#/experiments/951305245308831132/runs/b2c0e220bed04349a5fef17951492a7d.
2024/11/15 08:11:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/951305245308831132.
2024-11-15 08:11:40.400349: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  5000
Epoch 1/3


2024-11-15 08:11:41.281596: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 08:11:41.325276: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:11:41.325319: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-15 08:11:41.326524: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 08:11:41.326558: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f



: 

## BIDERECTIONAL LSTM

In [23]:
# define the name of your experiment
experiment = 'neural_network_SEQ_embedding'

# Set the tracking URI
mlflow.set_tracking_uri(URI)
    # try to connect to the server
try:
    mlflow.tracking.get_tracking_uri()
except Exception as e:
    print(f"Cannot connect to the server : {URI}. Check the server status.")
    raise e
# Set, and create if necessary, the experiment
try:
    mlflow.create_experiment(experiment)
except:
    pass
mlflow.set_experiment(experiment)

<Experiment: artifact_location='mlflow-artifacts:/846190965426187584', creation_time=1731654957941, experiment_id='846190965426187584', last_update_time=1731654957941, lifecycle_stage='active', name='neural_network_SEQ_embedding', tags={}>

In [20]:
# Args for dataset preparation
col_name = "text"
val_split = 0.2
batch_size = 32
# Create the datasets
train_ds, val_ds, test_ds = prepare_tf_dataset(col_name, val_split, batch_size)

# Args for the model
max_tokens = 1000
seq_length = 100
embedding_dim = 16
embedding_trainable = True
epochs = 15
additionnal_layers = [(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8, return_sequences=True)), tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8))),]
runs = ("BiderectionalLSTM", )

In [25]:
for layers, run_name in zip(additionnal_layers, runs):
    # Create the model
    model = create_tf_model(max_tokens, seq_length, embedding_dim, layers)
    model.get_layer("embedding").trainable = embedding_trainable
    mlflow.tensorflow.autolog(checkpoint=False, log_models=True)
    with mlflow.start_run(run_name=run_name):
        # Fit the model
        history = model.fit(
                            train_ds,
                            validation_data=val_ds,
                            epochs=epochs,
                            verbose=1,
                        )
        mlflow.log_param("batch_size_", batch_size)
        mlflow.log_param("validation_split_",val_split)

        # Evaluate the model
        print("Evaluate on test data")
        print("==============")
        loss, accuracy = model.evaluate(test_ds)
        mlflow.log_metric("test_loss", loss)
        mlflow.log_metric("test_accuracy", accuracy)

2024-11-15 19:59:42.045555: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Vocabulary size:  1000




Epoch 1/15


2024-11-15 19:59:50.504612: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-15 19:59:50.831360: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 19:59:50.831405: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 25405 MB memory) -> physical PluggableDevice (device: 0, name: DML, pci bus id: <undefined>)
2024-11-15 19:59:50.832566: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 19:59:50.832599: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_f

InvalidArgumentError: Graph execution error:

No OpKernel was registered to support Op 'CudnnRNN' used by {{node CudnnRNN}} with these attrs: [T=DT_FLOAT, input_mode="linear_input", direction="unidirectional", rnn_mode="lstm", seed2=0, is_training=true, dropout=0, seed=0]
Registered devices: [CPU, GPU]
Registered kernels:
  <no registered kernels>

	 [[CudnnRNN]]
	 [[sequential_2/bidirectional_2/forward_lstm_2/PartitionedCall]] [Op:__inference_train_function_55753]

## SENTENCE TRANSFORMER

In [46]:
from sentence_transformers import SentenceTransformer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [65]:
# Load the x_train data aligned on cleaned text corpus
X_train, X_test, y_train, y_test = load_splits_from_parquet(X_train, X_test, y_train, y_test, col_name='tokenizer with lowercase')

In [66]:
model = SentenceTransformer("all-MiniLM-L6-v2")
X_train_encoded = model.encode(X_train.values, device='cpu')
X_test_encoded = model.encode(X_test.values, device='cpu')

In [67]:
rfc = RandomForestClassifier()
rfc.fit(X_train_encoded, y_train)
preds = rfc.predict(X_test_encoded)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.73      0.76      0.75      1601
           1       0.75      0.72      0.74      1593

    accuracy                           0.74      3194
   macro avg       0.74      0.74      0.74      3194
weighted avg       0.74      0.74      0.74      3194



In [68]:
logit = LogisticRegression()
logit.fit(X_train_encoded, y_train)
preds = logit.predict(X_test_encoded)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.78      0.77      0.78      1601
           1       0.77      0.79      0.78      1593

    accuracy                           0.78      3194
   macro avg       0.78      0.78      0.78      3194
weighted avg       0.78      0.78      0.78      3194



In [59]:
cols

(Index(['hour', 'target', 'text', 'tokenizer with lowercase',
        'tokenizer with lowercase, handle stripping, and length reduction',
        'tokenizer with lowercase and alpha',
        'tokenizer with lowercase, alpha and emoji',
        'tokenizer with lowercase, alpha, and no stop words',
        'tokenizer with lowercase, alpha and emoji, and no stop words'],
       dtype='object'),
 array([2, 0, 1, 3, 4, 5, 6, 7, 8]))

In [61]:
# Load the x_train data aligned on cleaned text corpus
X_train, X_test, y_train, y_test = load_splits_from_parquet(X_train, X_test, y_train, y_test, col_name='tokenizer with lowercase, handle stripping, and length reduction')

In [62]:
model = SentenceTransformer("all-MiniLM-L6-v2")
X_train_encoded = model.encode(X_train.values, device='cpu')
X_test_encoded = model.encode(X_test.values, device='cpu')

In [63]:
rfc = RandomForestClassifier()
rfc.fit(X_train_encoded, y_train)
preds = rfc.predict(X_test_encoded)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.74      0.74      0.74      1601
           1       0.74      0.74      0.74      1593

    accuracy                           0.74      3194
   macro avg       0.74      0.74      0.74      3194
weighted avg       0.74      0.74      0.74      3194



In [64]:
logit = LogisticRegression()
logit.fit(X_train_encoded, y_train)
preds = logit.predict(X_test_encoded)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1601
           1       0.77      0.77      0.77      1593

    accuracy                           0.77      3194
   macro avg       0.77      0.77      0.77      3194
weighted avg       0.77      0.77      0.77      3194



In [83]:
from datasets import Dataset


In [104]:
pd.concat([X_train, y_train], axis=1).to_parquet("../data/processed/X_train.parquet")
pd.concat([X_test, y_test], axis=1).to_parquet("../data/processed/X_test.parquet")

In [105]:
dataset = load_dataset("parquet", data_files={"train": "../data/processed/X_train.parquet", "test": "../data/processed/X_test.parquet"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [106]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/12772 [00:00<?, ? examples/s]

Map:   0%|          | 0/3194 [00:00<?, ? examples/s]

In [107]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [108]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [109]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
)

# Define the Trainer
trainer = Trainer(
    model=model,                         # The instantiated 🤗 Transformers model to be trained
    args=training_args,                  # Training arguments, defined above
    train_dataset=tokenized_datasets['train'],         # Training dataset
    tokenizer=tokenizer,                 # Tokenizer
)

# Train the model
trainer.train()