## **Catégorisez automatiquement des questions**

### partie 4/8 : Prédiction de tags, approche supervisée + tracking mlflow

#### <br> Notebook d’exploration et de pré-traitement des questions, comprenant une analyse univariée et multivariée, un nettoyage des questions, un feature engineering de type bag of words avec réduction de dimension (du vocabulaire et des tags) 

<br>


## Import librairies


In [1]:
import os, sys, random
import ast
import numpy as np
import pandas as pd
from typing import Tuple
from pandarallel import pandarallel
from pprint import pprint
import json
import requests
from collections import Counter

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from PIL import Image

# NLP
from bs4 import BeautifulSoup
import re, string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

#
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import similarities
from gensim.models.ldamulticore import LdaMulticore

#
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, PredictionErrorDisplay
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsRegressor

print('\nPython version ' + sys.version)
print('pyLDAvis version ' + pyLDAvis.__version__)

# Modify if necessary
num_cores = os.cpu_count()
print(f"\nNumber of CPU cores: {num_cores}")
pandarallel.initialize(progress_bar=False, nb_workers=6)

#
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature, ModelSignature #, Schema, ParamSchema
from mlflow.types import Schema, ParamSchema, ParamSpec, ColSpec

# os.environ['MLFLOW_TRACKING_URI'] = './'

# ! REQUIRES CONSOLE COMMAND : mlflow ui
# Utilisable seulement en local...
mlflow.set_tracking_uri("http://localhost:5000")
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



Python version 3.11.5 (main, Sep 11 2023, 13:23:44) [GCC 11.2.0]
pyLDAvis version 3.4.0

Number of CPU cores: 8
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### functions


In [2]:
def create_mlflow_experiment(
    experiment_name: str, artifact_location: str, tags: dict[str, str]
) -> str:
    """
    Create a new mlflow experiment with the given name and artifact location.

    Parameters:
    ----------
    experiment_name: str
        The name of the experiment to create.
    artifact_location: str
        The artifact location of the experiment to create.
    tags: dict[str,Any]
        The tags of the experiment to create.

    Returns:
    -------
    experiment_id: str
        The id of the created experiment.
    """
    try:
        experiment_id = mlflow.create_experiment(
            name=experiment_name, artifact_location=artifact_location, tags=tags
        )
    except:
        print(f"Experiment {experiment_name} already exists.")
        experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    mlflow.set_experiment(experiment_name=experiment_name)

    return experiment_id


def get_mlflow_experiment(
    experiment_id: str = None, experiment_name: str = None
) -> mlflow.entities.Experiment:
    """
    Retrieve the mlflow experiment with the given id or name.

    Parameters:
    ----------
    experiment_id: str
        The id of the experiment to retrieve.
    experiment_name: str
        The name of the experiment to retrieve.

    Returns:
    -------
    experiment: mlflow.entities.Experiment
        The mlflow experiment with the given id or name.
    """
    if experiment_id is not None:
        experiment = mlflow.get_experiment(experiment_id)
    elif experiment_name is not None:
        experiment = mlflow.get_experiment_by_name(experiment_name)
    else:
        raise ValueError("Either experiment_id or experiment_name must be provided.")

    return experiment


def turn_str_back_into_list(df):
    """Correct the type change due to .csv export"""

    df['title_nltk'] = df['title_nltk'].apply(ast.literal_eval)
    df['body_nltk'] = df['body_nltk'].apply(ast.literal_eval)
    df['title_spacy'] = df['title_spacy'].apply(ast.literal_eval)
    df['body_spacy'] = df['body_spacy'].apply(ast.literal_eval)



In [3]:
# Again, this needs mlfow ui console command first -> unusable on remote server
# all_experiments = client.search_experiments()
# pprint(all_experiments)


### import data


In [4]:
train = pd.read_csv('./../data/cleaned_data/train_bow_uniques.csv', sep=',')
test = pd.read_csv('./../data/cleaned_data/test_bow_uniques.csv', sep=',')

turn_str_back_into_list(train)
turn_str_back_into_list(test)

train.head()


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
0,2019-06-05 15:13:02,How to use memset while handling strings in C++?,I am from Python background and recently learn...,"['c++', 'initialization', 'c-strings', 'string...","[memset, handle, string]","[memset, handle, string, python, background, l...","[use, memset, handle, string]","[background, learn, function, memset, follow, ..."
1,2018-10-31 12:35:02,How to correct spelling in google docs using k...,I would like to be able to replace a misspelle...,"['gmail', 'keyboard-shortcuts', 'google-docs',...","[correct, spell, google, doc, keyboard, shortcut]","[correct, spell, google, doc, shortcut, like, ...","[correct, spelling, keyboard, shortcut]","[like, replace, word, recommend, correction, k..."
2,2020-09-19 10:40:23,live server vscode on another computer,I have 2 computers. when I open the project wi...,"['visual-studio-code', 'server', 'localhost', ...","[server, vscode, computer]","[server, vscode, computer, open, project, give...","[server, vscode, computer]","[computer, open, project, server, url, want, b..."
3,2012-10-23 16:47:04,django ajax post 403 forbidden,using django 1.4 im getting a 403 error when i...,"['javascript', 'ajax', 'django', 'http-post', ...","[django, ajax, forbidden]","[django, ajax, get, error, try, post, javascri...",[forbid],"[django, error, try, post, javascript, server,..."
4,2019-04-21 16:10:24,Listen to changes and reload container on code...,I am using docker-compose in visual studio 201...,"['angular', 'visual-studio', 'docker', 'docker...","[listen, change, reload, container, code, dock...","[listen, change, reload, container, code, dock...","[listen, change, reload, container, code, dock...","[docker, compose, studio, window, run, contain..."


## Classic ML models


### knn


In [6]:
# Often gives good results if enough data
# Accepts basically any input, as long as it is numerical

# => Perfect for testing different embeddings !

# add random state
# add grid search cv
# add other score ? silhouette ? ...

def predict_tags_using_knn(df, feature, target, k=5, alea=42, exemple=None):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = df[target].values

    # Initialize kNN model
    knn_model = KNeighborsRegressor(n_neighbors=k, metric='cosine', algorithm='brute')
    knn_model.fit(dense_matrix, target_values)

    # Example query
    query_document = exemple
    query_bow = gensim_dictionary.doc2bow(query_document)
    query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

    # Find nearest neighbors
    _, indices = knn_model.kneighbors(query_vector)

    # Aggregate tags from neighbors
    neighbor_tags = [df.iloc[i][target] for i in indices.flatten()]

    # Predict tags based on most common tags among neighbors
    predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common()]

    return predicted_tags

exemple = ["your", 'text', 'document']
# Call the function with your DataFrame and the desired text feature and target tags
predicted_tags = predict_tags_using_knn(train, 'title_nltk', 'all_tags', exemple=exemple)
print(predicted_tags)


["['c#', '.net', 'ms-word', 'openxml', 'openxml-sdk']", "['c#', '.net', 'windows', 'f#', 'console']", "['python', 'module', 'preprocessor', 'nlp', 'stemming']", "['python', 'text', 'replace', 'ms-word', 'python-docx']", "['c#', '.net', 'html', 'pdf', 'extract']"]


In [None]:
# Define the plotting function
def plot_performance_vs_neighbors(grid_search):
    # Extract the results from the GridSearchCV object
    results = grid_search.cv_results_

    # Extract the parameters and scores for both uniform and distance weights
    params_uniform = [param for param in results['params'][::2]]
    params_distance = [param for param in results['params'][1::2]]
    test_scores_uniform = results['mean_test_r2'][::2]
    test_scores_distance = results['mean_test_r2'][1::2]

    # Extract the parameter values for uniform and distance weights
    n_neighbors_uniform = [param['knn_regressor__n_neighbors'] for param in params_uniform]
    n_neighbors_distance = [param['knn_regressor__n_neighbors'] for param in params_distance]

    # Create separate plots for uniform and distance weights
    plt.figure(figsize=(12, 6))

    # Plot for uniform weight
    plt.subplot(1, 2, 1)
    plt.plot(n_neighbors_uniform, test_scores_uniform, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Uniform Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    # Plot for distance weight
    plt.subplot(1, 2, 2)
    plt.plot(n_neighbors_distance, test_scores_distance, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Distance Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    plt.tight_layout()
    plt.show()


def preprocessing(df=train, feature='title_nltk', target='all_tags', embedding='bow_dense'):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = df[target].values

    return gensim_dictionary, corpus, target_values


def pipe_knn(df=train, feature='title_nltk', target='all_tags', embedding='bow_dense', metric='cosine', graph=True):

    # Séparation des jeux de données entrainement / validation, preprocessing
    gensim_dictionary, corpus, target_values = preprocessing()

    # Create a KNN Regressor
    knn_regressor = KNeighborsRegressor(metric=metric)

    # Create a pipeline with preprocessing and a knn regressor
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("knn_regressor", knn_regressor)
    ])

    # Define hyperparameters and their possible values for grid search
    param_grid = {
        'knn_regressor__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'knn_regressor__weights': ['uniform', 'distance']
    }

    # Create the GridSearchCV object with multiple scoring metrics
    scoring = {'neg_mean_squared_error': 'neg_mean_squared_error', 'r2': 'r2'}
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                            scoring=scoring, cv=5, refit='r2', verbose=1)

    # Fit the GridSearchCV object to your training data to perform hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Access the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Create the KNN regressor with the best hyperparameters
    best_knn_regressor = KNeighborsRegressor(metric=metric,
                                             n_neighbors=best_params['knn_regressor__n_neighbors'],
                                             weights=best_params['knn_regressor__weights'])

    # Create a pipeline with the preprocessor and the tuned knn regressor
    pipeline_with_tuned_knn = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("knn_regressor", best_knn_regressor)  # Use the tuned neighbor and weight values here
    ])

    # Perform cross-validation (on training set) and display the scores for each split
    scoring = ['r2', 'neg_mean_squared_error']
    cv_scores = cross_validate(pipeline_with_tuned_knn, X_train, y_train, cv=5, scoring=scoring)
    # print("Cross-Validation Scores (training):", '\n', cv_scores)
    print("Cross-Validation Scores:")
    for i, score in enumerate(cv_scores['test_r2']):
        print(f"Split {i+1} : r2 = {score}")

    r2_val = cv_scores['test_r2'].mean()
    mse_val = -cv_scores['test_neg_mean_squared_error'].mean()
    rmse_val = np.sqrt(mse_val)

    # fit model on training set
    time_fit = fit_and_timeit(pipeline_with_tuned_knn, X_train, y_train)
    # Make predictions
    y_pred, time_predict = predict_and_timeit(pipeline_with_tuned_knn, X_test)

    # Calculate scores on training
    r2_train, rmse_train = calcul_scores(y_train, pipeline_with_tuned_knn.predict(X_train))
    # and testing set
    r2_test, rmse_test = calcul_scores(y_test, y_pred)

    # Display results
    print(f"R-squared (val) =  {r2_val}")
    print(f"R-squared (train) =  {r2_train}")
    print(f"R-squared (test) =  {r2_test}")
    print(f"RMSE (val) =  {rmse_val}")
    print(f"RMSE (train) =  {rmse_train}")
    print(f"RMSE (test) =  {rmse_test}" '\n')

    # display results/error as graph on first iteration (if asked to)
    if alea == 0 and graph:
        plot_performance_vs_neighbors(grid_search)
        plot_predictions(r2_train, r2_test, y_pred, y_test, kind='actual_vs_predicted', y=y)
        # plot_predictions(r2_train, r2_test, y_pred, y_test, kind='residual_vs_predicted', y=y)

    # Return scores for this random state
    return r2_val, rmse_val, r2_train, rmse_train, r2_test, rmse_test, time_fit, time_predict


def test_knn_n_times(y=y_E, scaler=robust, graph=False, metric='euclidean'):
    print(f'Modèle : kNN')
    print('target : ', y.name)

    results_r2_val, results_rmse_val,  results_r2_train, results_rmse_train = [], [], [], []
    results_r2_test, results_rmse_test, results_time_fit, results_time_predict = [], [], [], []

    for n in range(nb_iter):
        print('Iteration ', n+1)
        r2_val, rmse_val, r2_train, rmse_train, r2_test, rmse_test, time_fit, time_predict = pipe_knn(alea=n,
                                                                                      y=y,
                                                                                      scaler=scaler,
                                                                                      graph=graph,
                                                                                      metric=metric)
        results_r2_val.append(r2_val)
        results_rmse_val.append(rmse_val)
        results_r2_train.append(r2_train)
        results_rmse_train.append(rmse_train)
        results_r2_test.append(r2_test)
        results_rmse_test.append(rmse_test)
        results_time_fit.append(time_fit)
        results_time_predict.append(time_predict)

    # Calculate means and std devs
    r2_val_moy = np.mean(results_r2_val)
    rmse_val_moy = np.mean(results_rmse_val)
    r2_train_moy = np.mean(results_r2_train)
    rmse_train_moy = np.mean(results_rmse_train)
    r2_test_moy = np.mean(results_r2_test)
    rmse_test_moy = np.mean(results_rmse_test)
    time_fit_moy = np.mean(results_time_fit)
    time_predict_moy = np.mean(results_time_predict)

    r2_val_std = np.std(results_r2_val)
    rmse_val_std = np.std(results_rmse_val)
    r2_train_std = np.std(results_r2_train)
    rmse_train_std = np.std(results_rmse_train)
    r2_test_std = np.std(results_r2_test)
    rmse_test_std = np.std(results_rmse_test)
    time_fit_std = np.std(results_time_fit)
    time_predict_std = np.std(results_time_predict)

    # Mise en forme
    results = {'model': 'kNN',
               'set': dataset,
               'scaler': scaler,
               'target': y.name,
               'r2_test_moy': r2_test_moy,
               'r2_test_std': r2_test_std,
               'rmse_test_moy': rmse_test_moy,
               'rmse_test_std': rmse_test_std,
               'r2_train_moy': r2_train_moy,
               'r2_train_std': r2_train_std,
               'rmse_train_moy': rmse_train_moy,
               'rmse_train_std': rmse_train_std,
               'r2_val_moy': r2_val_moy,
               'r2_val_std': r2_val_std,
               'rmse_val_moy': rmse_val_moy,
               'rmse_val_std': rmse_val_std,
               'time_fit_moy': time_fit_moy,
               'time_fit_std': time_fit_std,
               'time_predict_moy': time_predict_moy,
               'time_predict_std': time_predict_std,
               }

    print(results, '\n')

    # Append a new row for this model
    model_results.append(results)

test_knn_n_times(scaler=robust, graph=True)
# test_knn_n_times(y=y_EI, scaler=robust, graph=True)

affichage_results()

# 0.4, ce qui est bien, mais pas top.
# dataset trop petit pour un knn ? (relativement peu d'individus)


In [None]:

def suggest_topics_using_knn(df, feature, alea=42):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = gensim_dictionary[0]  # This is only to "load" the dictionary.
    id2word = gensim_dictionary.id2token

    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every, random_state=alea)

    top_topics = model.top_topics(corpus, topn=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    # = umass if same topn (default 20)
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    # Compute Coherence Score (Umass)
    coherence_umass = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='u_mass')
    coherence_lda_umass = coherence_umass.get_coherence()
    print('u_mass Coherence Score: %.4f.' % coherence_lda_umass)

    # Compute Coherence Score (cv)
    coherence_cv = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_v')
    coherence_lda_cv = coherence_cv.get_coherence()
    print('c_v Coherence Score: %.4f.' % coherence_lda_cv)

    # Compute Coherence Score (npmi)
    coherence_npmi = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_npmi')
    coherence_lda_npmi = coherence_npmi.get_coherence()
    print('c_npmi Coherence Score: %.4f.' % coherence_lda_npmi)

    # Perplexity is not a coherence score but a measure of how well the model predicts a sample.
    # A lower perplexity indicates better model performance.
    perplexity = model.log_perplexity(corpus)
    print('Perplexity: %.4f.' % perplexity)

    # Visualize the topics
    vis_data = gensimvis.prepare(model, corpus, gensim_dictionary)
    display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'artifacts/lda_vis.html')

    pprint(top_topics)
    # to print all topics
    # pprint(model.print_topics())

    return model, corpus, gensim_dictionary

lda_test, corpus_test, dict_test = suggest_topics_using_LDA(train, 'title_nltk')

In [None]:
# Define the plotting function
def plot_performance_vs_neighbors(grid_search):
    # Extract the results from the GridSearchCV object
    results = grid_search.cv_results_

    # Extract the parameters and scores for both uniform and distance weights
    params_uniform = [param for param in results['params'][::2]]
    params_distance = [param for param in results['params'][1::2]]
    test_scores_uniform = results['mean_test_r2'][::2]
    test_scores_distance = results['mean_test_r2'][1::2]

    # Extract the parameter values for uniform and distance weights
    n_neighbors_uniform = [param['knn_regressor__n_neighbors'] for param in params_uniform]
    n_neighbors_distance = [param['knn_regressor__n_neighbors'] for param in params_distance]

    # Create separate plots for uniform and distance weights
    plt.figure(figsize=(12, 6))

    # Plot for uniform weight
    plt.subplot(1, 2, 1)
    plt.plot(n_neighbors_uniform, test_scores_uniform, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Uniform Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    # Plot for distance weight
    plt.subplot(1, 2, 2)
    plt.plot(n_neighbors_distance, test_scores_distance, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Distance Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    plt.tight_layout()
    plt.show()


def pipe_knn(alea, y, scaler, graph, metric):

    # Séparation des jeux de données entrainement / validation, preprocessing
    X_train, X_test, y_train, y_test, preprocessor = preprocessing(y, alea=alea, test_size=test_size, \
                                                                    scaler=scaler)
    # Create a KNN Regressor
    knn_regressor = KNeighborsRegressor(metric=metric)

    # Create a pipeline with preprocessing and a knn regressor
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("knn_regressor", knn_regressor)
    ])

    # Define hyperparameters and their possible values for grid search
    param_grid = {
        'knn_regressor__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'knn_regressor__weights': ['uniform', 'distance']
    }

    # Create the GridSearchCV object with multiple scoring metrics
    scoring = {'neg_mean_squared_error': 'neg_mean_squared_error', 'r2': 'r2'}
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                            scoring=scoring, cv=5, refit='r2', verbose=1)

    # Fit the GridSearchCV object to your training data to perform hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Access the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Create the KNN regressor with the best hyperparameters
    best_knn_regressor = KNeighborsRegressor(metric=metric,
                                             n_neighbors=best_params['knn_regressor__n_neighbors'],
                                             weights=best_params['knn_regressor__weights'])

    # Create a pipeline with the preprocessor and the tuned knn regressor
    pipeline_with_tuned_knn = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("knn_regressor", best_knn_regressor)  # Use the tuned neighbor and weight values here
    ])

    # Perform cross-validation (on training set) and display the scores for each split
    scoring = ['r2', 'neg_mean_squared_error']
    cv_scores = cross_validate(pipeline_with_tuned_knn, X_train, y_train, cv=5, scoring=scoring)
    # print("Cross-Validation Scores (training):", '\n', cv_scores)
    print("Cross-Validation Scores:")
    for i, score in enumerate(cv_scores['test_r2']):
        print(f"Split {i+1} : r2 = {score}")

    r2_val = cv_scores['test_r2'].mean()
    mse_val = -cv_scores['test_neg_mean_squared_error'].mean()
    rmse_val = np.sqrt(mse_val)

    # fit model on training set
    time_fit = fit_and_timeit(pipeline_with_tuned_knn, X_train, y_train)
    # Make predictions
    y_pred, time_predict = predict_and_timeit(pipeline_with_tuned_knn, X_test)

    # Calculate scores on training
    r2_train, rmse_train = calcul_scores(y_train, pipeline_with_tuned_knn.predict(X_train))
    # and testing set
    r2_test, rmse_test = calcul_scores(y_test, y_pred)

    # Display results
    print(f"R-squared (val) =  {r2_val}")
    print(f"R-squared (train) =  {r2_train}")
    print(f"R-squared (test) =  {r2_test}")
    print(f"RMSE (val) =  {rmse_val}")
    print(f"RMSE (train) =  {rmse_train}")
    print(f"RMSE (test) =  {rmse_test}" '\n')

    # display results/error as graph on first iteration (if asked to)
    if alea == 0 and graph:
        plot_performance_vs_neighbors(grid_search)
        plot_predictions(r2_train, r2_test, y_pred, y_test, kind='actual_vs_predicted', y=y)
        # plot_predictions(r2_train, r2_test, y_pred, y_test, kind='residual_vs_predicted', y=y)

    # Return scores for this random state
    return r2_val, rmse_val, r2_train, rmse_train, r2_test, rmse_test, time_fit, time_predict


## Deep learning Models
