## librairies


In [None]:
# from common_imports import * ### ###

import os, sys, random
import ast
import numpy as np
import pandas as pd
from typing import Tuple
from pandarallel import pandarallel
from pprint import pprint
import json
import requests
from collections import Counter

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from PIL import Image

# NLP
from bs4 import BeautifulSoup
import re, string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

#
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import similarities
from gensim.models.ldamulticore import LdaMulticore

#
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import make_scorer, PredictionErrorDisplay, r2_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor


# Report the interpreter and pyLDAvis versions used for this notebook run.
print('\nPython version ' + sys.version)
print('pyLDAvis version ' + pyLDAvis.__version__)

# Modify if necessary
# NOTE(review): num_cores is only printed; the worker count handed to
# pandarallel below is the hard-coded value 6, not num_cores.
num_cores = os.cpu_count()
print(f"\nNumber of CPU cores: {num_cores}")
pandarallel.initialize(progress_bar=False, nb_workers=6)

#
import mlflow
from mlflow import MlflowClient
import mlflow.pyfunc
from mlflow.models.signature import infer_signature, ModelSignature #, Schema, ParamSchema
from mlflow.types import Schema, ParamSchema, ParamSpec, ColSpec

# os.environ['MLFLOW_TRACKING_URI'] = './'

# ! REQUIRES CONSOLE COMMAND : mlflow ui
# (run from the notebooks folder)
# at least once, to create the mlruns folder

# Only usable locally...
# The fluent API (mlflow.*) and the explicit client are both pointed at a
# tracking server on port 5000 of the local machine.
mlflow.set_tracking_uri("http://localhost:5000")
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")


### fonctions


In [None]:
def create_mlflow_experiment(
    experiment_name: str, artifact_location: str, tags: dict[str, str]
) -> str:
    """
    Create a new mlflow experiment, or reuse the existing one with that name.

    In both cases the experiment is then set as the active experiment.

    Parameters:
    ----------
    experiment_name: str
        The name of the experiment to create.
    artifact_location: str
        The artifact location of the experiment to create.
    tags: dict[str,str]
        The tags of the experiment to create.

    Returns:
    -------
    experiment_id: str
        The id of the created (or already existing) experiment.

    Raises:
    ------
    mlflow.exceptions.MlflowException
        If the creation fails and no experiment with this name exists
        (i.e. the failure was not an "already exists" conflict).
    """
    try:
        experiment_id = mlflow.create_experiment(
            name=experiment_name, artifact_location=artifact_location, tags=tags
        )
    except mlflow.exceptions.MlflowException:
        # Only treat the failure as "already exists" when the lookup confirms
        # it; any other tracking error is re-raised instead of being hidden
        # behind a misleading message.
        existing = mlflow.get_experiment_by_name(experiment_name)
        if existing is None:
            raise
        print(f"Experiment {experiment_name} already exists.")
        experiment_id = existing.experiment_id

    mlflow.set_experiment(experiment_name=experiment_name)

    return experiment_id


def get_mlflow_experiment(
    experiment_id: str = None, experiment_name: str = None
) -> mlflow.entities.Experiment:
    """
    Look up an mlflow experiment by id or, failing that, by name.

    Parameters:
    ----------
    experiment_id: str
        Id of the experiment to fetch. Takes precedence over the name
        when both are supplied.
    experiment_name: str
        Name of the experiment to fetch; only used when no id is given.

    Returns:
    -------
    experiment: mlflow.entities.Experiment
        Whatever the corresponding mlflow lookup call returns.

    Raises:
    ------
    ValueError
        When neither an id nor a name is supplied.
    """
    # Guard-clause form: the first identifier that is provided wins.
    if experiment_id is not None:
        return mlflow.get_experiment(experiment_id)

    if experiment_name is not None:
        return mlflow.get_experiment_by_name(experiment_name)

    raise ValueError("Either experiment_id or experiment_name must be provided.")


def turn_str_back_into_list(df):
    """Undo the list -> str conversion caused by the .csv export.

    Each of the token/tag columns is parsed back with ``ast.literal_eval``.
    The DataFrame is modified in place; nothing is returned.
    """
    # Same columns, processed in the same order as before.
    list_columns = ('title_nltk', 'body_nltk', 'title_spacy', 'body_spacy', 'all_tags')

    for column in list_columns:
        df[column] = df[column].apply(ast.literal_eval)


def token_list_into_bow(X):
    """Turn an array of token lists into a dense bag-of-words matrix.

    Parameters
    ----------
    X : object exposing ``.tolist()``
        One token list per document.

    Returns
    -------
    tuple
        ``(dictionary, matrix)``: the gensim ``Dictionary`` built from the
        documents, and the transposed ``corpus2dense`` output (one row per
        document, one column per dictionary term).
    """
    token_lists = X.tolist()

    vocabulary = Dictionary(token_lists)
    bow_corpus = list(map(vocabulary.doc2bow, token_lists))

    # corpus2dense yields terms x documents; transpose to documents x terms
    dense_terms_by_docs = corpus2dense(bow_corpus, num_terms=len(vocabulary))

    return vocabulary, dense_terms_by_docs.T


def preprocess_text(text):
    """Clean a raw text and reduce it to a list of unique noun/verb lemmas.

    Steps: strip HTML markup, lowercase, tokenize, lemmatize verbs and nouns
    (every other part of speech is dropped), remove the forbidden words
    listed in ``forbidden_words.txt``, remove duplicates (first occurrence
    wins) and drop tokens of 2 characters or fewer.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML.

    Returns
    -------
    list of str
        The kept tokens, in order of first appearance. An empty list is
        returned when tokenization fails.
    """
    # Cleaning
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = text.lower().strip()

    # Tokenization
    try:
        tokens = nltk.word_tokenize(text)
        tokenizer = nltk.RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(" ".join(tokens))  # Apply RegexpTokenizer to the entire list

        # Remove punctuation (make sure, RegexpTokenizer should have done it already)
        tokens = [token for token in tokens if token not in string.punctuation]

    except Exception as e:
        print(f"Error in tokenization: {e}")
        return []

    # Lemmatization
    lemmatizer = WordNetLemmatizer()

    # Get part of speech for each token
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = []

    for token, pos_tag in pos_tags:
        # ! Uncommenting next line may crash the cell
        # print(f"Token: {token}, POS Tag: {pos_tag}")
        if pos_tag.startswith('V'):
            # Verbs are kept
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos='v'))
            # Returns the input word unchanged if it cannot be found in WordNet.
        elif pos_tag.startswith('N'):
            # Nouns are kept
            try:
                lemmatized_tokens.append(lemmatizer.lemmatize(token, pos='n'))
            except Exception as e:
                print(f"Error lemmatizing noun {token}: {e}")
        # Any other part of speech is dropped

    # Read forbidden words (stopwords, too frequent, too rare) from the file.
    # A set gives O(1) membership tests instead of scanning a list per token.
    with open('./../0_data/cleaned_data/forbidden_words.txt', 'r') as file:
        forbidden = {line.strip() for line in file}

    filtered_list = [token for token in lemmatized_tokens if token not in forbidden]

    # Keep uniques (dict.fromkeys preserves first-seen order), then drop
    # tokens that are too short to be informative.
    unique_tokens = [token for token in dict.fromkeys(filtered_list) if len(token) > 2]

    return unique_tokens


### import


In [None]:
# Again, this needs the `mlflow ui` console command first -> unusable on a remote server
# all_experiments = client.search_experiments()
# pprint(all_experiments)

# Load the train / test splits from their CSV exports.
train = pd.read_csv('./../0_data/cleaned_data/train_bow_uniques.csv', sep=',')
test = pd.read_csv('./../0_data/cleaned_data/test_bow_uniques.csv', sep=',')

# The list-valued columns come back from CSV as strings; parse them back
# into Python lists (both frames are modified in place).
turn_str_back_into_list(train)
turn_str_back_into_list(test)


### scores


In [None]:
# add scorers : precision, r2, jaccard

def precision_topics(real_tags:list, predicted_tags:list): # to compare 2 lists
    """Return the precision of one list of predicted tags.

    precision = TP / (TP + FP) = TP / len(predicted_tags), where a predicted
    tag is a true positive when it appears in ``real_tags``.

    Parameters
    ----------
    real_tags : list
        The reference tags.
    predicted_tags : list
        The predicted tags.

    Returns
    -------
    float
        Precision in [0, 1]. By convention 0.0 when ``predicted_tags`` is
        empty (no prediction can be correct), instead of dividing by zero.
    """
    n_predicted = len(predicted_tags)
    if n_predicted == 0:
        return 0.0

    tp = sum(1 for predicted_tag in predicted_tags if predicted_tag in real_tags)

    return tp / n_predicted

def precision_score(y_true, y_pred): # to compare 2 collections of tag lists of equal length
    """Return the mean per-sample precision of ``y_pred`` against ``y_true``.

    Samples are paired by integer index ``i`` in ``range(len(y_pred))`` and
    each pair is scored with ``precision_topics``.

    Parameters
    ----------
    y_true : indexable
        Reference tag lists, one per sample.
    y_pred : indexable
        Predicted tag lists, one per sample.

    Returns
    -------
    float
        The average precision over the samples; 0.0 by convention when
        ``y_pred`` is empty (instead of dividing by zero).
    """
    n_samples = len(y_pred)
    if n_samples == 0:
        return 0.0

    total = sum(precision_topics(y_true[i], y_pred[i]) for i in range(n_samples))

    return total / n_samples

# pour la gridsearchcv
# custom_precision_scorer = make_scorer(precision_score, greater_is_better=True)


def jaccard_similarity(topic1, topic2):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two collections.

    Both inputs are converted to sets first, so duplicates are ignored.

    Parameters
    ----------
    topic1, topic2 : iterable of hashable
        The two collections to compare.

    Returns
    -------
    float
        Similarity in [0, 1]. Two empty collections are treated as
        identical and score 1.0 (usual convention), which also avoids a
        division by zero.
    """
    set1 = set(topic1)
    set2 = set(topic2)

    union = set1 | set2
    if not union:
        return 1.0

    return len(set1 & set2) / len(union)

def jaccard_score(y_true, y_pred): # to compare 2 collections of tag lists of equal length
    """Return the mean per-sample Jaccard similarity of ``y_pred`` vs ``y_true``.

    Samples are paired by integer index ``i`` in ``range(len(y_pred))`` and
    each pair is scored with ``jaccard_similarity``.

    Parameters
    ----------
    y_true : indexable
        Reference tag lists, one per sample.
    y_pred : indexable
        Predicted tag lists, one per sample.

    Returns
    -------
    float
        The average similarity over the samples; 0.0 by convention when
        ``y_pred`` is empty (instead of dividing by zero).
    """
    n_samples = len(y_pred)
    if n_samples == 0:
        return 0.0

    total = sum(jaccard_similarity(y_true[i], y_pred[i]) for i in range(n_samples))

    return total / n_samples

# pour la gridsearchcv
# custom_jacc_scorer = make_scorer(jaccard_score, greater_is_better=True)


## custom model


### test


In [None]:
class AddN(mlflow.pyfunc.PythonModel):
    """
    Toy custom model: shifts every column of the input DataFrame by `n`.

    Attributes:
    -----------
    n : int
        The offset added to each input column.
    """

    def __init__(self, n):
        """
        Store the offset used by `predict`.

        Parameters:
        -----------
        n : int
            The offset added to each input column.
        """
        self.n = n

    def predict(self, context, model_input, params=None):
        """
        Add `n` to every column of `model_input`.

        Parameters:
        -----------
        context : Any
            Unused here.

        model_input : pd.DataFrame
            The DataFrame whose columns are shifted.

        params : dict, optional
            Unused here.

        Returns:
        --------
        pd.DataFrame
            The result of applying ``column + n`` to each column.
        """
        offset = self.n

        def shift(column):
            return column + offset

        return model_input.apply(shift)


# Quick manual check of the AddN example model.
# Instantiate the model with a specific value of n
add_n_model = AddN(n=5)

# Create a sample input DataFrame
input_data = pd.DataFrame({
    'column1': [1, 2, 3],
    'column2': [4, 5, 6]
})

# Make predictions using the predict method
# (context and params are ignored by AddN.predict, hence None)
predictions = add_n_model.predict(context=None, model_input=input_data, params=None)

# Display the predictions
print(predictions)


### knn


In [None]:
class SpecialKnn(mlflow.pyfunc.PythonModel):
    """KNN-based tag suggester wrapped as an mlflow pyfunc model.

    Token lists are turned into bag-of-words vectors, a
    ``KNeighborsRegressor`` is fitted on them, and tags are suggested for a
    new text by taking the most common tags among its nearest training
    documents.

    Attributes:
    -----------
    k : int
        Number of neighbors (mirrors the regressor's ``n_neighbors``).
    n : int
        Number of tags returned by ``predict_tokens``.
    knn : KNeighborsRegressor
        The underlying estimator.
    dict_X, dict_y : gensim Dictionary
        Vocabularies of the feature and target columns, set by ``fit``.
    train_targets : list or None
        Target values of the training rows, in row order, set by ``fit``.
    """

    def __init__(self, k, n=5):
        """
        Constructor method.

        Parameters:
        -----------
        k : int
            Number of neighbors used by the KNN.
        n : int, default 5
            Number of tags to predict.
        """
        self.k = k  # number of neighbors, shortcut for the .n_neighbors attribute
        self.n = n  # number of predicted tags
        self.knn = KNeighborsRegressor(n_neighbors=k)
        self.dict_X = Dictionary()
        self.dict_y = Dictionary()
        self.train_targets = None

    def load_context(self, context):
        """No-op for now.

        Intended for later use, to quickly reload "big" models that were
        already trained.
        """
        pass

    def fit(self, train_df, feature, target):
        """Fit the KNN on bag-of-words encodings of two DataFrame columns.

        Parameters:
        -----------
        train_df : pd.DataFrame
            Training data.
        feature : str
            Name of the column holding the input token lists.
        target : str
            Name of the column holding the tag lists.
        """
        X_train = train_df[feature].values
        y_train = train_df[target].values

        self.dict_X, X_bow_matrix = token_list_into_bow(X_train)
        self.dict_y, y_bow_matrix = token_list_into_bow(y_train)

        # Remember the tags of each training row: kneighbors() returns
        # positional indices into exactly this data, so predictions do not
        # need to rely on a module-level DataFrame.
        self.train_targets = list(y_train)

        # Fit the KNN regressor
        self.knn.fit(X_bow_matrix, y_bow_matrix)

    def predict_tokens(self, input_text, train_df=None, target='all_tags'):
        """Suggest the ``n`` most common tags among the nearest neighbors.

        Parameters:
        -----------
        input_text : str
            Raw text of the query; it is run through ``preprocess_text``.
        train_df : pd.DataFrame, optional
            Frame in which neighbor tags are looked up by position. When
            omitted, the targets stored by ``fit`` are used. If given, it
            must be row-aligned with the data passed to ``fit``.
        target : str, default 'all_tags'
            Tag column of ``train_df``; only used when ``train_df`` is given.

        Returns:
        --------
        list
            At most ``n`` tags, most frequent first.
        """
        query_tokens = preprocess_text(input_text)
        print(query_tokens)
        query_bow = self.dict_X.doc2bow(query_tokens)
        # One row, as many columns as terms in the feature vocabulary
        query_vector = corpus2dense([query_bow], num_terms=len(self.dict_X)).T

        # Find nearest neighbors
        _, indices = self.knn.kneighbors(query_vector)
        neighbor_rows = indices.flatten()

        # Aggregate tags from neighbors
        if train_df is None:
            neighbor_tags = [tag for i in neighbor_rows for tag in self.train_targets[i]]
        else:
            neighbor_tags = [tag for i in neighbor_rows for tag in train_df.iloc[i][target]]

        # Predict tags based on most common tags among neighbors
        # (questions carry about 5 tags on average, but more can be suggested)
        predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(self.n)]

        return predicted_tags

    # scores



# Instantiate the model with k=50 neighbors (n keeps its default value)
my_knn = SpecialKnn(k=50)

# Fit using the 'title_nltk' column as feature and 'all_tags' as target
my_knn.fit(train_df=train, feature='title_nltk', target='all_tags')


In [None]:
# Earlier manual checks, kept for reference:
# my_knn.predict_tokens('javascript')
# my_knn.predict_tokens('python')

# print(str(test['title_nltk'][0]))
# my_knn.predict_tokens(str(test['title_nltk'][0]))

# Manual check of the fitted model on one free-text query
my_knn.predict_tokens('find class com google firebase provider')

# perfect


In [None]:
# test sans le preprocessor
# obligé de transformer la target mm si on ne s'sn sert pas vraiment, car grid_search.fit()
# n'accepte que des valeurs numériques.
# du coup on peut utiliser des metriques classiques pour le score (ici r2),
# mais ca n'a aucun sens metier interpretable

# ici convertir les tags en bag of words ou les one hot encoder revient exactement au meme, donc
# autant utiliser le bow, on a deja le transformer.

# ca prend trop de ressources ! Il est tps d'utiliser les nested runs de mlflow

def pipe_knn(train_df=train, feature='title_nltk', target='all_tags', test_df=test, input=('',)):
    """Tune a KNN regressor on bag-of-words data and predict for ``input``.

    The feature and target columns are both encoded as bag-of-words
    matrices (GridSearchCV only accepts numeric targets), a grid search is
    run, the tuned model is cross-validated and its per-split r2 scores are
    printed. The tuned model is then fitted on the whole training data and
    used to predict the target bag-of-words rows of ``input``.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training data.
    feature : str
        Column of ``train_df`` holding the input token lists.
    target : str
        Column of ``train_df`` holding the tag lists.
    test_df : pd.DataFrame
        Currently unused.
    input : str or iterable
        Queries to predict for. A raw string is run through
        ``preprocess_text``; any other element is taken as an already
        tokenized list. A single string is treated as one query.

    Returns
    -------
    numpy.ndarray
        One row per query, one column per term of the target vocabulary.
    """
    # Load the training data and labels
    X_train = train_df[feature].values
    y_train = train_df[target].values

    # token_list_into_bow returns (dictionary, matrix): only the matrices go
    # to scikit-learn, the feature dictionary is kept to encode the queries.
    dict_X, X_bow_matrix = token_list_into_bow(X_train)
    _, y_bow_matrix = token_list_into_bow(y_train)

    # Create a KNN Regressor
    knn_regressor = KNeighborsRegressor()

    # Single-step pipeline, to simplify the grid search
    pipe = Pipeline(steps=[
        ("knn_regressor", knn_regressor)
    ])

    # Define hyperparameters and their possible values for grid search
    param_grid = {
        'knn_regressor__n_neighbors': [1],
        'knn_regressor__weights': ['uniform'] # , 'distance'
    }

    # Create the GridSearchCV object
    # scoring = {'neg_mean_squared_error': 'neg_mean_squared_error', 'r2': 'r2'}
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                            scoring='r2', cv=5, verbose=1) # add, refit='precision' for multiple scoring

    # Fit the GridSearchCV object to the training data to perform hyperparameter tuning
    grid_search.fit(X_bow_matrix, y_bow_matrix)

    # Access the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Create the KNN regressor with the best hyperparameters
    best_knn_regressor = KNeighborsRegressor(# metric=metric,
                                             n_neighbors=best_params['knn_regressor__n_neighbors'],
                                             weights=best_params['knn_regressor__weights'])

    # Create a pipeline with the tuned knn regressor
    pipeline_with_tuned_knn = Pipeline(steps=[
        ("knn_regressor", best_knn_regressor)  # Use the tuned neighbor and weight values here
    ])

    # Perform cross-validation (on training set) and display the scores for each split
    # scoring = ['r2', 'neg_mean_squared_error']
    cv_scores = cross_validate(pipeline_with_tuned_knn, X_bow_matrix, y_bow_matrix, cv=5, scoring='r2')
    print("Cross-Validation Scores:")
    pprint(cv_scores)
    for i, score in enumerate(cv_scores['test_score']):
        # The scorer is r2, so label it as such
        print(f"Split {i+1} : r2 = {score}")

    # cross_validate only fits clones: fit the tuned pipeline on the full
    # training data before predicting with it.
    pipeline_with_tuned_knn.fit(X_bow_matrix, y_bow_matrix)

    # Encode the queries with the training vocabulary
    raw_queries = [input] if isinstance(input, str) else input
    query_tokens = [
        preprocess_text(query) if isinstance(query, str) else list(query)
        for query in raw_queries
    ]
    if not query_tokens:
        # corpus2dense cannot stack an empty corpus
        return np.empty((0, y_bow_matrix.shape[1]))

    query_corpus = [dict_X.doc2bow(tokens) for tokens in query_tokens]
    query_matrix = corpus2dense(query_corpus, num_terms=len(dict_X)).T

    return pipeline_with_tuned_knn.predict(query_matrix)


# pipe_knn()


## mlflow runs


I was wondering: isn't optimising the number-of-neighbors hyperparameter of a KNN via GridSearchCV a bad idea? The grid search trains the models on folds, but folds are smaller subsets of the data, and the optimal number of neighbors for a KNN depends on the size of the training dataset.
ChatGPT
You raise a valid point. When optimizing hyperparameters for a k-Nearest Neighbors (KNN) model using cross-validation, there are considerations regarding the choice of the number of neighbors (n_neighbors) and the impact of the size of the training dataset on this hyperparameter.

In KNN, the optimal number of neighbors can indeed depend on the characteristics of the training data. If you have a small training dataset, a small number of neighbors might be sufficient, while a larger dataset might benefit from a larger number of neighbors.


In [None]:
# tester hyperopt ?


In [None]:
# Create the experiment used for the KNN runs below, or reuse it if it
# already exists, and keep its id for start_run().
experiment_id = create_mlflow_experiment(
    experiment_name="knn_optimisation_neighbors",
    artifact_location="./artifacts",
    tags={"modele": "knn", "feature": "title", 'nlp': 'nltk'},
)


In [None]:
experiment = get_mlflow_experiment(experiment_id=experiment_id)
print(f"Name: {experiment.name}")

with mlflow.start_run(run_name="testing", experiment_id=experiment_id) as run:

    # log model using autolog crashes the notebook
    # mlflow.autolog()
    # mlflow.sklearn.autolog()

    # Small subset of the training data to limit memory usage
    pipe_knn(train_df=train[:100])

    # log model
    # SpecialKnn is an mlflow.pyfunc.PythonModel (not a scikit-learn
    # estimator) and its constructor requires k, so the fitted instance is
    # logged with the pyfunc flavor.
    # NOTE(review): SpecialKnn defines no predict() method, so the logged
    # model can be stored but presumably not served as-is; confirm.
    mlflow.pyfunc.log_model(artifact_path="knn", python_model=my_knn)

    # print run info
    print(f"run_id: {run.info.run_id}")
    print(f"experiment_id: {run.info.experiment_id}")
    print(f"Artifact Location: {experiment.artifact_location}")
    print(f"status: {run.info.status}")
    print(f"start_time: {run.info.start_time}")
    print(f"end_time: {run.info.end_time}")
    # print("lifecycle_stage: {}".format(run.info.lifecycle_stage)) # deprecated, use alias or tags


# J'esperais qu'mlflow allait nous permettre de contourner le probleme de
# l'entrainement du modele, qui demande bcp d'espace memoire.
# probleme : mm sans l'ui, le tracking/logging mlflow consomment enormement !
# la solution a l'air pire que le probleme...
