## **Catégorisez automatiquement des questions**

### partie 4/8 : Prédiction de tags, approche supervisée + tracking mlflow

#### <br> Notebook d’exploration et de pré-traitement des questions, comprenant une analyse univariée et multivariée, un nettoyage des questions, un feature engineering de type bag of words avec réduction de dimension (du vocabulaire et des tags) 

<br>


## 1 Import


### 1.1 Librairies


In [26]:
import os, sys, random
import ast
import numpy as np
import pandas as pd
from typing import Tuple
from pandarallel import pandarallel
from pprint import pprint
from collections import Counter

# Visualisation
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image

# NLP
from bs4 import BeautifulSoup
import re, string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# gensim
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import similarities
from gensim.models.ldamulticore import LdaMulticore

# sklearn
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import make_scorer, PredictionErrorDisplay, r2_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

#tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# mlflow
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature, ModelSignature #, Schema, ParamSchema
from mlflow.types import Schema, ParamSchema, ParamSpec, ColSpec

# ! REQUIRES CONSOLE COMMAND : mlflow ui
# depuis dossier notebooks
os.environ['MLFLOW_TRACKING_URI'] = './'

# Utilisable seulement en local...
mlflow.set_tracking_uri("http://localhost:5000")
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")

# verif, reglages
print('\nPython version ' + sys.version)
print('pyLDAvis version ' + pyLDAvis.__version__)

# Modify if necessary
num_cores = os.cpu_count()
print(f"\nNumber of CPU cores: {num_cores}")
pandarallel.initialize(progress_bar=False, nb_workers=6)



Python version 3.11.5 (main, Sep 11 2023, 13:23:44) [GCC 11.2.0]
pyLDAvis version 3.4.0

Number of CPU cores: 8
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### 1.2 Functions


In [27]:
# dataframe manipulation, NLP

def get_missing_values(df):
    """Generates a DataFrame containing the count and proportion of missing values for each feature.

    Args:
        df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
        pandas.DataFrame: A DataFrame with columns for the feature name, count of missing values,
        count of non-missing values, proportion of missing values, and data type for each feature.
    """
    # Count the missing values for each column
    missing = df.isna().sum()

    # Calculate the percentage of missing values
    percent_missing = df.isna().mean() * 100

    # Create a DataFrame to store the results
    missings_df = pd.DataFrame({
        'column_name': df.columns,
        'missing': missing,
        'present': df.shape[0] - missing,  # Count of non-missing values
        'percent_missing': percent_missing.round(2),  # Rounded to 2 decimal places
        'type': df.dtypes
    })

    # Sort the DataFrame by the count of missing values
    missings_df.sort_values('missing', inplace=True)

    return missings_df

# with pd.option_context('display.max_rows', 1000):
#   display(get_missing_values(df))


def quick_look(df, miss=True):
    """
    Display a quick overview of a DataFrame, including shape, head, tail, unique values, and duplicates.

    Args:
        df (pandas.DataFrame): The input DataFrame to inspect.
        check_missing (bool, optional): Whether to check and display missing values (default is True).

    The function provides a summary of the DataFrame, including its shape, the first and last rows, the count of unique values per column, and the number of duplicates.
    If `check_missing` is set to True, it also displays missing value information.
    """
    print(f'shape : {df.shape}')

    display(df.head())
    display(df.tail())

    print('uniques :')
    display(df.nunique())

    print('Doublons ? ', df.duplicated(keep='first').sum(), '\n')

    if miss:
        display(get_missing_values(df))


def preprocess_text(text):
    #Cleaning
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = text.lower().strip()

    # Tokenization
    try:
        tokens = nltk.word_tokenize(text)
        tokenizer = nltk.RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(" ".join(tokens))  # Apply RegexpTokenizer to the entire list

        # Remove punctuation (make sure, RegexpTokenizer should have done it already)
        tokens = [token for token in tokens if token not in string.punctuation]

    except Exception as e:
        print(f"Error in tokenization: {e}")
        return []

    # Lemmatization
    lemmatizer = WordNetLemmatizer()

    # Get part of speech for each token
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = []

    for token, pos_tag in pos_tags:
        # ! Uncommenting next line may crash the cell
        # print(f"Token: {token}, POS Tag: {pos_tag}")
        if pos_tag.startswith('V'):
            # On garde
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos='v'))
            # Returns the input word unchanged if it cannot be found in WordNet.
        elif pos_tag.startswith('N'):
            # On garde
            try:
                lemmatized_tokens.append(lemmatizer.lemmatize(token, pos='n'))
            except Exception as e:
                print(f"Error lemmatizing verb {token}: {e}")
        # Sinon on supprime

    # Read forbidden words (stopwords, too frequent, too rare) from the file
    with open('./forbidden_words.txt', 'r') as file:
        forbidden = [line.strip() for line in file]

    filtered_list = [token for token in lemmatized_tokens if token not in forbidden]

    # keep uniques
    seen_tokens = set()
    unique_tokens = []

    for token in filtered_list:
        if token not in seen_tokens:
            seen_tokens.add(token)
            if len(token) > 2:
                unique_tokens.append(token)

    return unique_tokens


def turn_str_back_into_list(df):
    """Correct the type change due to .csv export"""

    df['title_nltk'] = df['title_nltk'].apply(ast.literal_eval)
    df['body_nltk'] = df['body_nltk'].apply(ast.literal_eval)
    df['title_spacy'] = df['title_spacy'].apply(ast.literal_eval)
    df['body_spacy'] = df['body_spacy'].apply(ast.literal_eval)
    df['all_tags'] = df['all_tags'].apply(ast.literal_eval)



In [28]:
# mlflow

def create_mlflow_experiment(
    experiment_name: str, artifact_location: str, tags: dict[str, str]
) -> str:
    """
    Create a new mlflow experiment with the given name and artifact location.

    Parameters:
    ----------
    experiment_name: str
        The name of the experiment to create.
    artifact_location: str
        The artifact location of the experiment to create.
    tags: dict[str,Any]
        The tags of the experiment to create.

    Returns:
    -------
    experiment_id: str
        The id of the created experiment.
    """
    try:
        experiment_id = mlflow.create_experiment(
            name=experiment_name, artifact_location=artifact_location, tags=tags
        )
    except:
        print(f"Experiment {experiment_name} already exists.")
        experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    mlflow.set_experiment(experiment_name=experiment_name)

    return experiment_id


def get_mlflow_experiment(
    experiment_id: str = None, experiment_name: str = None
) -> mlflow.entities.Experiment:
    """
    Retrieve the mlflow experiment with the given id or name.

    Parameters:
    ----------
    experiment_id: str
        The id of the experiment to retrieve.
    experiment_name: str
        The name of the experiment to retrieve.

    Returns:
    -------
    experiment: mlflow.entities.Experiment
        The mlflow experiment with the given id or name.
    """
    if experiment_id is not None:
        experiment = mlflow.get_experiment(experiment_id)
    elif experiment_name is not None:
        experiment = mlflow.get_experiment_by_name(experiment_name)
    else:
        raise ValueError("Either experiment_id or experiment_name must be provided.")

    return experiment


In [29]:
# ! This needs mlfow ui console command first -> unusable on remote server
# all_experiments = client.search_experiments()
# pprint(all_experiments)


### 1.3 Data


In [30]:
raw_data = pd.read_csv('./../0_data/cleaned_data/bow_classic.csv', sep=',')

turn_str_back_into_list(raw_data)

display(raw_data.tail())

print(raw_data.shape)

feature = 'title_nltk'


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
47660,2011-05-23 22:22:56,How can I send a file document to the printer ...,Here's the basic premise:\nMy user clicks some...,"[c#, winforms, pdf, .net-4.0, printing]","[send, file, document, printer, print]","[send, file, document, printer, print, premise...","[send, file, document, printer, print]","[premise, user, click, file, spit, desktop, wa..."
47661,2011-05-23 21:15:51,CA1014 Mark 'some.dll' with CLSCompliant(true)...,"When I run StyleCop, I got this error message ...","[visual-studio, visual-studio-2010, dll, style...","[mark, dll, error, message, vs2010]","[mark, dll, error, message, vs2010, run, get, ...","[error, message]","[run, error, message, need, mark, dll, set, dl..."
47662,2011-05-23 21:05:59,How to change a text file's name in C++?,"I would like to change a txt file's name, but ...","[c++, algorithm, file, directory, file-rename]","[change, text, file, name, c]","[change, text, file, name, c, like, change, tx...","[change, text, file]","[like, change, txt, file, find, example, want,..."
47663,2011-05-23 20:06:35,php implode (101) with quotes,Imploding a simple array \nwould look like th...,"[php, arrays, string, csv, implode]","[php, quote]","[php, quote, array, look, array, array, lastna...",[quote],"[implode, array, look, array, email, phone, ar..."
47664,2011-05-23 20:00:57,What characters are allowed in a iOS file name?,I'm looking for a way to make sure a string ca...,"[ios, file, filenames, character-encoding, nsf...","[character, allow, file, name]","[character, allow, file, name, look, way, make...","[character, allow, file]","[look, way, string, file, section, code, delet..."


(47665, 8)


### 1.4 Checkpoint : sample + ttsplit


In [31]:
# pour parcourir rapidement la premiere partie du ntbk
quick_df = raw_data[::1]

# ! éviter > 200 (max),
# sinon le testing set devient trop petit pour les prédictions

train_df, test_df = train_test_split(quick_df, test_size=0.1, random_state=42)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

print(train_df.shape)
print(test_df.shape)


(42898, 8)
(4767, 8)


## 2 KNN


In [32]:
# Often gives good results if enough data
# relatively fast to train
# Accepts basically any input, as long as it is numerical

# => Perfect for testing different embeddings !


### 2.1 Dummy knn : il copie sur le + proche voisin


In [33]:
# Notre baseline
# Pour ce premier modèle je voulais quelque chose de simple, pour pouvoir tester les scores.
# Donc on n'utilise pas la méthode .predict() du knn, seulement la méthode .kneighbors(),
# disponible après l'entrainement (=le fit). Enuite on va regarder et return les n tags les + frequents
# parmi les tags des voisins (ici, 1 seul voisin, le + proche).


def predict_tags_using_dummy_knn(df, feature=feature, target='all_tags', k=1, exemple=None):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]
    print(len(corpus))

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T
    print(dense_matrix.shape, '\n')

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = df[target].values

    # Initialize kNN model
    knn_model = KNeighborsRegressor(n_neighbors=k, metric='cosine', algorithm='brute')
    # print(knn_model.n_neighbors)

    knn_model.fit(dense_matrix, target_values)

    # Example query
    query_document = exemple
    query_bow = gensim_dictionary.doc2bow(query_document)
    query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

    # Find nearest neighbors
    _, indices = knn_model.kneighbors(query_vector)

    # Aggregate tags from neighbors
    neighbor_tags = [tag for i in indices.flatten() for tag in df.iloc[i][target]]

    print(neighbor_tags)

    # Predict tags based on most common tags among neighbors
    predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(n=10)]
    # 5 tags/question en moyenne mais on peut suggérer +

    return predicted_tags, knn_model


exemple = ["your", 'text', 'document', 'javascript']
print(exemple, '\n')

predicted_tags, knn_test = predict_tags_using_dummy_knn(df=train_df, exemple=exemple)
print(predicted_tags, '\n')

# javascript ok


['your', 'text', 'document', 'javascript'] 

42898
(42898, 6128) 

['c#', '.net', 'ms-word', 'openxml', 'openxml-sdk']
['c#', '.net', 'ms-word', 'openxml', 'openxml-sdk'] 



### 2.2 knn basic


In [34]:
# Mm principe mais on peut choisir le nombre de voisins (-> paramètre à optimiser)
# add score


def predict_tags_using_knn(df, feature=feature, target='all_tags', k=50, exemple=None):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = df[target].values

    # Initialize kNN model
    knn_model = KNeighborsRegressor(n_neighbors=k, metric='cosine', algorithm='brute')
    knn_model.fit(dense_matrix, target_values)

    # Example query
    query_document = exemple
    query_bow = gensim_dictionary.doc2bow(query_document)
    query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

    # Find nearest neighbors
    _, indices = knn_model.kneighbors(query_vector)

    # Aggregate tags from neighbors
    neighbor_tags = [tag for i in indices.flatten() for tag in df.iloc[i][target]]

    print(neighbor_tags)

    # Predict tags based on most common tags among neighbors
    predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(n=10)]
    # 5 tags/question en moyenne mais on peut suggérer +

    return query_vector, predicted_tags



### 2.3 predictions (input = list)


In [35]:
exemple1 = ["your", 'text', 'document', 'javascript']
print(exemple, '\n')

_, predicted_tags1 = predict_tags_using_knn(df=train_df, exemple=exemple1)
print(predicted_tags1, '\n') # most frequent, sorted

# javascript ok


['your', 'text', 'document', 'javascript'] 

['c#', '.net', 'ms-word', 'openxml', 'openxml-sdk', 'java', 'javascript', 'ajax', 'selenium', 'htmlunit-driver', 'javascript', 'internet-explorer', 'class', 'internet-explorer-8', 'classname', 'javascript', 'jquery', 'jquery-plugins', 'text-to-speech', 'html5-audio', 'javascript', 'html', 'css', 'text', 'truncate', 'javascript', 'html', 'string', 'text', 'extract', 'javascript', 'jquery', 'css', 'dom', 'document', 'javascript', 'dom', 'substring', 'indexof', 'getselection', 'python', 'text', 'replace', 'ms-word', 'python-docx', 'javascript', 'php', 'jquery', 'curl', 'http-headers', 'javascript', 'html', 'function', 'text', 'onclick', 'python', 'module', 'preprocessor', 'nlp', 'stemming', 'android', 'material-design', 'android-support-library', 'android-textinputlayout', 'android-support-design', 'c#', '.net', 'html', 'pdf', 'extract', 'html', 'url', 'browser', 'full-text-search', 'highlight', 'javascript', 'jquery', 'ruby-on-rails', 'tdd', '

In [36]:
exemple2 = ["your", 'text', 'document', 'python']
print(exemple2, '\n')

_, predicted_tags2 = predict_tags_using_knn(df=train_df, exemple=exemple2)
print(predicted_tags2, '\n')

# python


['your', 'text', 'document', 'python'] 

['python', 'module', 'preprocessor', 'nlp', 'stemming', 'c#', '.net', 'ms-word', 'openxml', 'openxml-sdk', 'python-2.7', 'ubuntu', 'python-3.x', 'spatial-index', 'r-tree', 'python-3.x', 'pdf', 'text', 'extract', 'pdfminer', 'python', 'text', 'stemming', 'plural', 'singular', 'python', 'html', 'web-scraping', 'text', 'beautifulsoup', 'python', 'selenium', 'selenium-webdriver', 'xpath', 'webdriverwait', 'python', 'plot', 'tree', 'data-visualization', 'visualization', 'python', 'pdf', 'python-3.7', 'pypdf', 'pdf-extraction', 'python', 'python-3.x', 'annotations', 'lint', 'type-hinting', 'python', 'python-2.7', 'reflection', 'delegation', 'message-passing', 'python', 'documentation', 'python-3.7', 'docstring', 'python-dataclasses', 'python', 'python-3.x', 'algorithm', 'sorting', 'mergesort', 'python', 'parsing', 'text', 'file-io', 'python-2.7', 'python', 'image', 'opencv', 'image-processing', 'computer-vision', 'python', 'utf-8', 'python-unicode', '

In [37]:
exemple = test_df[feature][0]
print(exemple, '\n')

# Call the function with your DataFrame and the desired text feature and target tags
_, predicted_tags = predict_tags_using_knn(df=train_df, exemple=exemple)
print(predicted_tags, '\n')

# firebase peut etre predit, (ou java selon taille datast test)
# grand succes !


['find', 'class', 'com', 'google', 'firebase', 'provider'] 

['java', 'spring', 'rest', 'gradle', 'spring-boot', 'android', 'android-studio', 'firebase', 'android-gradle-plugin', 'google-play-services', 'java', 'android', 'firebase', 'gradle', 'android-gradle-plugin', 'php', 'sql', 'laravel', 'laravel-5', 'laravel-artisan', 'php', 'class', 'laravel', 'alias', 'autoloader', 'android', 'android-intent', 'arraylist', 'unmarshalling', 'parcelable', 'java', 'spring', 'spring-boot', 'spring-security', 'spring-security-oauth2', 'android', 'google-maps', 'dictionary', 'android-mapview', 'inflate', 'android', 'firebase', 'android-studio', 'android-gradle-plugin', 'jcenter', 'ios', 'objective-c', 'xcode', 'storyboard', 'xib', 'javascript', 'node.js', 'typescript', 'nestjs', 'class-validator', 'java', 'jar', 'maven-2', 'manifest', 'program-entry-point', 'javascript', 'node.js', 'angular', 'typescript', 'angular7', 'angular', 'firebase', 'npm', 'angularfire', 'angularfire2', 'c++', 'c++11', 'bit-f

### 2.4 Predictions (input = text)


In [38]:
exemple = test_df['title'][0]
print(exemple)
print(f"real tags: {test_df['all_tags'][0]}", '\n')

exemple_text = preprocess_text(exemple)

_, predicted_tags = predict_tags_using_knn(df=train_df, exemple=exemple_text)
print(predicted_tags, '\n')

# ok


Didn't find class "com.google.firebase.provider.FirebaseInitProvider"
real tags: ['java', 'android', 'android-studio', 'error-handling', 'compiler-errors'] 

['java', 'spring', 'rest', 'gradle', 'spring-boot', 'android', 'android-studio', 'firebase', 'android-gradle-plugin', 'google-play-services', 'java', 'android', 'firebase', 'gradle', 'android-gradle-plugin', 'php', 'sql', 'laravel', 'laravel-5', 'laravel-artisan', 'php', 'class', 'laravel', 'alias', 'autoloader', 'android', 'android-intent', 'arraylist', 'unmarshalling', 'parcelable', 'java', 'spring', 'spring-boot', 'spring-security', 'spring-security-oauth2', 'android', 'google-maps', 'dictionary', 'android-mapview', 'inflate', 'android', 'firebase', 'android-studio', 'android-gradle-plugin', 'jcenter', 'ios', 'objective-c', 'xcode', 'storyboard', 'xib', 'javascript', 'node.js', 'typescript', 'nestjs', 'class-validator', 'java', 'jar', 'maven-2', 'manifest', 'program-entry-point', 'javascript', 'node.js', 'angular', 'typescript'

In [39]:
exemple = test_df['title'][1]
print(exemple)
print(f"real tags: {test_df['all_tags'][1]}", '\n')

exemple_text = preprocess_text(exemple)

_, predicted_tags = predict_tags_using_knn(df=train_df, exemple=exemple_text)
print(predicted_tags, '\n')

# scala ok


Why do I get `java.lang.NoClassDefFoundError: scala/Function1` when I run my code in ScalaIDE?
real tags: ['java', 'scala', 'maven', 'noclassdeffounderror', 'scala-ide'] 



  text = BeautifulSoup(text, 'html.parser').get_text()


['c#', '.net', 'wpf', 'code-behind', 'itemspanel', 'reactjs', 'authentication', 'google-authentication', 'google-api-js-client', 'googleauthr', 'java', 'gradle', 'spring-boot', 'jar', 'build.gradle', 'iphone', 'ios', 'xamarin.ios', 'http-response-codes', 'nsurlconnectiondelegate', 'c++', 'python', 'ctypes', 'cython', 'boost-python', 'php', 'mysql', 'laravel', 'ubuntu', 'server', 'c', 'gcc', 'types', 'openmp', 'typeof', 'scala', 'maven', 'apache-spark', 'noclassdeffounderror', 'spark-streaming', 'c', 'assembly', 'x86', 'x86-64', 'shellcode', 'python', 'selenium', 'http', 'selenium-webdriver', 'ui-automation', 'stack-trace', 'glibc', 'sigabrt', 'segmentation-fault', 'backtrace', 'c++', 'c', 'cuda', 'parallel-processing', 'gpu', 'macos', 'shell', 'scala', 'terminal', 'installation', 'c#', 'sql-server', 'asp.net-mvc', 'entity-framework', 'asp.net-mvc-4', 'java', 'selenium', 'testing', 'selenium-webdriver', 'selenium-chromedriver', 'javascript', 'html', 'ajax', 'url', 'xmlhttprequest', 'sca

### 2.5 scores


In [40]:
# Notre objectif de prédiction de tags ressemble a un pb de classification multi-label,
# où la matrice de confusion est extrêmement déséquilibrée :
# 5 tags sont prédits positifs, contre environ 250 000 tags (si on travaille sur all_tags)
# predits negatifs. Autrement dit :

# On peut utiliser la precision pour évaluer notre modèle. C'est même exactement l'outil qu'il nous faut :
# "précision = la proportion de prédictions correctes parmi les points que l’on a prédits positifs."
# En + c de loin le plus léger en ressources, puisqu'il ne s'occupe que des 5 tags prédits.

# En revanche je pense que le recall aura bcp moins de variance ici, il sera "écrasé" par
# le nombre de tags predits negatifs, sa valeur sera tjs très proche de zero.
# (même remarque pour la spécificité et l'accuracy)
# Et sans recall, pas de f1 score.
# à vérifier ?


#### Precision


In [41]:
# pourquoi on ne peut pas utiliser le score precision sckikit ici :
from sklearn.metrics import precision_score as p_score

# Assuming y_true is the ground truth (real tags) and y_pred is the predicted tags
precision = p_score(['ok', 'ko'], ['ko', 'ok'], average='micro')  # You can use 'micro', 'macro', or 'weighted' depending on your use case
print(f'Precision: {precision}')


Precision: 0.0


In [42]:
# Il nous faut un score qui ne tient pas compte de l'ordre (ds lequel les tags st prédits)
# Je ne savais pas en codant ces fonctions, mais par convention l'ordre des paramètres
# dans un scorer (en tt cas pour sklearn) est plutôt l'inverse : y_true, y_pred
# Corrigé pour précision, qu'on utilisera par la suite.
# Jaccard étant symétrique, pas indispensable / urgent de modifier


def precision_topics(predicted_tags:list, real_tags:list): # pour comparer 2 listes
    if len(predicted_tags) > 0:
        # precision = TP / (TP + FP)
        tp = 0
        for predicted_tag in predicted_tags:
            if predicted_tag in real_tags:
                tp += 1

        fp = len(predicted_tags) - tp
        precision = tp/(tp + fp)
        # <=> precision = tp/len(predicted_tags)

    else : # 0 prediction
        if len(real_tags) > 0: # On n'a pas de prédiction alors quil y avait des tags à prédire
            precision = 0
        else: # Pas de prédiction, mais pas de target
            precision = 1

    return precision


print(precision_topics(predicted_tags1, exemple1))
precision_topics(predicted_tags2, exemple2)

# ok


0.2


0.2

In [43]:
def precision_score(y_pred, y_true): # pour comparer 2 df ou 2 matrices de mm shape[0]
    precision = 0
    # print("Shapes of y_pred and y_true:", y_pred.shape, y_true.shape)  # Add this line for debugging
    for i in range(0, len(y_pred)):
        precision += precision_topics(y_pred[i], y_true[i])
    precision_moyenne = precision / len(y_pred)

    return precision_moyenne


# pour la gridsearchcv
custom_precision_scorer = make_scorer(precision_score, greater_is_better=True)


#### Recall


In [44]:
# La precision nous interesse davantage ici, pour au moins 2 raisons :
# 1) On n'a aucune raison de "pénaliser" le modele pour les faux negatifs
# 2) Ici le recall va etre tres proche de zero, et sa variance sera tres faible
# => bcp - parlant

# Mais au cas où :


def recall_topics(all_tags: list, predicted_tags: list):
    # recall = TP / (TP + FN)
    tp = 0
    fn = 0
    for real_tag in all_tags:
        if real_tag in predicted_tags:
            tp += 1
        else:
            fn += 1
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    return recall


# insister ds la doctype : y_all = all tags ici, != y_true (5-6 tags max)
def recall_score(y_all, y_pred): # pour comparer 2 df ou 2 matrices de mm shape[0]
    recall = 0
    for i in range(0, len(y_pred)):
        recall += recall_topics(y_all, y_pred[i]) # ca risque d'etre long a calculer
    recall_moyen = recall / len(y_pred)

    return recall_moyen


custom_recall_scorer = make_scorer(recall_score, greater_is_better=True)


#### f1 score


In [45]:
# mm rearques que pour le recall


def f1_topics(real_tags: list, predicted_tags: list, all_tags:list):
    precision = precision_score(real_tags, predicted_tags)
    recall = recall_topics(all_tags, predicted_tags)

    # F1 score = 2 * (precision * recall) / (precision + recall)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1


def f1_score(y_true, y_pred, y_all):
    f1_score = 0
    for i in range(0, len(y_pred)):
        score += f1_topics(y_true[i], y_pred[i], y_all)
    score_moyen = score / len(y_pred)

    return score_moyen


custom_f1_scorer = make_scorer(f1_score, greater_is_better=True)


#### Accuracy


In [46]:
# idem, ici la variance sera presque nulle (TP entre 0 et 5, TN = environ 250 000...)


def accuracy_topics(real_tags: list, predicted_tags: list, all_tags:list):
    # accuracy = (TP + TN) / (TP + TN + FP + FN)
    tp = sum(1 for tag in predicted_tags if tag in real_tags)
    tn = sum(1 for tag in all_tags if (tag not in predicted_tags) and (tag not in real_tags) )
    fp = sum(1 for tag in predicted_tags if tag not in real_tags)
    fn = sum(1 for tag in all_tags if (tag not in predicted_tags) and (tag in real_tags))

    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0

    return accuracy


def accuracy_score(y_true, y_pred, y_all):
    score = 0
    for i in range(0, len(y_pred)):
        score += accuracy_topics(y_true[i], y_pred[i], y_all)
    score_moyen = score / len(y_pred)

    return score_moyen


custom_accuracy_scorer = make_scorer(accuracy_score, greater_is_better=True)


#### Jaccard similarity


In [47]:
# utilisé en partie 3 pour évaluer la similarité des topics obtenus / la lda
# ici pour comparer topics reels et topics predits.
# devrait etre correlé a la precision non ?
# st ts les 2 liés au nb de tp (true positifs)

# Encore mieux que la simple précision, car + stable si on prédit un nb différent de topics
# (voir + loin)


def jaccard_similarity(topic1, topic2):
    set1 = set(topic1)
    set2 = set(topic2)

    if len(set1.union(set2)):
        return len(set1.intersection(set2)) / len(set1.union(set2))

    else: # union nulle = 0 prediction, mais rien à prédire non plus...
        return 1


def jaccard_score(y_true, y_pred): # pour comparer 2 df ou 2 matrices de mm shape[0]
    jacc = 0
    for i in range(0, len(y_pred)):
        jacc += jaccard_similarity(y_true[i], y_pred[i])
    jacc_moyen = jacc / len(y_pred)

    return jacc_moyen


# pour la gridsearchcv
custom_jacc_scorer = make_scorer(jaccard_score, greater_is_better=True)


### 2.6 Evaluation


In [48]:
# sur un sample du testing set


def predict_tags_using_knn(train_df=train_df, feature=feature, target='all_tags', test_df=test_df, k=5,
                           n=5):

    # 1 PREPROCESSING
    documents = train_df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    # print(len(gensim_dictionary))
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]
    # print(len(corpus))

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T
    # print(dense_matrix.shape, '\n')

    # pas tres dense ici, c notre bow donc tres sparse en fait
    # curieux d'appeler "corpus2dense()" une fonction qui retourne une matrice sparse
    # pprint(dense_matrix[:10]) # vraiment tres dense, quasiment que des 0 ! Bref
    # print('\n')

    target_values = train_df[target].values

    # 2 MODEL TRAINING
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(dense_matrix, target_values)

    # 3 PREDICTION
    # Predictions completes en 1h ou 2...
    # optimiser avec pandarallel ?
    # use a sample en attendant

    predictions=[]
    min_range=0
    max_range=25 # test.shape[0]=4767
    for i in range(min_range, max_range):
        query_document = test_df[feature][i]
        print(f'doc {i} : {query_document}')
        print(f'real tags : {test_df[target][i]}')
        query_bow = gensim_dictionary.doc2bow(query_document)
        query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

        # Find nearest neighbors
        _, indices = knn_model.kneighbors(query_vector)

        # Aggregate tags from neighbors
        neighbor_tags = [tag for i in indices.flatten() for tag in train_df.iloc[i][target]]

        # Predict tags based on most common tags among neighbors
        predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(n=n)]
        # 5 tags/question en moyenne mais on peut suggérer +
        predictions.append(predicted_tags)
        print(f'predicted : {predicted_tags}')
        print(f'precision : {precision_topics(test_df[target][i], predicted_tags)}')
        print(f'jaccard : {jaccard_similarity(test_df[target][i], predicted_tags)}', '\n')

    true_tags = [tags for tags in test_df[target][min_range:max_range]]

    # eval
    scorer = precision_score
    precision = scorer(true_tags, predictions)

    scorer = jaccard_score
    jaccard = scorer(true_tags, predictions)

    return knn_model, precision, jaccard


_, precision, jaccard = predict_tags_using_knn()

print(f'Precision = {precision}')
print(f'Jaccard similarity = {jaccard}')


# OK
# 0.34 de precision pour 5 voisins ?? c enorme !
# <=> jaccard 0.15 (/body), 0.11(title)


doc 0 : ['find', 'class', 'com', 'google', 'firebase', 'provider']
real tags : ['java', 'android', 'android-studio', 'error-handling', 'compiler-errors']
predicted : ['java', 'spring', 'rest', 'gradle', 'spring-boot']
precision : 0.2
jaccard : 0.1111111111111111 

doc 1 : ['get', 'lang', 'noclassdeffounderror', 'scala', 'run', 'code']
real tags : ['java', 'scala', 'maven', 'noclassdeffounderror', 'scala-ide']
predicted : ['c#', '.net', 'wpf', 'code-behind', 'itemspanel']
precision : 0.0
jaccard : 0.0 

doc 2 : ['django', 'bulk', 'create', 'ignore', 'duplicate']
real tags : ['python', 'mysql', 'django', 'bulkinsert', 'bulk']
predicted : ['django', 'android', 'android-layout', 'material-design', 'android-styles']
precision : 0.2
jaccard : 0.1111111111111111 

doc 3 : ['difference', 'environment', 'anaconda', 'environment']
real tags : ['python', 'pycharm', 'anaconda', 'environment', 'virtual-environment']
predicted : ['python', 'virtualenv', 'anaconda', 'conda', 'virtual-environment']
pr

### 2.7 logging mlfow


In [49]:
# Si erreur, commande console
# mlflow ui
# depuis dossier notebooks

experiment_id = create_mlflow_experiment(
    experiment_name="knn_optimisation",
    artifact_location="./artifacts",
    tags={"model": "knn", "preprocessing": 'bow', "feature": "title_nltk", 'target': 'all_tags'},
)


Experiment knn_optimisation already exists.


In [60]:
experiment = get_mlflow_experiment(experiment_id=experiment_id)
print("Name: {}".format(experiment.name))

# Define run_id variable before the with block
run_id = []

with mlflow.start_run(run_name="testing", experiment_id=experiment_id) as run:
    parameters = {
        "preprocessing": 'bow',
        "feature": 'title_nltk',
        "k neighbors": 10,
        "target": 'all_tags'
    }
    mlflow.log_params(parameters)

    knn_ref, precision_ref, jaccard_ref = predict_tags_using_knn(k=19)

    metrics = {
            "precision": precision_ref,
            "jaccard": jaccard_ref
        }

    # multiple metrics
    mlflow.log_metrics(metrics)

    # log model
    mlflow.sklearn.log_model(sk_model=knn_ref, artifact_path="special_knn")

    # log and register
    # mlflow.sklearn.log_model(sk_model=knn, artifact_path="special_knn", registered_model_name="special_knn")

    # print info about the run
    print("run_id: {}".format(run.info.run_id))
    run_id.append(run.info.run_id)
    print("experiment_id: {}".format(run.info.experiment_id))
    print("status: {}".format(run.info.status))
    print("start_time: {}".format(run.info.start_time))
    print("end_time: {}".format(run.info.end_time))
    # print("lifecycle_stage: {}".format(run.info.lifecycle_stage)) # deprecated

    print(f"precision_moyenne (debut testing set) : {precision_ref}")
    print(f"jaccard_moyen (debut testing set) : {jaccard_ref}", '\n')


print(f"precision_moyenne (debut testing set) : {precision_ref}")
print(f"jaccard_moyen (debut testing set) : {jaccard_ref}", '\n')


Name: knn_optimisation
doc 0 : ['find', 'class', 'com', 'google', 'firebase', 'provider']
real tags : ['java', 'android', 'android-studio', 'error-handling', 'compiler-errors']
predicted : ['java', 'javascript', 'php', 'laravel', 'c++']
precision : 0.2
jaccard : 0.1111111111111111 

doc 1 : ['get', 'lang', 'noclassdeffounderror', 'scala', 'run', 'code']
real tags : ['java', 'scala', 'maven', 'noclassdeffounderror', 'scala-ide']
predicted : ['c', 'java', 'python', 'c#', 'ios']
precision : 0.2
jaccard : 0.1111111111111111 

doc 2 : ['django', 'bulk', 'create', 'ignore', 'duplicate']
real tags : ['python', 'mysql', 'django', 'bulkinsert', 'bulk']
predicted : ['django', 'python', 'xcode', 'ios', 'swift']
precision : 0.4
jaccard : 0.25 

doc 3 : ['difference', 'environment', 'anaconda', 'environment']
real tags : ['python', 'pycharm', 'anaconda', 'environment', 'virtual-environment']
predicted : ['python', 'anaconda', 'java', 'virtualenv', 'conda']
precision : 0.4
jaccard : 0.25 

doc 4 : [

In [59]:
# test : load the model

# load model
# model_uri = f'runs:/{run_id}/random_forest_classifier'
model_uri = f"/home/ubuntu/Bureau/OC/Projet5_oudelet_kevin/1_4_notebooks/artifacts/{run_id[0]}/artifacts/special_knn"

knn_loaded = mlflow.sklearn.load_model(model_uri=model_uri)

# Comparons :

# knn d'origine...
print(f"precision_moyenne knn d'origine : {precision_ref}")
print(f"jaccard_moyen knn d'origine : {jaccard_ref}", '\n')

# Je n'arrive pas à comprendre comment j'arrive à acceder directement à precision_ref et jaccard_ref
# alors que les variables st definies ds le scope local de la boucle while.
# run_id, au meme scope, n'est pas accesible directement ???


precision_moyenne knn d'origine : 0.264
jaccard_moyen knn d'origine : 0.17142857142857137 



In [62]:
# ... vs knn_reloaded

# same code for preprocessing

target='all_tags'
n=5

documents = train_df[feature].tolist()
gensim_dictionary = Dictionary(documents)
corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]
dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T
target_values = train_df[target].values

# no training, already done

# eval

predictions=[]
min_range=0
max_range=25 # test.shape[0]=4767
for i in range(min_range, max_range):
    query_document = test_df[feature][i]
    print(f'doc {i} : {query_document}')
    print(f'real tags : {test_df[target][i]}')
    query_bow = gensim_dictionary.doc2bow(query_document)
    query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

    # Find nearest neighbors
    _, indices = knn_loaded.kneighbors(query_vector)

    # Aggregate tags from neighbors
    neighbor_tags = [tag for i in indices.flatten() for tag in train_df.iloc[i][target]]

    # Predict tags based on most common tags among neighbors
    predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(n=n)]
    # 5 tags/question en moyenne mais on peut suggérer +
    predictions.append(predicted_tags)
    print(f'predicted : {predicted_tags}')
    print(f'precision : {precision_topics(test_df[target][i], predicted_tags)}')
    print(f'jaccard : {jaccard_similarity(test_df[target][i], predicted_tags)}', '\n')

true_tags = [tags for tags in test_df[target][min_range:max_range]]

# eval
scorer = precision_score
precision = scorer(true_tags, predictions)

scorer = jaccard_score
jaccard = scorer(true_tags, predictions)

# Resultat
print(f"precision_moyenne knn_loaded : {precision}")
print(f"jaccard_moyen knn_loaded : {jaccard}", '\n')

# Nickel, identique.


doc 0 : ['find', 'class', 'com', 'google', 'firebase', 'provider']
real tags : ['java', 'android', 'android-studio', 'error-handling', 'compiler-errors']
predicted : ['java', 'javascript', 'php', 'laravel', 'c++']
precision : 0.2
jaccard : 0.1111111111111111 

doc 1 : ['get', 'lang', 'noclassdeffounderror', 'scala', 'run', 'code']
real tags : ['java', 'scala', 'maven', 'noclassdeffounderror', 'scala-ide']
predicted : ['c', 'java', 'python', 'c#', 'ios']
precision : 0.2
jaccard : 0.1111111111111111 

doc 2 : ['django', 'bulk', 'create', 'ignore', 'duplicate']
real tags : ['python', 'mysql', 'django', 'bulkinsert', 'bulk']
predicted : ['django', 'python', 'xcode', 'ios', 'swift']
precision : 0.4
jaccard : 0.25 

doc 3 : ['difference', 'environment', 'anaconda', 'environment']
real tags : ['python', 'pycharm', 'anaconda', 'environment', 'virtual-environment']
predicted : ['python', 'anaconda', 'java', 'virtualenv', 'conda']
precision : 0.4
jaccard : 0.25 

doc 4 : ['form', 'action', 'onsu

### 2.8 Nested mlflow runs pour recherche manuelle du nb optimal de voisins


In [None]:
# tried different values here, see ui


with mlflow.start_run(run_name="k_opti_small_values", experiment_id=experiment_id) as parent:
    precision = []
    k_values = list(range(1, 25, 2))

    for k in k_values:
        with mlflow.start_run(run_name=f'child_{k}', nested=True) as child:
            print(f'RUN ID child_{k}:', child.info.run_id)

            parameters = {
                "preprocessing": 'bow',
                "feature": 'title_nltk',
                "k neighbors": k,
                "target": 'all_tags',
                "nb_tags_predicted": 5
            }
            mlflow.log_params(parameters)

            _, precision_moyenne, jaccard_moyen = predict_tags_using_knn(k=k)

            metrics = {
                "precision": precision_moyenne,
                "jaccard": jaccard_moyen
            }
            # multiple metrics
            mlflow.log_metrics(metrics)

            precision.append((k, jaccard_moyen))

    jaccard_moyen = [item[1] for item in precision]

    # Plot precision scores against k values
    plt.plot(k_values, jaccard_moyen, marker='o')
    plt.xlabel('k')
    plt.ylabel('Precision')
    plt.title('Jaccard Score for Different k Values')
    plt.grid(True)

    # Save the plot
    plt.savefig("./artifacts/jaccard_plot_1_25.png")
    # Log the saved figure using MLflow
    mlflow.log_artifact("./artifacts/jaccard_plot_1_25.png")

    # Show the plot (optional)
    plt.show()


# k=13 sur la moitie des donnes, jaccard = 0.11
# k=19 sur donnes completes, jaccard = 0.17


### 2.9 Influence du nb de topics prédits sur la précision


In [None]:
with mlflow.start_run(run_name="tags_opti", experiment_id=experiment_id) as parent:
    precision = []
    nb_tags = [3, 4, 5, 10, 20]

    for n in nb_tags:
        with mlflow.start_run(run_name=f'child_{n}', nested=True) as child:
            print(f'RUN ID child_{n}:', child.info.run_id)

            parameters = {
                "preprocessing": 'bow',
                "feature": 'title_nltk',
                "k neighbors": 13,
                "target": 'all_tags',
                "nb_tags_predicted": n
            }
            mlflow.log_params(parameters)

            _, precision_moyenne, jaccard_moyen = predict_tags_using_knn(k=13, n=n)

            metrics = {
                "precision": precision_moyenne,
                "jaccard": jaccard_moyen
            }

            # multiple metrics
            mlflow.log_metrics(metrics)

            precision.append((n, jaccard_moyen))

    jaccard_moyen = [item[1] for item in precision]

    # Plot precision scores against k values
    plt.plot(nb_tags, jaccard_moyen, marker='o')
    plt.xlabel('n')
    plt.ylabel('Precision')
    plt.title('Precision Score for Different n Values')
    plt.grid(True)

    # Save the plot
    plt.savefig("./artifacts/jaccard_plot_3_4_tags.png")
    # Log the saved figure using MLflow
    mlflow.log_artifact("./artifacts/jaccard_plot_3_4_tags.png")

    # Show the plot (optional)
    plt.show()


### 2.10 Preprocessing (feature et target)


In [None]:
# Nous avons besoin de transformer la target pdt le preprocessing (-> bow)
# mm si on ne s'sn sert pas vraiment, car grid_search.fit() n'accepte que des valeurs numériques.

# Avantage : on peut utiliser des metriques classiques pour le score (ici r2),
# mais ca n'a aucun sens metier interpretable.
# peut tjs etre utile si fortement correlé à notre precision score custom
# ou au score de similarité Jaccard

# Compliqué à faire dans le pipeline sckikit, qui transforme les features mais pas la target.
# TransformedTargetRegressor ne convient pas non plus ici : c'est un modele wrapper,
# utilisé apres le pipeline.
# trouvé qq "solutions" + ou - elegantes, mais rien de compatible à la fois avec sklearn et mlflow.

# Ici convertir les tags en bag of words ou les one hot encoder revient exactement au meme, donc
# autant utiliser le bow, on a deja le transformeur.


In [None]:
# Plus tard je vais me rendre compte que ce serait bcp mieux de tjs utiliser le mm dico ici


def token_list_into_bow(X):
    documents = X.tolist()
    # print(documents)
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    bow_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    return bow_matrix, gensim_dictionary


### 2.11 (tentative de) gridsearch


In [None]:
# Prend trop de ressources !
# les runs mlflow sont + pratiques pour encapsuler le code

# En plus je me demande si la gridsearch est une bonne idée pour le knn.
# deja pour la lda et la nmf j'avais des doutes
# a verifier mais pour ces modeles je pense qu'un dataset d'entrainement plus petit peut
# serieusement impacter le nb optimal pour l'hyperparam
# (voisins pour le knn, topics pour lda et nmf)

# tester hyperopt ?


def pipe_knn(train_df=train_df, feature=feature, target='all_tags', test_df=test_df, metric='cosine'):
    # Load your training data and labels
    X_train = train_df[feature].values
    y_train = train_df[target].values

    X_bow_matrix, _ = token_list_into_bow(X_train)
    y_bow_matrix, _ = token_list_into_bow(y_train)

    # Create a KNN Regressor
    knn_regressor = KNeighborsRegressor(metric=metric)

    # Create a pipeline with preprocessing and a knn regressor, to simplify gridsearch
    pipe = Pipeline(steps=[
        ("knn_regressor", knn_regressor)
    ])

    # Define hyperparameters and their possible values for grid search
    param_grid = {
        'knn_regressor__n_neighbors': [11, 13, 15],
        'knn_regressor__weights': ['uniform'] # , 'distance'
    }

    # Create the GridSearchCV object with multiple scoring metrics
    # scoring = {'neg_mean_squared_error': 'neg_mean_squared_error', 'r2': 'r2'}
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                            scoring='r2', cv=3, verbose=1) # add, refit='precision' for multiple scoring

    # Fit the GridSearchCV object to your training data to perform hyperparameter tuning
    grid_search.fit(X_bow_matrix, y_bow_matrix)

    # Access the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Create the KNN regressor with the best hyperparameters
    best_knn_regressor = KNeighborsRegressor(# metric=metric,
                                             n_neighbors=best_params['knn_regressor__n_neighbors'],
                                             weights=best_params['knn_regressor__weights'])

    # Create a pipeline with the preprocessor and the tuned knn regressor
    pipeline_with_tuned_knn = Pipeline(steps=[
        ("knn_regressor", best_knn_regressor)  # Use the tuned neighbor and weight values here
    ])

    # Perform cross-validation (on training set) and display the scores for each split
    # scoring = ['r2', 'neg_mean_squared_error']
    cv_scores = cross_validate(pipeline_with_tuned_knn, X_bow_matrix, y_bow_matrix, cv=5, scoring='r2')
    # print("Cross-Validation Scores (training):", '\n', cv_scores)
    print("Cross-Validation Scores:")
    pprint(cv_scores)
    for i, score in enumerate(cv_scores['test_score']):
        print(f"Split {i+1} : precision = {score}")


# crash le kernel
# pipe_knn()


### Checkpoint


In [None]:
quick_df = raw_data[::100]

train_df, test_df = train_test_split(quick_df, test_size=0.1, random_state=42)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)


In [None]:
# On oublie la gridsearchcv, mais on garde le scorer
# pour evaluer notre modele sur tt le training / testing set
# (pas seulement les 25 premieres lignes du set)


def pipe_knn(train_df=train_df, feature=feature, target='all_tags', test_df=test_df, metric='cosine'):
    # Load your training data and labels
    X_train = train_df[feature].values
    y_train = train_df[target].values

    X_bow_matrix, _ = token_list_into_bow(X_train)
    y_bow_matrix, _ = token_list_into_bow(y_train)

    # Create a KNN Regressor
    knn_regressor = KNeighborsRegressor(metric=metric)

    # Define hyperparameters and their possible values for tuning
    params = {
        'n_neighbors': 13,
        'weights': 'uniform'
    }

    # Create the KNN regressor with the specified hyperparameters
    knn_regressor.set_params(**params)

    # Fit the KNN regressor to your training data
    knn_regressor.fit(X_bow_matrix, y_bow_matrix)

    # Evaluate the model using a scorer
    scorer = make_scorer(r2_score)
    score = scorer(knn_regressor, X_bow_matrix[::100], y_bow_matrix[::100])
    print("R2 Score:", score)

    return knn_regressor


# Demande bcp trop de ressources sur le training set complet. (31.5 GiB pour y_pred)
# Il faut se contenter d'un sample.
# Utiliser un set de validation ici ?

# Du coup pour pouvoir utiliser jaccard en score, on a besoin de la méthode .predict() des classifiers.

pipe_knn()


## 3 Utilisation d'un classifier multilabel


### 3.1 nb de tags pris en compte


In [None]:
# Utiliser un regresseur permet de facilement travailler avec l'ensemble des tags, mais
# nécessite une méthode .predict() custom, et les prédictions sont très lentes :
# impossible d'évaluer notre modèle sur une partie importante du dataset,
# on doit se contenter d'un petit sample.

# Voyons si un regresseur multilabel peut résoudre ces 2 problèmes


In [None]:
# Reconstruire le dictionnaire {tag : frequence}
# pour travailler sur les n tags les + frequents

all_tags = [tag for tags in raw_data['all_tags'] for tag in tags]
print(f'Il y a {len(all_tags)} tags au total. \n')
print(f'Il y a {len(set(all_tags))} tags différents. \n')

# display(questions_tags)

# Fréquence
tag_frequencies_dict = {}
for tag in all_tags:
    tag_frequencies_dict[tag] = tag_frequencies_dict.get(tag, 0) + 1

# Sort the dictionary items by values in descending order
sorted_tag_frequencies = dict(sorted(tag_frequencies_dict.items(), key=lambda item: item[1], reverse=True))

# Extract tags and frequencies
tags = list(sorted_tag_frequencies.keys())
frequencies = list(sorted_tag_frequencies.values())

df_freq = pd.DataFrame({'Tag': tags, 'Frequency': frequencies})


In [None]:
def keep_most_frequent_tags(list_tags, df_freq=df_freq, lim=100):
    """
    Keep only the n most frequent tags (default 100).

    Parameters:
    - liste_tokens (list): The input list of tokens to be processed.
    - forbidden_list (set): A set of forbidden tokens, which includes stop words,
      tokens exceeding upper frequency limits, and tokens falling below lower frequency limits.

    Returns:
    list: A filtered list of tokens that excludes stop words, tokens exceeding upper frequency limits,
    and tokens falling below lower frequency limits.
    """

    mandatory_list = df_freq['Tag'][:lim].tolist()
    filtered_list = [token for token in list_tags if token in mandatory_list]

    return filtered_list


raw_data['top_tags_50'] = raw_data['all_tags'].apply(
    lambda x: keep_most_frequent_tags(x)
)

 # info targets vides
print(f'nb de targets desormais vides : {raw_data.loc[raw_data["top_tags_50"].apply(len) == 0, :].shape[0]} \
       (/ 50 000)', '\n')

display(raw_data.head())


In [None]:
# MultiLabelBinarizer: Does not handle missing values directly. Missing values need to be handled
# before applying the transformation.
# not so sure about that


print(raw_data.shape)

data_50 = raw_data.loc[raw_data["top_tags_50"].apply(len) >= 0, :].copy()

print(data_50.shape, '\n')

display(data_50.head())


### Checkpoint


In [None]:
quick_df = data_50[::100]

train_df, test_df = train_test_split(quick_df, test_size=0.1, random_state=42)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)


### 3.2 dictionnaire (gensim) standard


In [None]:
# Nécessaire pour predire sans data leak

_, standard_dict_title = token_list_into_bow(train_df[feature])


def token_list_into_bow_using_specific_dict(X, dict=standard_dict_title):
    documents = X.tolist()
    # print(documents)
    gensim_dictionary = dict
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    bow_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    return bow_matrix


### 3.3 Score : comparaison training / testing set


In [None]:
def knn_classifier(train_df=train_df, feature=feature, k=13, target='top_tags_50', test_df=test_df,
                   dict=standard_dict_title):

    # Load training data and labels
    X_train = train_df[feature].values
    X_test = test_df[feature].values
    y_train = train_df[target].tolist()
    y_test = test_df[target].tolist()

    # training set
    X_bow_matrix = token_list_into_bow_using_specific_dict(X_train, dict)
    # testing set
    X_test_bow_matrix = token_list_into_bow_using_specific_dict(X_test, dict)

    # target encoding
    mlb = MultiLabelBinarizer()
    y_encoded = mlb.fit_transform(y_train)
    y_test_encoded = mlb.fit_transform(y_test)

    # Create a KNN classifier this time
    knn = KNeighborsClassifier()

    # Define hyperparameters and their possible values for tuning
    params = {
        'n_neighbors': k
    }

    # Create the KNN regressor with the specified hyperparameters
    knn.set_params(**params)

    # Fit the KNN regressor to your training data
    knn.fit(X_bow_matrix, y_encoded)

    # Evaluate the model on training data using precision
    print('SCORES TAGS ENCODÉS (MLB)', '\n')
    predictions = knn.predict(X_bow_matrix)

    scorer = precision_score
    score = scorer(predictions, y_encoded)
    print("Precision score (training set): ", score)

    # Evaluate the model on training data using jaccard similarity
    scorer = jaccard_score
    score = scorer(predictions, y_encoded)
    print("Jaccard similarity (training set): ", score)
    print('\n')

    # Evaluate the model on test data
    predictions = knn.predict(X_test_bow_matrix)
    scorer = precision_score
    score = scorer(predictions, y_test_encoded)
    print("Precision score (testing set): ", score)

    scorer = jaccard_score
    score = scorer(predictions, y_test_encoded)
    print("Jaccard similarity (testing set): ", score)

    return knn, mlb, dict


# knn_100, mlb_100, dict_100 = knn_classifier()

# pb precision ??
# OK hypothèse : j'ai modifié la fonction precision_score() pour qu'elle ne tienne pas compte de l'ordre.
# C'était + pratiquee pour comparer les tags predits aux tags reels (sous forme de listes de tokens)
# Mais si on les compare sous forme encodée, alors l'ordre importe. Pour ma fonction presque tous les
# mots sont identiques : 000000000000000000010000000000000000000...

# Solution : utiliser notre fonction custom seulement pour comparer les tokens,
# mais pas les representations bag of words.

# checker exemples


In [None]:
def knn_classifier(train_df=train_df, feature=feature, k=13, target='top_tags_50', test_df=test_df,
                   dict=standard_dict_title):

    # Load training data and labels
    X_train = train_df[feature].values
    X_test = test_df[feature].values
    y_train = train_df[target].tolist()
    y_test = test_df[target].tolist()

    # training set
    X_bow_matrix = token_list_into_bow_using_specific_dict(X_train, dict)
    # testing set
    X_test_bow_matrix = token_list_into_bow_using_specific_dict(X_test, dict)

    # target encoding
    mlb_train = MultiLabelBinarizer()
    y_encoded = mlb_train.fit_transform(y_train)

    # Create a KNN classifier this time
    knn = KNeighborsClassifier()

    # Define hyperparameters and their possible values for tuning
    params = {
        'n_neighbors': k
    }

    # Create the KNN regressor with the specified hyperparameters
    knn.set_params(**params)

    # Fit the KNN regressor to your training data
    knn.fit(X_bow_matrix, y_encoded)

    predictions_train = knn.predict(X_bow_matrix)
    predictions_test = knn.predict(X_test_bow_matrix)

    print('SCORES TAGS (TOKENS)', '\n')

    predicted_tags_train = mlb_train.inverse_transform(predictions_train)
    predicted_tags_test = mlb_train.inverse_transform(predictions_test)

    # Evaluate the model on training data using precision
    scorer = precision_score
    score = scorer(predicted_tags_train, y_train)
    print("Precision score (training set): ", score)

    # Evaluate the model on training data using jaccard similarity
    scorer = jaccard_score
    score = scorer(predicted_tags_train, y_train)
    print("Jaccard similarity (training set): ", score)
    print('\n')

    # Evaluate the model on test data
    scorer = precision_score
    score = scorer(predicted_tags_test, y_test)
    print("Precision score (testing set): ", score)

    scorer = jaccard_score
    score = scorer(predicted_tags_test, y_test)
    print("Jaccard similarity (testing set): ", score)


    return knn, mlb_train, dict


knn_50, mlb_50, dict_50 = knn_classifier()

# Ah, enfin qqch de coherent !
# bcp moins bon du coup, mais coherent

# En fait la précision "profite" des cas, qui semblent nombreux ici, où un seul tag est prédit.
# Jaccard est vraiment + robuste


### Checkpoint (+ choix target)


In [None]:
def create_top_tags_feature(raw_data=raw_data, n=50, slice=20):
    raw_data['top_tags'] = raw_data['all_tags'].apply(
        lambda x: keep_most_frequent_tags(x, lim=n)
    )

    # Pas sûr que la ligne suivante soit indispensable
    data = raw_data.loc[raw_data['top_tags'].apply(len) >= 0, :].copy()
    print(data.shape, '\n')

    slice_df = data[::slice]
    # slice_df.reset_index(drop=True, inplace=True)

    train_df, test_df = train_test_split(slice_df, test_size=0.1, random_state=42)

    train_df.reset_index(drop=True, inplace=True)
    test_df.reset_index(drop=True, inplace=True)


    return train_df, test_df


train_df, test_df = create_top_tags_feature(n=30, slice=100)
_, standard_dict_title = token_list_into_bow(train_df[feature])


In [None]:
display(raw_data.head())


In [None]:
knn_50, mlb_50, dict_50 = knn_classifier(train_df=train_df, k=19, target='top_tags', test_df=test_df)

# Ici je teste differentes valeurs pour le nb de top_tags.
# J'espérais qu'en prenant en compte davantage de tags, on aurait moins de targets vides
# et donc (hopefully) moins de prédictions nulles.

# Problèmes : le tps d'entrainement explose...
# # Si on garde les 5000 tags les + frequents on passe de qq minutes (2 min pour 50 tags) à...
# c'est long ! (+ d'une heure)
# Pourtant les targets st tjs assez vides, et bcp de prédictions .predict() st tjs nulles.

# ... et le score chute.

# En fait mon knn bricolé au début était pas si mal


In [None]:
def string_query_into_bow_vector(query, dictionary):
    query_token_list = preprocess_text(query)
    query_bow = dictionary.doc2bow(query_token_list)
    query_vector = corpus2dense([query_bow], num_terms=len(dictionary)).T

    return query_vector


### 3.4 predict...


In [None]:
# predict


def predict_tags(test_data, model, mlb_encoder, dictionary):
    # Transform test features
    query = string_query_into_bow_vector(test_data, dictionary)

    # Make predictions
    predicted_labels = model.predict(query)

    # Inverse transform predicted labels
    predicted_tags_tuple = mlb_encoder.inverse_transform(predicted_labels)
    predicted_tags = list(predicted_tags_tuple[0]) # mlb retourne un array

    return query, predicted_tags


def test_prediction(index=0):
    test_data = test_df['title'][index]
    _, predicted_tags = predict_tags(test_data, knn_50, mlb_50, dict_50)

    print(f"All tags : {test_df['all_tags'][index]}")
    print(f"Target : {test_df['top_tags'][index]}")
    print(f"Predicted : {predicted_tags}", '\n')

    # scores
    scorer = precision_topics
    score = scorer(predicted_tags, test_df['top_tags'][index])
    print("Precision score : ", score)

    scorer = jaccard_similarity
    score = scorer(predicted_tags, test_df['top_tags'][index])
    print("Jaccard similarity : ", score)
    print('\n')


test_prediction()


In [None]:
for i in range(0, 10):
    test_prediction(i)

# bcp de predictions nulles


### 3.5 ... VS predict_proba


In [None]:
def always_predict_tags(test_data, knn_model=knn_50, mlb_encoder=mlb_50, dictionary=dict_50, top_n=5):
    # Transform test features
    query = string_query_into_bow_vector(test_data, dictionary)

    # Make predictions and get probabilities
    probabilities = knn_model.predict_proba(query)
    # print(probabilities[:3])

    # Extract tags from the mlb_encoder
    tags = mlb_encoder.classes_

    # Create a dictionary to store results
    results = {}

    # Iterate over probabilities and tags
    for i, tag in enumerate(tags):
        # Extract the probability for the current tag
        probability = probabilities[i][0][0]
        # Store the probability with the tag as the key
        results[tag] = probability

    # Sort the dictionary items by values in descending order
    sorted_results = dict(sorted(results.items(), key=lambda item: item[1], reverse=True))
    predicted_tags = list(sorted_results)[:top_n]
    # Construct a list of tuples containing tags and their corresponding probabilities
    tags_with_probas = [(tag, sorted_results[tag]) for tag in predicted_tags]

    return tags_with_probas


def test_prediction(index=0):
    predicted_tags = always_predict_tags(test_df['title'][index])
    print(test_df['all_tags'][index])
    print(test_df['top_tags'][index])
    print(predicted_tags, '\n')


test_prediction()

# bcp de probas == 1 alors qu'on n'avait pas de prediction (ou 1) sur cet exemple ??
# On a exactement le problème inverse !


In [None]:
for i in range(0, 10):
    test_prediction(i)

# bcp trop de valeurs à 1
# predit svt la mm chose (ordre alphabetique)

# pour le knn on peut tjs utiliser notre predict custom, mais pour les autres modeles
# il faudra une difference plus nette.


## 4 Autres classifieur multilabels


### 4.1 Logistic regression


### checkpoint


In [None]:
train_df, test_df = create_top_tags_feature(n=100, slice=20)
_, standard_dict_title = token_list_into_bow(train_df[feature])


In [None]:
# pas nativement mutilabel -> approche one vs rest via MultiOutputClassifier()


def logistic_regression_classifier(train_df=train_df, feature=feature, dict=standard_dict_title,
                                   target='top_tags', test_df=test_df):

    X_train = train_df[feature].values
    X_test = test_df[feature].values
    y_train = train_df[target].tolist()
    y_test = test_df[target].tolist()

    # training set
    X_bow_matrix = token_list_into_bow_using_specific_dict(X_train, dict)
    # testing set
    X_test_bow_matrix = token_list_into_bow_using_specific_dict(X_test, dict)

    # target encoding
    mlb_train = MultiLabelBinarizer()
    y_encoded = mlb_train.fit_transform(y_train)

    # Create a Logistic Regression classifier
    logistic_reg = LogisticRegression(random_state=42)

    # Create a MultiOutputClassifier with the Logistic Regression classifier
    multi_output_classifier = MultiOutputClassifier(logistic_reg)

    # Fit the MultiOutputClassifier to your training data
    multi_output_classifier.fit(X_bow_matrix, y_encoded)
    print('training fini', '\n')

    predictions_train = multi_output_classifier.predict(X_bow_matrix)
    predictions_test = multi_output_classifier.predict(X_test_bow_matrix)

    print('SCORES TAGS (TOKENS)', '\n')

    predicted_tags_train = mlb_train.inverse_transform(predictions_train)
    predicted_tags_test = mlb_train.inverse_transform(predictions_test)

    # Evaluate the model on training data using precision
    scorer = precision_score
    score = scorer(predicted_tags_train, y_train)
    print("Precision score (training set): ", score)

    # Evaluate the model on training data using jaccard similarity
    scorer = jaccard_score
    score = scorer(predicted_tags_train, y_train)
    print("Jaccard similarity (training set): ", score)
    print('\n')

    # Evaluate the model on test data
    scorer = precision_score
    score = scorer(predicted_tags_test, y_test)
    print("Precision score (testing set): ", score)

    scorer = jaccard_score
    score = scorer(predicted_tags_test, y_test)
    print("Jaccard similarity (testing set): ", score)


    return multi_output_classifier, mlb_train


multi_output_classifier, mlb_50 = logistic_regression_classifier()

# create_top_tags_feature(n=100, slice=3)

# Precision score (training set):  0.5929012654012678
# Jaccard similarity (training set):  0.44822494172494076

# Precision score (testing set):  0.45291588000839084
# Jaccard similarity (testing set):  0.32760196589649054


In [None]:
# predict


def test_prediction(index, model):
    query = test_df['title'][index]
    _, predicted_tags = predict_tags(query, model, mlb_50, standard_dict_title)
    print(test_df['all_tags'][index])
    print(test_df['top_tags'][index])
    print(predicted_tags, '\n')


test_prediction(0, multi_output_classifier)


In [None]:
for i in range(1,20):
    test_prediction(i, multi_output_classifier)

# Pas mal au début, bcp de vide ensuite


In [None]:
# predict proba


def test_prediction_proba(index, model):
    query = test_df['title'][index]
    predicted_tags = always_predict_tags(query, model, mlb_50, standard_dict_title)
    print(test_df['all_tags'][index])
    print(test_df['top_tags'][index])
    print(predicted_tags, '\n')


for i in range(0,20):
    test_prediction_proba(i, multi_output_classifier)

# Par rapport aux résultats du knn, au moins ici il varie les réponses
# mais les prédictions semblent très mauvaises.

# Je ne comprends pas pourquoi on ne retrouve pas les mm prédictions qd on utilise
# .predict() ou .predict_proba() ?? Ce sont 2 méthodes du mm modèle..


### 4.2 SGDClassifier


### Checkpoint


In [None]:
train_df, test_df = create_top_tags_feature(n=100, slice=20)
_, standard_dict_title = token_list_into_bow(train_df[feature])


In [None]:
# using default SVM


def sgd_classifier(train_df=train_df, feature=feature, target='top_tags',
                   dict=standard_dict_title, test_df=test_df):

    # Load your training data and labels
    X_train = train_df[feature].values
    X_test = test_df[feature].values
    y_train = train_df[target].tolist()
    y_test = test_df[target].tolist()

    # training set
    X_bow_matrix = token_list_into_bow_using_specific_dict(X_train, dict)
    # testing set
    X_test_bow_matrix = token_list_into_bow_using_specific_dict(X_test, dict)

    # target encoding
    mlb_train = MultiLabelBinarizer()
    y_encoded = mlb_train.fit_transform(y_train)

    # Create an SGDClassifier with default parameters
    sgd_clf = SGDClassifier(random_state=42)  # default loss='hinge', gives a linear SVM

    # Create a MultiOutputClassifier with the SGD classifier
    multi_output_classifier = MultiOutputClassifier(sgd_clf)

    # Fit the MultiOutputClassifier to your training data
    multi_output_classifier.fit(X_bow_matrix, y_encoded)
    print('training fini', '\n')

    predictions_train = multi_output_classifier.predict(X_bow_matrix)
    predictions_test = multi_output_classifier.predict(X_test_bow_matrix)

    print('SCORES TAGS (TOKENS)', '\n')

    predicted_tags_train = mlb_train.inverse_transform(predictions_train)
    predicted_tags_test = mlb_train.inverse_transform(predictions_test)

    # Evaluate the model on training data using precision
    scorer = precision_score
    score = scorer(predicted_tags_train, y_train)
    print("Precision score (training set): ", score)

    # Evaluate the model on training data using jaccard similarity
    scorer = jaccard_score
    score_train = scorer(predicted_tags_train, y_train)
    print("Jaccard similarity (training set): ", score_train)
    print('\n')

    # Evaluate the model on test data
    scorer = precision_score
    score = scorer(predicted_tags_test, y_test)
    print("Precision score (testing set): ", score)

    scorer = jaccard_score
    score_test = scorer(predicted_tags_test, y_test)
    print("Jaccard similarity (testing set): ", score_test)

    return multi_output_classifier, mlb_train, score_train, score_test


multi_output_classifier_svm, mlb_50, jaccard_train, jaccard_test = sgd_classifier()

# tres bon score, bcp + rapide


In [None]:
# predict (best score)

for i in range(0,20):
    test_prediction(i, multi_output_classifier_svm)

# Pas mal du tout !
# Encore bcp de prédictions vides


In [None]:
# predict proba
# test_prediction_proba(0, multi_output_classifier_svm)

# Ne fonctionne pas, la SVM du SGDClassifier ne possède pas la méthode predict_proba


In [None]:
# using logistic reg from SGDClassifier (loss='log_loss')


def sgd_classifier(train_df=train_df, feature=feature, target='top_tags',
                   dict=standard_dict_title, test_df=test_df):

    X_train = train_df[feature].values
    X_test = test_df[feature].values
    y_train = train_df[target].tolist()
    y_test = test_df[target].tolist()

    # training set
    X_bow_matrix = token_list_into_bow_using_specific_dict(X_train, dict)
    # testing set
    X_test_bow_matrix = token_list_into_bow_using_specific_dict(X_test, dict)

    # target encoding
    mlb_train = MultiLabelBinarizer()
    y_encoded = mlb_train.fit_transform(y_train)

    # Create an SGDClassifier with default parameters
    sgd_clf = SGDClassifier(random_state=42, loss='log_loss')  # for logistic regression

    # Create a MultiOutputClassifier with the SGD classifier
    multi_output_classifier = MultiOutputClassifier(sgd_clf)

    # Fit the MultiOutputClassifier to your training data
    multi_output_classifier.fit(X_bow_matrix, y_encoded)
    print('training fini', '\n')

    predictions_train = multi_output_classifier.predict(X_bow_matrix)
    predictions_test = multi_output_classifier.predict(X_test_bow_matrix)

    print('SCORES TAGS (TOKENS)', '\n')

    predicted_tags_train = mlb_train.inverse_transform(predictions_train)
    predicted_tags_test = mlb_train.inverse_transform(predictions_test)

    # Evaluate the model on training data using precision
    scorer = precision_score
    score = scorer(predicted_tags_train, y_train)
    print("Precision score (training set): ", score)

    # Evaluate the model on training data using jaccard similarity
    scorer = jaccard_score
    score_train = scorer(predicted_tags_train, y_train)
    print("Jaccard similarity (training set): ", score_train)
    print('\n')

    # Evaluate the model on test data
    scorer = precision_score
    score = scorer(predicted_tags_test, y_test)
    print("Precision score (testing set): ", score)

    scorer = jaccard_score
    score_test = scorer(predicted_tags_test, y_test)
    print("Jaccard similarity (testing set): ", score_test)

    return multi_output_classifier, mlb_train, score_train, score_test


multi_output_classifier_lr, mlb_50, _, _ = sgd_classifier()


In [None]:
# predict

for i in range(0,20):
    test_prediction(i, multi_output_classifier_lr)

# similaire à la LR testée + haut (4.1)


In [None]:
# predict_proba

for i in range(0,20):
    test_prediction_proba(i, multi_output_classifier_lr)

# idem


### 4.3 Random forest


### (checkpoint)


In [None]:
# train_df, test_df = create_top_n_tags_feature(n=50, slice=50)
# _, standard_dict_title = token_list_into_bow(train_df[feature])


In [None]:
def random_forest_classifier(train_df=train_df, feature=feature, target='top_tags',
                             dict=standard_dict_title, test_df=test_df):

    # Load your training data and labels
    X_train = train_df[feature].values
    X_test = test_df[feature].values
    y_train = train_df[target].tolist()
    y_test = test_df[target].tolist()

    # training set
    X_bow_matrix = token_list_into_bow_using_specific_dict(X_train, dict)
    # testing set
    X_test_bow_matrix = token_list_into_bow_using_specific_dict(X_test, dict)

    # target encoding
    mlb_train = MultiLabelBinarizer()
    y_encoded = mlb_train.fit_transform(y_train)

    # Create a Random Forest Classifier with default parameters
    rf_clf = RandomForestClassifier(random_state=42)

    # Create a MultiOutputClassifier with the Random Forest Classifier
    multi_output_classifier = MultiOutputClassifier(rf_clf)

    # Fit the MultiOutputClassifier to your training data
    multi_output_classifier.fit(X_bow_matrix, y_encoded)
    print('training fini', '\n')

    predictions_train = multi_output_classifier.predict(X_bow_matrix)
    predictions_test = multi_output_classifier.predict(X_test_bow_matrix)

    print('SCORES TAGS (TOKENS)', '\n')

    predicted_tags_train = mlb_train.inverse_transform(predictions_train)
    predicted_tags_test = mlb_train.inverse_transform(predictions_test)

    # Evaluate the model on training data using precision
    scorer = precision_score
    score = scorer(predicted_tags_train, y_train)
    print("Precision score (training set): ", score)

    # Evaluate the model on training data using jaccard similarity
    scorer = jaccard_score
    score = scorer(predicted_tags_train, y_train)
    print("Jaccard similarity (training set): ", score)
    print('\n')

    # Evaluate the model on test data
    scorer = precision_score
    score = scorer(predicted_tags_test, y_test)
    print("Precision score (testing set): ", score)

    scorer = jaccard_score
    score = scorer(predicted_tags_test, y_test)
    print("Jaccard similarity (testing set): ", score)

    return multi_output_classifier, mlb_train


multi_output_classifier_rf, mlb_50_rf = random_forest_classifier()

# + lent
# score wow


In [None]:
# predict

for i in range(0,20):
    test_prediction(i, multi_output_classifier_rf)

# Plutôt pas mal... qd on a une réponse !
# (tjs le mm problème)


In [None]:
# predict proba

for i in range(0,20):
    test_prediction_proba(i, multi_output_classifier_rf)

# nope


### 4.4 Conclusion / classifiers multilabels


In [None]:
# La SVM a le meilleur score sur le validation set (testing set ici pour simplifier,
# on utilisera un véritable set de validation ds la partie suivante).

# C'est aussi le modèle qui semble donner le moins de prédictions nulles (méthode .predict),
# d'après les qq exemples observés. Mais il en propose encore bcp.

# Les résultats fournis par la méthode .predict_proba sont difficilement exploitables.

# J'en reviens à l'idée d'une fonction .predict custom, comme pour le knn (regresseur) utilisé
# en partie 1. Mais pas sûr comment faire qqch d'équivalent pour les autres modèles
# (les "forcer" à proposer une réponse).

# Problème : la performance du knn modifié est difficile à évaluer (prend trop de tps).


## 5 Drift


In [None]:
raw_data['CreationDate'] = pd.to_datetime(raw_data['CreationDate'])

raw_data['year_month'] = raw_data['CreationDate'].map(lambda dt: dt.strftime('%Y-%m'))

print(raw_data['year_month'].min())
print(raw_data['year_month'].max())


In [None]:
grouped_month = raw_data.groupby('year_month').size().to_frame("count_per_month").reset_index()

fig, ax = plt.subplots(figsize=(12, 3))  # Adjust the figsize as needed

grouped_month.plot(kind='bar', x='year_month', y='count_per_month', fontsize=12, ax=ax, width=0.8)
plt.title('nb questions par mois', fontsize=18, pad=20)
plt.legend(loc='upper right', fontsize=12)
plt.xticks(fontsize=5)
plt.show()


In [None]:
# "Concernant l’analyse de la stabilité du modèle, il est attendu de prendre un dataset par mois de questions
# sur un an, et de mesurer l’évolution des mesures et scores du modèle défini, afin d’en conclure une
# stabilité ou pas d’un modèle de référence établi sur des questions antérieures à cette période.""

# test rapide avec svm

month = []

unique_values = np.sort(raw_data['year_month'].unique())
print(len(unique_values))
print(unique_values)

# On ne va pas utiliser le premier mois (mai 2011) parce que d'après le graph il y a peu de questions ce mois-là.
# Pas idéal pour entrainer notre modele,

for mois in unique_values[1:13]:

    month.append(raw_data.loc[raw_data['year_month'] == mois, :].copy())

print(len(month))


In [None]:
display(month[0].head())


In [None]:
display(month[11].head()) # last one


In [None]:
## ttsplit

train_df, test_df = train_test_split(month[0], test_size=0.1, random_state=42)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# month 0
print('Mois 0 (entrainement) :', '\n')

svm, mlb, jaccard_train_0, jaccard_test_0 = sgd_classifier(train_df=train_df, feature=feature, target='top_tags',
                                                       dict=standard_dict_title, test_df=test_df)

jaccard_train = [jaccard_train_0]
jaccard_test = [jaccard_test_0]


In [None]:
# add mlflow ?
# months 1-11

for i in range(1, 12):
    train_df, test_df = train_test_split(month[i], test_size=0.1, random_state=42)

    train_df.reset_index(drop=True, inplace=True)
    test_df.reset_index(drop=True, inplace=True)

    X_train = train_df[feature].values
    y_train = train_df['top_tags'].tolist()

    X_test = test_df[feature].values
    y_test = test_df['top_tags'].tolist()

    # Training set
    X_bow_matrix = token_list_into_bow_using_specific_dict(X_train, standard_dict_title)
    X_test_bow_matrix = token_list_into_bow_using_specific_dict(X_test, standard_dict_title)

    # Target encoding
    # Plus necessaire ici, le modele est deja entraine
    # y_encoded = mlb.fit_transform(y_train)

    # Evaluation
    predictions_train = svm.predict(X_bow_matrix)
    predictions_test = svm.predict(X_test_bow_matrix)

    predicted_tags_train = mlb.inverse_transform(predictions_train)
    predicted_tags_test = mlb.inverse_transform(predictions_test)

    # Evaluate the model on training data using jaccard similarity
    scorer = jaccard_score
    score_train = scorer(predicted_tags_train, y_train)

    # Evaluate the model on test data
    score_test = scorer(predicted_tags_test, y_test)

    jaccard_train.append(score_train)
    jaccard_test.append(score_test)


for i in range(0, 12):
    print(f'Mois {i} : jaccard train = {jaccard_train[i]}, jaccard test = {jaccard_test[i]}')


# graph
# tester evidently AI ?
# use mlflow recipe ?


In [None]:
# Plot Jaccard similarity scores
plt.plot(range(0, 12), jaccard_train, color='grey', label='Jaccard Train')
plt.plot(range(0, 12), jaccard_test, color='blue', label='Jaccard Test')

# Add labels and title
plt.xlabel('Month')
plt.ylabel('Jaccard Similarity')
plt.title('Model Drift: Jaccard Similarity over Months')

# Add legend
plt.legend()

# Show plot
plt.show()

# Assez stable, le trend semble descendant mais légèrement
# "creux" apres 8 mois, le score jaccard qui tourne generalement autour de 0.2 tombe à 0.1


## NN Models


In [None]:
# pb : dim bow trop importante,
# Ca n'a aucun sens d'entrainer un réseau neuronal sur autant de poids / biais
# avec mes ressources
# (mm avec + de ressources, sans doute pas une idée tres efficace..)
# (en + la plupart seraient = 0, pas idéal)

# add Dimensionality Reduction ? (pca? lda/nmf?)

# On comparera plutôt les modèles classiques et le NN (réseau neuronaux) sur les embeddings, denses,
# en partie 4_2 (notebook suivant)
