## **Catégorisez automatiquement des questions**

### partie 4/8 : Prédiction de tags, approche supervisée + tracking mlflow

#### <br> Notebook d’exploration et de pré-traitement des questions, comprenant une analyse univariée et multivariée, un nettoyage des questions, un feature engineering de type bag of words avec réduction de dimension (du vocabulaire et des tags) 

<br>


## Import librairies


In [2]:
import os, sys, random
import ast
import numpy as np
import pandas as pd
from typing import Tuple
from pandarallel import pandarallel
from pprint import pprint
import json
import requests
from collections import Counter

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from PIL import Image

# NLP
from bs4 import BeautifulSoup
import re, string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

#
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import similarities
from gensim.models.ldamulticore import LdaMulticore

#
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import make_scorer, PredictionErrorDisplay
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor


print('\nPython version ' + sys.version)
print('pyLDAvis version ' + pyLDAvis.__version__)

# Modify if necessary
num_cores = os.cpu_count()
print(f"\nNumber of CPU cores: {num_cores}")
pandarallel.initialize(progress_bar=False, nb_workers=6)

#
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature, ModelSignature #, Schema, ParamSchema
from mlflow.types import Schema, ParamSchema, ParamSpec, ColSpec

# os.environ['MLFLOW_TRACKING_URI'] = './'

# ! REQUIRES CONSOLE COMMAND : mlflow ui
# Utilisable seulement en local...
mlflow.set_tracking_uri("http://localhost:5000")
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



Python version 3.11.5 (main, Sep 11 2023, 13:23:44) [GCC 11.2.0]
pyLDAvis version 3.4.0

Number of CPU cores: 8
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### functions


In [3]:
def get_missing_values(df):
    """Generates a DataFrame containing the count and proportion of missing values for each feature.

    Args:
        df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
        pandas.DataFrame: A DataFrame with columns for the feature name, count of missing values,
        count of non-missing values, proportion of missing values, and data type for each feature.
    """
    # Count the missing values for each column
    missing = df.isna().sum()

    # Calculate the percentage of missing values
    percent_missing = df.isna().mean() * 100

    # Create a DataFrame to store the results
    missings_df = pd.DataFrame({
        'column_name': df.columns,
        'missing': missing,
        'present': df.shape[0] - missing,  # Count of non-missing values
        'percent_missing': percent_missing.round(2),  # Rounded to 2 decimal places
        'type': df.dtypes
    })

    # Sort the DataFrame by the count of missing values
    missings_df.sort_values('missing', inplace=True)

    return missings_df

# with pd.option_context('display.max_rows', 1000):
#   display(get_missing_values(df))


def quick_look(df, miss=True):
    """
    Display a quick overview of a DataFrame, including shape, head, tail, unique values, and duplicates.

    Args:
        df (pandas.DataFrame): The input DataFrame to inspect.
        check_missing (bool, optional): Whether to check and display missing values (default is True).

    The function provides a summary of the DataFrame, including its shape, the first and last rows, the count of unique values per column, and the number of duplicates.
    If `check_missing` is set to True, it also displays missing value information.
    """
    print(f'shape : {df.shape}')

    display(df.head())
    display(df.tail())

    print('uniques :')
    display(df.nunique())

    print('Doublons ? ', df.duplicated(keep='first').sum(), '\n')

    if miss:
        display(get_missing_values(df))


In [4]:
def create_mlflow_experiment(
    experiment_name: str, artifact_location: str, tags: dict[str, str]
) -> str:
    """
    Create a new mlflow experiment with the given name and artifact location.

    Parameters:
    ----------
    experiment_name: str
        The name of the experiment to create.
    artifact_location: str
        The artifact location of the experiment to create.
    tags: dict[str,Any]
        The tags of the experiment to create.

    Returns:
    -------
    experiment_id: str
        The id of the created experiment.
    """
    try:
        experiment_id = mlflow.create_experiment(
            name=experiment_name, artifact_location=artifact_location, tags=tags
        )
    except:
        print(f"Experiment {experiment_name} already exists.")
        experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    mlflow.set_experiment(experiment_name=experiment_name)

    return experiment_id


def get_mlflow_experiment(
    experiment_id: str = None, experiment_name: str = None
) -> mlflow.entities.Experiment:
    """
    Retrieve the mlflow experiment with the given id or name.

    Parameters:
    ----------
    experiment_id: str
        The id of the experiment to retrieve.
    experiment_name: str
        The name of the experiment to retrieve.

    Returns:
    -------
    experiment: mlflow.entities.Experiment
        The mlflow experiment with the given id or name.
    """
    if experiment_id is not None:
        experiment = mlflow.get_experiment(experiment_id)
    elif experiment_name is not None:
        experiment = mlflow.get_experiment_by_name(experiment_name)
    else:
        raise ValueError("Either experiment_id or experiment_name must be provided.")

    return experiment


def turn_str_back_into_list(df):
    """Correct the type change due to .csv export"""

    df['title_nltk'] = df['title_nltk'].apply(ast.literal_eval)
    df['body_nltk'] = df['body_nltk'].apply(ast.literal_eval)
    df['title_spacy'] = df['title_spacy'].apply(ast.literal_eval)
    df['body_spacy'] = df['body_spacy'].apply(ast.literal_eval)
    df['all_tags'] = df['all_tags'].apply(ast.literal_eval)



In [5]:
# Again, this needs mlfow ui console command first -> unusable on remote server
# all_experiments = client.search_experiments()
# pprint(all_experiments)


### import data


In [6]:
train = pd.read_csv('./../data/cleaned_data/train_bow_uniques.csv', sep=',')
test = pd.read_csv('./../data/cleaned_data/test_bow_uniques.csv', sep=',')

turn_str_back_into_list(train)
turn_str_back_into_list(test)

display(train.tail())

train.shape
print(test.shape)


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
42893,2017-02-23 11:34:31,Do we need clear MDC after HTTP request in Spring,According to this answer thread local variable...,"[java, spring, logging, log4j, logback]","[need, mdc, request, spring]","[need, mdc, request, spring, accord, answer, t...","[need, request]","[accord, answer, thread, variable, use, clear,..."
42894,2011-10-13 20:57:32,How to make i18n with Handlebars.js (mustache ...,I'm currently using Handlebars.js (associated ...,"[javascript, jquery, internationalization, han...","[make, i18n, handlebar, template]","[make, i18n, handlebar, template, associate, b...",[template],"[associate, web, app, client, render, issue, w..."
42895,2012-09-06 00:16:46,How can I make R read my environmental variables?,I am running R on EC2 spot instances and I nee...,"[linux, r, ubuntu, amazon-ec2, environment-var...","[make, read, variable]","[make, read, variable, run, spot, instance, ne...","[read, variable]","[run, spot, instance, need, terminate, cancel,..."
42896,2021-03-23 03:50:50,How to prevent react-query from fetching initi...,I'm using react-query v3.13 to fetch data from...,"[javascript, reactjs, fetch, react-query, swr]","[prevent, query, fetch, enable]","[prevent, query, fetch, enable, data, want, po...","[prevent, react, query, fetch, enable]","[react, query, fetch, datum, want, api, point,..."
42897,2016-03-17 04:19:15,Inserting into table with an Identity column w...,I have a table A_tbl in my database. I have cr...,"[sql, sql-server, database, ssms, database-rep...","[insert, table, identity, column, replication,...","[insert, table, identity, column, replication,...","[insert, table, column, replication, cause, er...","[table, database, create, trigger, capture, in..."


(4767, 8)


## Classic ML models


### knn


In [7]:
# Often gives good results if enough data
# Accepts basically any input, as long as it is numerical

# => Perfect for testing different embeddings !


#### dummy knn : il copie sur le + proche voisin


In [8]:
def predict_tags_using_dummy_knn(df, feature, target, k=1, exemple=None):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = df[target].values

    # Initialize kNN model
    knn_model = KNeighborsRegressor(n_neighbors=k, metric='cosine', algorithm='brute')
    knn_model.fit(dense_matrix, target_values)

    # Example query
    query_document = exemple
    query_bow = gensim_dictionary.doc2bow(query_document)
    query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

    # Find nearest neighbors
    _, indices = knn_model.kneighbors(query_vector)

    # Aggregate tags from neighbors
    neighbor_tags = [tag for i in indices.flatten() for tag in df.iloc[i][target]]

    print(neighbor_tags)

    # Predict tags based on most common tags among neighbors
    predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(n=10)]
    # 5 tags/question en moyenne mais on peut suggérer +
    # ici a ameliorer

    return predicted_tags, knn_model


exemple = ["your", 'text', 'document', 'javascript']
# Call the function with your DataFrame and the desired text feature and target tags
predicted_tags, knn_test = predict_tags_using_dummy_knn(train, 'title_nltk', 'all_tags', exemple=exemple)
print(predicted_tags, '\n')

# javascript ok


['javascript', 'internet-explorer', 'class', 'internet-explorer-8', 'classname']
['javascript', 'internet-explorer', 'class', 'internet-explorer-8', 'classname'] 



#### knn basic


In [9]:
# add grid search cv
# add score

def predict_tags_using_knn(df, feature, target, k=50, exemple=None):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = df[target].values

    # Initialize kNN model
    knn_model = KNeighborsRegressor(n_neighbors=k, metric='cosine', algorithm='brute')
    knn_model.fit(dense_matrix, target_values)

    # Example query
    query_document = exemple
    query_bow = gensim_dictionary.doc2bow(query_document)
    query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

    # Find nearest neighbors
    _, indices = knn_model.kneighbors(query_vector)

    # Aggregate tags from neighbors
    neighbor_tags = [tag for i in indices.flatten() for tag in df.iloc[i][target]]

    print(neighbor_tags)

    # Predict tags based on most common tags among neighbors
    predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(n=10)]
    # 5 tags/question en moyenne mais on peut suggérer +
    # ici a ameliorer

    return predicted_tags

exemple = ["your", 'text', 'document', 'javascript']
# Call the function with your DataFrame and the desired text feature and target tags
predicted_tags = predict_tags_using_knn(train, 'title_nltk', 'all_tags', exemple=exemple)
print(predicted_tags, '\n')

# javascript ok


['c#', '.net', 'ms-word', 'openxml', 'openxml-sdk', 'javascript', 'internet-explorer', 'class', 'internet-explorer-8', 'classname', 'java', 'javascript', 'ajax', 'selenium', 'htmlunit-driver', 'c#', '.net', 'windows', 'f#', 'console', 'javascript', 'jquery', 'jquery-plugins', 'text-to-speech', 'html5-audio', 'javascript', 'html', 'css', 'text', 'truncate', 'javascript', 'jquery', 'css', 'dom', 'document', 'javascript', 'html', 'string', 'text', 'extract', 'javascript', 'dom', 'substring', 'indexof', 'getselection', 'javascript', 'html', 'function', 'text', 'onclick', 'c#', '.net', 'html', 'pdf', 'extract', 'javascript', 'jquery', 'css', 'copy', 'cut', 'javascript', 'html', 'url', 'base64', 'data-uri', 'python', 'module', 'preprocessor', 'nlp', 'stemming', 'javascript', 'php', 'jquery', 'curl', 'http-headers', 'python', 'text', 'replace', 'ms-word', 'python-docx', 'c#', 'javascript', 'html', 'http', 'dom', 'javascript', 'jquery', 'ruby-on-rails', 'tdd', 'jasmine', 'ios', 'swift', 'heigh

In [10]:
exemple = ["your", 'text', 'document', 'python']
# Call the function with your DataFrame and the desired text feature and target tags
predicted_tags = predict_tags_using_knn(train, 'title_nltk', 'all_tags', exemple=exemple)
print(predicted_tags, '\n')


['python', 'module', 'preprocessor', 'nlp', 'stemming', 'c#', '.net', 'ms-word', 'openxml', 'openxml-sdk', 'python', 'parsing', 'text', 'file-io', 'python-2.7', 'python-2.7', 'ubuntu', 'python-3.x', 'spatial-index', 'r-tree', 'python', 'plot', 'tree', 'data-visualization', 'visualization', 'c#', '.net', 'windows', 'f#', 'console', 'python', 'html', 'web-scraping', 'text', 'beautifulsoup', 'python', 'python-3.x', 'algorithm', 'sorting', 'mergesort', 'python', 'python-2.7', 'reflection', 'delegation', 'message-passing', 'python', 'python-3.x', 'annotations', 'lint', 'type-hinting', 'python', 'pdf', 'python-3.7', 'pypdf', 'pdf-extraction', 'python', 'selenium', 'selenium-webdriver', 'xpath', 'webdriverwait', 'python', 'macos', 'python-3.x', 'sublimetext2', 'sublimetext', 'python-3.x', 'pdf', 'text', 'extract', 'pdfminer', 'python', 'documentation', 'python-3.7', 'docstring', 'python-dataclasses', 'python', 'text', 'stemming', 'plural', 'singular', 'nlp', 'cluster-analysis', 'data-mining',

In [11]:
exemple1 = test['title_nltk'][0]
print(exemple1)
# Call the function with your DataFrame and the desired text feature and target tags
predicted_tags1 = predict_tags_using_knn(train, 'title_nltk', 'all_tags', exemple=exemple1)
print(predicted_tags1, '\n')

# firebase peut etre predit
# grand succes !


['find', 'class', 'com', 'google', 'firebase', 'provider']
['java', 'spring', 'rest', 'gradle', 'spring-boot', 'java', 'android', 'firebase', 'gradle', 'android-gradle-plugin', 'android', 'android-studio', 'firebase', 'android-gradle-plugin', 'google-play-services', 'android', 'android-intent', 'arraylist', 'unmarshalling', 'parcelable', 'php', 'class', 'laravel', 'alias', 'autoloader', 'php', 'sql', 'laravel', 'laravel-5', 'laravel-artisan', 'java', 'spring', 'spring-boot', 'spring-security', 'spring-security-oauth2', 'java', 'maven', 'maven-2', 'maven-3', 'protocol-buffers', 'android', 'google-maps', 'dictionary', 'android-mapview', 'inflate', 'android', 'android-studio', 'flutter', 'sdk', 'android-sdk-manager', 'java', 'spring', 'junit', 'spring-boot', 'spring-data', 'php', 'laravel', 'https', 'laravel-valet', 'valet', 'spring', 'maven', 'spring-mvc', 'spring-boot', 'spring-profiles', 'json', 'angular', 'typescript', 'jwt', 'guard', 'c#', '.net', 'visual-studio-2012', 'compression',

In [12]:
exemple2 = test['title_nltk'][1]
print(exemple2)
# Call the function with your DataFrame and the desired text feature and target tags
predicted_tags2 = predict_tags_using_knn(train, 'title_nltk', 'all_tags', exemple=exemple2)
print(predicted_tags2, '\n')

# scale ok


['get', 'lang', 'noclassdeffounderror', 'scala', 'run', 'code']
['c#', '.net', 'wpf', 'code-behind', 'itemspanel', 'java', 'gradle', 'spring-boot', 'jar', 'build.gradle', 'reactjs', 'authentication', 'google-authentication', 'google-api-js-client', 'googleauthr', 'c', 'gcc', 'types', 'openmp', 'typeof', 'php', 'mysql', 'laravel', 'ubuntu', 'server', 'iphone', 'ios', 'xamarin.ios', 'http-response-codes', 'nsurlconnectiondelegate', 'c++', 'python', 'ctypes', 'cython', 'boost-python', 'scala', 'maven', 'apache-spark', 'noclassdeffounderror', 'spark-streaming', 'macos', 'shell', 'scala', 'terminal', 'installation', 'javascript', 'vue.js', 'visual-studio-code', 'nuxt.js', 'prettier', 'c#', 'multithreading', 'winforms', 'backgroundworker', 'infinite-loop', 'asp.net-core', 'oauth', 'identityserver4', 'openid-connect', 'asp.net-core-3.0', 'c++', 'c', 'cuda', 'parallel-processing', 'gpu', 'javascript', 'node.js', 'express', 'firebase', 'firebase-realtime-database', 'python', 'selenium', 'http',

#### scores


In [None]:
# Notre objectif de prédiction de tags ressemble a un pb de classification multi-label,
# où la matrice de confusion est extrêmement déséquilibrée :
# 5 tags sont prédits positifs, contre environ 250 000 tags (si on travaille sur all_tags)
# predits negatifs. Autrement dit :

# On peut utiliser la precision pour évaluer notre modèle. C'est même exactement l'outil qu'il nous faut :
# "précision = la proportion de prédictions correctes parmi les points que l’on a prédits positifs."
# En + c de loin le plus léger en ressources, puisqu'il ne s'occupe que des 5 tags prédits.

# En revanche je pense que le recall n'a pas vraiment de sens ici, il sera "écrasé" par
# le nombre de tags predits negatifs, sa valeur sera tjs très proche de zero.
# (même remarque pour la spécificité et l'accuracy)
# Et sans recall, pas de f1 score.
# à vérifier


In [13]:
def precision_topics(real_tags:list, predicted_tags:list): # pour comparer 2 listes
    # precision = TP / (TP + FP)
    tp = 0
    for predicted_tag in predicted_tags:
        if predicted_tag in real_tags:
            tp += 1

    fp = len(predicted_tags) - tp
    precision = tp/(tp + fp)
    # <=> precision = tp/len(predicted_tags)

    return precision


print(precision_topics(exemple1, predicted_tags1))
precision_topics(exemple2, predicted_tags2)

# ok


0.1


0.1

In [14]:
def precision_score(y_true, y_pred): # pour comparer 2 df ou 2 matrices de mm shape[0]
    precision = 0
    for i in range(0, len(y_pred)):
        precision += precision_topics(y_true[i], y_pred[i])
    precision_moyenne = precision / len(y_pred)

    return precision_moyenne

custom_precision_scorer = make_scorer(precision_score, greater_is_better=True)


In [15]:
def recall_topics(all_tags: list, predicted_tags: list):
    # recall = TP / (TP + FN)
    tp = 0
    fn = 0
    for real_tag in all_tags:
        if real_tag in predicted_tags:
            tp += 1
        else:
            fn += 1
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    return recall


def f1_topics(real_tags: list, predicted_tags: list, all_tags:list):
    precision = precision_score(real_tags, predicted_tags)
    recall = recall_topics(all_tags, predicted_tags)

    # F1 score = 2 * (precision * recall) / (precision + recall)
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1


def compute_score(y_true, y_pred, scorer=precision_score):
    score = 0
    for i in range(0, len(y_pred)):
        score += scorer(y_true[i], y_pred[i])
    score_moyen = score / len(y_pred)

    return score_moyen


def score_tag_accuracy(real_tags: list, predicted_tags: list):
    # accuracy = (TP + TN) / (TP + TN + FP + FN)
    tp = sum(1 for tag in predicted_tags if tag in real_tags)
    fp = sum(1 for tag in predicted_tags if tag not in real_tags)
    fn = sum(1 for tag in real_tags if tag not in predicted_tags)

    accuracy = (tp + fn) / (tp + fp + fn) if (tp + fp + fn) > 0 else 0

    return accuracy

# On va probablement utiliser la precision


In [19]:
# add grid search cv
# add recall, f1 score ?

def predict_tags_using_knn(train_df=train, feature='title_nltk', target='all_tags', test_df=test, k=50):
    # 1 PREPROCESSING
    documents = train_df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    print(len(gensim_dictionary))
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]
    print(len(corpus))
    # taille corpus ?

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T
    # taille matrice ? afficher
    print(dense_matrix.shape, '\n')
    # pas tres dense ici, c notre bow donc tres sparse en fait
    # curieux d'appeler "corpus2dense()" une fonction qui retourne une matrice sparse
    pprint(dense_matrix[:10]) # vraiment tres dense, quasiment que des 0 ! Bref
    print('\n')

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = train_df[target].values

    # 2 MODEL TRAINING
    knn_model = KNeighborsRegressor(n_neighbors=k, metric='cosine', algorithm='brute')
    knn_model.fit(dense_matrix, target_values)

    # 3 PREDICTION
    # Predictions completes en 1h ou 2
    # optimiser avec pandarallel ?
    # use a sample en attendant
    predictions=[]
    min_range=3000
    max_range=3005 # test.shape[0]=4767
    for i in range(min_range, max_range):
        query_document = test_df[feature][i]
        print(f'doc {i} : {query_document}')
        print(f'real tags : {test[target][i]}')
        query_bow = gensim_dictionary.doc2bow(query_document)
        query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

        # Find nearest neighbors
        _, indices = knn_model.kneighbors(query_vector)

        # Aggregate tags from neighbors
        neighbor_tags = [tag for i in indices.flatten() for tag in train_df.iloc[i][target]]

        # Predict tags based on most common tags among neighbors
        predicted_tags = [tag for tag, _ in Counter(neighbor_tags).most_common(n=5)]
        # 5 tags/question en moyenne mais on peut suggérer +
        predictions.append(predicted_tags)
        print(f'predicted : {predicted_tags}')
        print(precision_topics(test_df[target][i], predicted_tags), '\n')

    true_tags = [tags for tags in test_df[target][min_range:max_range]]

    mean_precision = precision_score(true_tags, predictions)
    precis_moy = compute_score(true_tags, predictions, scorer=precision_topics)
    print(f'precision moyenne = {mean_precision}')
    print(f'precision moyenne = {precis_moy}')

    return predictions


# Call the function with your DataFrame and the desired text feature and target tags
predict_tags_using_knn()

# 0.34 de precision ?? c enorme !


5869
42898
(42898, 5869) 

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)


doc 3000 : ['tell', 'cmake', 'clang', 'window']
real tags : ['c++', 'windows', 'build', 'cmake', 'clang']
predicted : ['c++', 'cmake', 'windows', 'clang', 'c']
0.8 

doc 3001 : ['round', 'corner']
real tags : ['android', 'material-design', 'bottom-sheet', 'material-components-android', 'material-components']
predicted : ['android', 'ios', 'rounding', 'android-layout', 'swift']
0.2 

doc 3002 : ['future', 'cause', 'memory', 'leak']
real tags : ['java', 'android', 'multithreading', 'memory-leaks', 'future']
predicted : ['memory-leaks', 'memory', 'ios', 'javascript', 'c#']
0.2 

doc 3003 : ['pytorch', 'work']
real tags : ['python', 'numpy', 'linear-algebra', 'pytorch', 'array-broadcasting']
predicted : ['java

In [16]:
# test

from sklearn.preprocessing import MultiLabelBinarizer

def predict_tags_using_rf(train_df=train[::100], feature='title_nltk', target='all_tags', test_df=test, n_estimators=50):
    documents = train_df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    # Convert multi-label tags into binary format
    mlb = MultiLabelBinarizer()
    y_encoded = mlb.fit_transform(train_df[target])

    # Fit Random Forest model
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    rf_model.fit(dense_matrix, y_encoded)

    # Predictions
    predictions = []
    min_range = 3000
    max_range = 3005
    for i in range(min_range, max_range):
        query_document = test_df[feature][i]
        print(f'doc {i} : {query_document}')
        print(f'real tags : {test[target][i]}')
        query_bow = gensim_dictionary.doc2bow(query_document)
        query_vector = corpus2dense([query_bow], num_terms=len(gensim_dictionary)).T

        # Predict tags using Random Forest
        prediction_encoded = rf_model.predict(query_vector.reshape(1, -1))

        # Convert back to original tag format
        predicted_tags = mlb.inverse_transform(prediction_encoded.reshape(1, -1))

        predictions.append(predicted_tags)
        print(f'predicted : {predictions[-1]}', '\n')

    return predictions

# Call the function with your DataFrame and the desired text feature and target tags
predictions_rf = predict_tags_using_rf()


doc 3000 : ['tell', 'cmake', 'clang', 'window']
real tags : ['c++', 'windows', 'build', 'cmake', 'clang']


ValueError: Expected only 0s and 1s in label indicator. Also got [0.02 0.04 0.06 0.08 0.1  0.12 0.14 0.2  0.34 0.36 0.52]

In [None]:
# pourquoi on ne peut pas utiliser le score precision sckikit ici :
from sklearn.metrics import precision_score

# Assuming y_true is the ground truth (real tags) and y_pred is the predicted tags
precision = precision_score(['ok', 'ko'], ['ko', 'ok'], average='micro')  # You can use 'micro', 'macro', or 'weighted' depending on your use case
print(f'Precision: {precision}')


In [None]:
# Assuming y_true is the ground truth (real tags) and y_pred is the predicted tags
precision = precision_score([1, 0], [0, 1], average='micro')  # You can use 'micro', 'macro', or 'weighted' depending on your use case

print(f'Precision: {precision}')

### opti


In [None]:
# Define the plotting function
def plot_performance_vs_neighbors(grid_search):
    # Extract the results from the GridSearchCV object
    results = grid_search.cv_results_

    # Extract the parameters and scores for both uniform and distance weights
    params_uniform = [param for param in results['params'][::2]]
    params_distance = [param for param in results['params'][1::2]]
    test_scores_uniform = results['mean_test_r2'][::2]
    test_scores_distance = results['mean_test_r2'][1::2]

    # Extract the parameter values for uniform and distance weights
    n_neighbors_uniform = [param['knn_regressor__n_neighbors'] for param in params_uniform]
    n_neighbors_distance = [param['knn_regressor__n_neighbors'] for param in params_distance]

    # Create separate plots for uniform and distance weights
    plt.figure(figsize=(12, 6))

    # Plot for uniform weight
    plt.subplot(1, 2, 1)
    plt.plot(n_neighbors_uniform, test_scores_uniform, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Uniform Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    # Plot for distance weight
    plt.subplot(1, 2, 2)
    plt.plot(n_neighbors_distance, test_scores_distance, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Distance Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    plt.tight_layout()
    plt.show()


def pipe_knn(df=train, feature='title_nltk', target='all_tags', embedding='bow_dense', metric='cosine', graph=True):

    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Convert Gensim corpus to dense matrix
    dense_matrix = corpus2dense(corpus, num_terms=len(gensim_dictionary)).T

    # Ici on ne va pas demander au knn de faire de prediction,
    # On veut juste qu'il trouve les voisins.
    # Mais la fonction fit a besoin de targets en param
    target_values = df[target].values

    # Create a KNN Regressor
    knn_regressor = KNeighborsRegressor(metric=metric)

    # Create a pipeline with preprocessing and a knn regressor, to simplify gridsearch
    pipe = Pipeline(steps=[
        ("knn_regressor", knn_regressor)
    ])

    # Define hyperparameters and their possible values for grid search
    param_grid = {
        'knn_regressor__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'knn_regressor__weights': ['uniform', 'distance']
    }

    # Create the GridSearchCV object with multiple scoring metrics
    # scoring = {'neg_mean_squared_error': 'neg_mean_squared_error', 'r2': 'r2'}
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                            scoring=make_scorer(score_tag_f1), cv=5, verbose=1) # add, refit='f1' for multiple scoring

    # Fit the GridSearchCV object to your training data to perform hyperparameter tuning
    grid_search.fit(dense_matrix, target_values)

    # Access the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Create the KNN regressor with the best hyperparameters
    best_knn_regressor = KNeighborsRegressor(metric=metric,
                                             n_neighbors=best_params['knn_regressor__n_neighbors'],
                                             weights=best_params['knn_regressor__weights'])

    # Create a pipeline with the preprocessor and the tuned knn regressor
    pipeline_with_tuned_knn = Pipeline(steps=[
        ("knn_regressor", best_knn_regressor)  # Use the tuned neighbor and weight values here
    ])

    # Perform cross-validation (on training set) and display the scores for each split
    # scoring = ['r2', 'neg_mean_squared_error']
    cv_scores = cross_validate(pipeline_with_tuned_knn, dense_matrix, target_values, cv=5, scoring=make_scorer(score_tag_f1))
    # print("Cross-Validation Scores (training):", '\n', cv_scores)
    print("Cross-Validation Scores:")
    pprint(cv_scores)


pipe_knn()


In [None]:
    for i, score in enumerate(cv_scores['test_r2']):
        print(f"Split {i+1} : r2 = {score}")

    r2_val = cv_scores['test_r2'].mean()
    mse_val = -cv_scores['test_neg_mean_squared_error'].mean()
    rmse_val = np.sqrt(mse_val)

    # fit model on training set
    pipeline_with_tuned_knn.fit(dense_matrix, target_values)
    # Make no predictions here

    # Calculate scores on training
    f1 = score_tag_f1(train['all_tags'], pipeline_with_tuned_knn.predict(dense_matrix))
    # and testing set
    r2_test, rmse_test = score_tag_f1(y_test, y_pred)

    # Display results
    print(f"R-squared (val) =  {r2_val}")
    print(f"R-squared (train) =  {r2_train}")
    print(f"R-squared (test) =  {r2_test}")
    print(f"RMSE (val) =  {rmse_val}")
    print(f"RMSE (train) =  {rmse_train}")
    print(f"RMSE (test) =  {rmse_test}" '\n')

    # display results/error as graph on first iteration (if asked to)
    if alea == 0 and graph:
        plot_performance_vs_neighbors(grid_search)
        plot_predictions(r2_train, r2_test, y_pred, y_test, kind='actual_vs_predicted', y=y)
        # plot_predictions(r2_train, r2_test, y_pred, y_test, kind='residual_vs_predicted', y=y)

    # Return scores for this random state
    return r2_val, rmse_val, r2_train, rmse_train, r2_test, rmse_test, time_fit, time_predict


def test_knn_n_times(y=y_E, scaler=robust, graph=False, metric='euclidean'):
    print(f'Modèle : kNN')
    print('target : ', y.name)

    results_r2_val, results_rmse_val,  results_r2_train, results_rmse_train = [], [], [], []
    results_r2_test, results_rmse_test, results_time_fit, results_time_predict = [], [], [], []

    for n in range(nb_iter):
        print('Iteration ', n+1)
        r2_val, rmse_val, r2_train, rmse_train, r2_test, rmse_test, time_fit, time_predict = pipe_knn(alea=n,
                                                                                      y=y,
                                                                                      scaler=scaler,
                                                                                      graph=graph,
                                                                                      metric=metric)
        results_r2_val.append(r2_val)
        results_rmse_val.append(rmse_val)
        results_r2_train.append(r2_train)
        results_rmse_train.append(rmse_train)
        results_r2_test.append(r2_test)
        results_rmse_test.append(rmse_test)
        results_time_fit.append(time_fit)
        results_time_predict.append(time_predict)

    # Calculate means and std devs
    r2_val_moy = np.mean(results_r2_val)
    rmse_val_moy = np.mean(results_rmse_val)
    r2_train_moy = np.mean(results_r2_train)
    rmse_train_moy = np.mean(results_rmse_train)
    r2_test_moy = np.mean(results_r2_test)
    rmse_test_moy = np.mean(results_rmse_test)
    time_fit_moy = np.mean(results_time_fit)
    time_predict_moy = np.mean(results_time_predict)

    r2_val_std = np.std(results_r2_val)
    rmse_val_std = np.std(results_rmse_val)
    r2_train_std = np.std(results_r2_train)
    rmse_train_std = np.std(results_rmse_train)
    r2_test_std = np.std(results_r2_test)
    rmse_test_std = np.std(results_rmse_test)
    time_fit_std = np.std(results_time_fit)
    time_predict_std = np.std(results_time_predict)

    # Mise en forme
    results = {'model': 'kNN',
               'set': dataset,
               'scaler': scaler,
               'target': y.name,
               'r2_test_moy': r2_test_moy,
               'r2_test_std': r2_test_std,
               'rmse_test_moy': rmse_test_moy,
               'rmse_test_std': rmse_test_std,
               'r2_train_moy': r2_train_moy,
               'r2_train_std': r2_train_std,
               'rmse_train_moy': rmse_train_moy,
               'rmse_train_std': rmse_train_std,
               'r2_val_moy': r2_val_moy,
               'r2_val_std': r2_val_std,
               'rmse_val_moy': rmse_val_moy,
               'rmse_val_std': rmse_val_std,
               'time_fit_moy': time_fit_moy,
               'time_fit_std': time_fit_std,
               'time_predict_moy': time_predict_moy,
               'time_predict_std': time_predict_std,
               }

    print(results, '\n')

    # Append a new row for this model
    model_results.append(results)

test_knn_n_times(scaler=robust, graph=True)
# test_knn_n_times(y=y_EI, scaler=robust, graph=True)

affichage_results()

# 0.4, ce qui est bien, mais pas top.
# dataset trop petit pour un knn ? (relativement peu d'individus)


In [None]:

def suggest_topics_using_knn(df, feature, alea=42):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = gensim_dictionary[0]  # This is only to "load" the dictionary.
    id2word = gensim_dictionary.id2token

    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every, random_state=alea)

    top_topics = model.top_topics(corpus, topn=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    # = umass if same topn (default 20)
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    # Compute Coherence Score (Umass)
    coherence_umass = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='u_mass')
    coherence_lda_umass = coherence_umass.get_coherence()
    print('u_mass Coherence Score: %.4f.' % coherence_lda_umass)

    # Compute Coherence Score (cv)
    coherence_cv = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_v')
    coherence_lda_cv = coherence_cv.get_coherence()
    print('c_v Coherence Score: %.4f.' % coherence_lda_cv)

    # Compute Coherence Score (npmi)
    coherence_npmi = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_npmi')
    coherence_lda_npmi = coherence_npmi.get_coherence()
    print('c_npmi Coherence Score: %.4f.' % coherence_lda_npmi)

    # Perplexity is not a coherence score but a measure of how well the model predicts a sample.
    # A lower perplexity indicates better model performance.
    perplexity = model.log_perplexity(corpus)
    print('Perplexity: %.4f.' % perplexity)

    # Visualize the topics
    vis_data = gensimvis.prepare(model, corpus, gensim_dictionary)
    display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'artifacts/lda_vis.html')

    pprint(top_topics)
    # to print all topics
    # pprint(model.print_topics())

    return model, corpus, gensim_dictionary

lda_test, corpus_test, dict_test = suggest_topics_using_LDA(train, 'title_nltk')

In [None]:
# Define the plotting function
def plot_performance_vs_neighbors(grid_search):
    # Extract the results from the GridSearchCV object
    results = grid_search.cv_results_

    # Extract the parameters and scores for both uniform and distance weights
    params_uniform = [param for param in results['params'][::2]]
    params_distance = [param for param in results['params'][1::2]]
    test_scores_uniform = results['mean_test_r2'][::2]
    test_scores_distance = results['mean_test_r2'][1::2]

    # Extract the parameter values for uniform and distance weights
    n_neighbors_uniform = [param['knn_regressor__n_neighbors'] for param in params_uniform]
    n_neighbors_distance = [param['knn_regressor__n_neighbors'] for param in params_distance]

    # Create separate plots for uniform and distance weights
    plt.figure(figsize=(12, 6))

    # Plot for uniform weight
    plt.subplot(1, 2, 1)
    plt.plot(n_neighbors_uniform, test_scores_uniform, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Uniform Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    # Plot for distance weight
    plt.subplot(1, 2, 2)
    plt.plot(n_neighbors_distance, test_scores_distance, marker='o', linestyle='-')
    plt.title("Performance vs. Number of Neighbors (Distance Weight)")
    plt.xlabel("Number of Neighbors")
    plt.ylabel("Mean Test R-squared")
    plt.grid(True)

    plt.tight_layout()
    plt.show()


def pipe_knn(alea, y, scaler, graph, metric):

    # Séparation des jeux de données entrainement / validation, preprocessing
    X_train, X_test, y_train, y_test, preprocessor = preprocessing(y, alea=alea, test_size=test_size, \
                                                                    scaler=scaler)
    # Create a KNN Regressor
    knn_regressor = KNeighborsRegressor(metric=metric)

    # Create a pipeline with preprocessing and a knn regressor
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("knn_regressor", knn_regressor)
    ])

    # Define hyperparameters and their possible values for grid search
    param_grid = {
        'knn_regressor__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'knn_regressor__weights': ['uniform', 'distance']
    }

    # Create the GridSearchCV object with multiple scoring metrics
    scoring = {'neg_mean_squared_error': 'neg_mean_squared_error', 'r2': 'r2'}
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                            scoring=scoring, cv=5, refit='r2', verbose=1)

    # Fit the GridSearchCV object to your training data to perform hyperparameter tuning
    grid_search.fit(X_train, y_train)

    # Access the best hyperparameters
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    # Create the KNN regressor with the best hyperparameters
    best_knn_regressor = KNeighborsRegressor(metric=metric,
                                             n_neighbors=best_params['knn_regressor__n_neighbors'],
                                             weights=best_params['knn_regressor__weights'])

    # Create a pipeline with the preprocessor and the tuned knn regressor
    pipeline_with_tuned_knn = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("knn_regressor", best_knn_regressor)  # Use the tuned neighbor and weight values here
    ])

    # Perform cross-validation (on training set) and display the scores for each split
    scoring = ['r2', 'neg_mean_squared_error']
    cv_scores = cross_validate(pipeline_with_tuned_knn, X_train, y_train, cv=5, scoring=scoring)
    # print("Cross-Validation Scores (training):", '\n', cv_scores)
    print("Cross-Validation Scores:")
    for i, score in enumerate(cv_scores['test_r2']):
        print(f"Split {i+1} : r2 = {score}")

    r2_val = cv_scores['test_r2'].mean()
    mse_val = -cv_scores['test_neg_mean_squared_error'].mean()
    rmse_val = np.sqrt(mse_val)

    # fit model on training set
    time_fit = fit_and_timeit(pipeline_with_tuned_knn, X_train, y_train)
    # Make predictions
    y_pred, time_predict = predict_and_timeit(pipeline_with_tuned_knn, X_test)

    # Calculate scores on training
    r2_train, rmse_train = calcul_scores(y_train, pipeline_with_tuned_knn.predict(X_train))
    # and testing set
    r2_test, rmse_test = calcul_scores(y_test, y_pred)

    # Display results
    print(f"R-squared (val) =  {r2_val}")
    print(f"R-squared (train) =  {r2_train}")
    print(f"R-squared (test) =  {r2_test}")
    print(f"RMSE (val) =  {rmse_val}")
    print(f"RMSE (train) =  {rmse_train}")
    print(f"RMSE (test) =  {rmse_test}" '\n')

    # display results/error as graph on first iteration (if asked to)
    if alea == 0 and graph:
        plot_performance_vs_neighbors(grid_search)
        plot_predictions(r2_train, r2_test, y_pred, y_test, kind='actual_vs_predicted', y=y)
        # plot_predictions(r2_train, r2_test, y_pred, y_test, kind='residual_vs_predicted', y=y)

    # Return scores for this random state
    return r2_val, rmse_val, r2_train, rmse_train, r2_test, rmse_test, time_fit, time_predict


## Deep learning Models
