# **Catégorisez automatiquement des questions**

## partie 3/8 : Prédiction de tags, approche non-supervisée

### <br> Proposition de mots clés, de type LDA avec visualisation en 2D des topics

<br>


## Importation des librairies, réglages


In [128]:
import os, sys, random
import ast
# from zipfile import ZipFile
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from pprint import pprint

# Visualisation
import matplotlib.pyplot as plt
# import seaborn as sns
import plotly.express as px

# Feature engineering
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models.ldamulticore import LdaMulticore

from sklearn.model_selection import GridSearchCV

print('\nPython version ' + sys.version)
print('pyLDAvis version ' + pyLDAvis.__version__)

# Modify if necessary
num_cores = os.cpu_count()
print(f"\nNumber of CPU cores: {num_cores}")
pandarallel.initialize(progress_bar=False, nb_workers=6)



Python version 3.11.4 (main, Jul  5 2023, 14:15:25) [GCC 11.2.0]
pyLDAvis version 3.4.0

Number of CPU cores: 8
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Fonctions


In [129]:
def get_missing_values(df):
    """Generates a DataFrame containing the count and proportion of missing values for each feature.

    Args:
        df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
        pandas.DataFrame: A DataFrame with columns for the feature name, count of missing values,
        count of non-missing values, proportion of missing values, and data type for each feature.
    """
    # Count the missing values for each column
    missing = df.isna().sum()

    # Calculate the percentage of missing values
    percent_missing = df.isna().mean() * 100

    # Create a DataFrame to store the results
    missings_df = pd.DataFrame({
        'column_name': df.columns,
        'missing': missing,
        'present': df.shape[0] - missing,  # Count of non-missing values
        'percent_missing': percent_missing.round(2),  # Rounded to 2 decimal places
        'type': df.dtypes
    })

    # Sort the DataFrame by the count of missing values
    missings_df.sort_values('missing', inplace=True)

    return missings_df

# with pd.option_context('display.max_rows', 1000):
#   display(get_missing_values(df))


def quick_look(df, miss=True):
    """
    Display a quick overview of a DataFrame, including shape, head, tail, unique values, and duplicates.

    Args:
        df (pandas.DataFrame): The input DataFrame to inspect.
        check_missing (bool, optional): Whether to check and display missing values (default is True).

    The function provides a summary of the DataFrame, including its shape, the first and last rows, the count of unique values per column, and the number of duplicates.
    If `check_missing` is set to True, it also displays missing value information.
    """
    print(f'shape : {df.shape}')

    display(df.head())
    display(df.tail())

    print('uniques :')
    display(df.nunique())

    print('Doublons ? ', df.duplicated(keep='first').sum(), '\n')

    if miss:
        display(get_missing_values(df))



### import


In [130]:
# import

train = pd.read_csv('./../data/cleaned_data/train_bow_uniques.csv', sep=',')
test = pd.read_csv('./../data/cleaned_data/test_bow_uniques.csv', sep=',')

quick_look(train)


shape : (42898, 8)


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
0,2019-06-05 15:13:02,How to use memset while handling strings in C++?,I am from Python background and recently learn...,"['c++', 'initialization', 'c-strings', 'string...","['memset', 'handle', 'string']","['memset', 'handle', 'string', 'python', 'back...","['use', 'memset', 'handle', 'string']","['background', 'learn', 'function', 'memset', ..."
1,2018-10-31 12:35:02,How to correct spelling in google docs using k...,I would like to be able to replace a misspelle...,"['gmail', 'keyboard-shortcuts', 'google-docs',...","['correct', 'spell', 'google', 'doc', 'keyboar...","['correct', 'spell', 'google', 'doc', 'shortcu...","['correct', 'spelling', 'keyboard', 'shortcut']","['like', 'replace', 'word', 'recommend', 'corr..."
2,2020-09-19 10:40:23,live server vscode on another computer,I have 2 computers. when I open the project wi...,"['visual-studio-code', 'server', 'localhost', ...","['server', 'vscode', 'computer']","['server', 'vscode', 'computer', 'open', 'proj...","['server', 'vscode', 'computer']","['computer', 'open', 'project', 'server', 'url..."
3,2012-10-23 16:47:04,django ajax post 403 forbidden,using django 1.4 im getting a 403 error when i...,"['javascript', 'ajax', 'django', 'http-post', ...","['django', 'ajax', 'forbidden']","['django', 'ajax', 'get', 'error', 'try', 'pos...",['forbid'],"['django', 'error', 'try', 'post', 'javascript..."
4,2019-04-21 16:10:24,Listen to changes and reload container on code...,I am using docker-compose in visual studio 201...,"['angular', 'visual-studio', 'docker', 'docker...","['listen', 'change', 'reload', 'container', 'c...","['listen', 'change', 'reload', 'container', 'c...","['listen', 'change', 'reload', 'container', 'c...","['docker', 'compose', 'studio', 'window', 'run..."


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
42893,2017-02-23 11:34:31,Do we need clear MDC after HTTP request in Spring,According to this answer thread local variable...,"['java', 'spring', 'logging', 'log4j', 'logback']","['need', 'mdc', 'request', 'spring']","['need', 'mdc', 'request', 'spring', 'accord',...","['need', 'request']","['accord', 'answer', 'thread', 'variable', 'us..."
42894,2011-10-13 20:57:32,How to make i18n with Handlebars.js (mustache ...,I'm currently using Handlebars.js (associated ...,"['javascript', 'jquery', 'internationalization...","['make', 'i18n', 'handlebar', 'template']","['make', 'i18n', 'handlebar', 'template', 'ass...",['template'],"['associate', 'web', 'app', 'client', 'render'..."
42895,2012-09-06 00:16:46,How can I make R read my environmental variables?,I am running R on EC2 spot instances and I nee...,"['linux', 'r', 'ubuntu', 'amazon-ec2', 'enviro...","['make', 'read', 'variable']","['make', 'read', 'variable', 'run', 'spot', 'i...","['read', 'variable']","['run', 'spot', 'instance', 'need', 'terminate..."
42896,2021-03-23 03:50:50,How to prevent react-query from fetching initi...,I'm using react-query v3.13 to fetch data from...,"['javascript', 'reactjs', 'fetch', 'react-quer...","['prevent', 'query', 'fetch', 'enable']","['prevent', 'query', 'fetch', 'enable', 'data'...","['prevent', 'react', 'query', 'fetch', 'enable']","['react', 'query', 'fetch', 'datum', 'want', '..."
42897,2016-03-17 04:19:15,Inserting into table with an Identity column w...,I have a table A_tbl in my database. I have cr...,"['sql', 'sql-server', 'database', 'ssms', 'dat...","['insert', 'table', 'identity', 'column', 'rep...","['insert', 'table', 'identity', 'column', 'rep...","['insert', 'table', 'column', 'replication', '...","['table', 'database', 'create', 'trigger', 'ca..."


uniques :


CreationDate    42893
title           42897
body            42898
all_tags        41513
title_nltk      42171
body_nltk       42898
title_spacy     37346
body_spacy      42891
dtype: int64

Doublons ?  0 



Unnamed: 0,column_name,missing,present,percent_missing,type
CreationDate,CreationDate,0,42898,0.0,object
title,title,0,42898,0.0,object
body,body,0,42898,0.0,object
all_tags,all_tags,0,42898,0.0,object
title_nltk,title_nltk,0,42898,0.0,object
body_nltk,body_nltk,0,42898,0.0,object
title_spacy,title_spacy,0,42898,0.0,object
body_spacy,body_spacy,0,42898,0.0,object


In [131]:
# Tt semble OK, sauf les types : le fait d'exporter nos données a converti nos listes de tokens en str

def turn_str_back_into_list(df):
    """Correct the type change due to .csv export"""

    df['title_nltk'] = df['title_nltk'].apply(ast.literal_eval)
    df['body_nltk'] = df['body_nltk'].apply(ast.literal_eval)
    df['title_spacy'] = df['title_spacy'].apply(ast.literal_eval)
    df['body_spacy'] = df['body_spacy'].apply(ast.literal_eval)


turn_str_back_into_list(train)
turn_str_back_into_list(test)


In [132]:
# Vérif
train[['title_nltk', 'body_nltk', 'title_spacy', 'body_spacy']].map(len).describe()

# OK, pas de liste vide (min = 1)


Unnamed: 0,title_nltk,body_nltk,title_spacy,body_spacy
count,42898.0,42898.0,42898.0,42898.0
mean,4.409903,39.608653,3.311413,29.030444
std,1.732934,27.99795,1.619366,20.274162
min,1.0,2.0,1.0,1.0
25%,3.0,21.0,2.0,16.0
50%,4.0,32.0,3.0,24.0
75%,5.0,50.0,4.0,37.0
max,14.0,368.0,12.0,307.0


In [133]:
# quick_look(test)


In [134]:
test[['title_nltk', 'body_nltk', 'title_spacy', 'body_spacy']].map(len).describe()
# OK


Unnamed: 0,title_nltk,body_nltk,title_spacy,body_spacy
count,4767.0,4767.0,4767.0,4767.0
mean,4.395427,39.790854,3.284665,29.20516
std,1.716085,27.418228,1.620959,19.973006
min,1.0,3.0,1.0,1.0
25%,3.0,21.0,2.0,16.0
50%,4.0,32.0,3.0,24.0
75%,5.0,51.0,4.0,37.0
max,13.0,268.0,12.0,194.0


In [135]:
# Utile si nos inputs st sous forme de string
# mais il semble qu'on va plutôt conserver la liste de tokens au final

def fix_false_null_values(df):
    """
    Replace NaN values in specified columns with the string 'null'.
    ! USE ONLY AFTER VERIFYING that the NaN values are all the "null" string
    """
    df.loc[(df['title_nltk'].isna()), 'title_nltk'] = 'null'
    df.loc[(df['title_spacy'].isna()), 'title_spacy'] = 'null'


# fix_false_null_values(train)
# fix_false_null_values(test)

# Check for null values in the entire DataFrame
# null_values = train[train.isnull().any(axis=1)]

# Print the rows with null values
# print(null_values)


In [136]:
# quick_look(train)
# quick_look(test)


In [137]:
index = [4532, 8280, 12992, 14957, 22934, 24964, 25950]

display(train.loc[train.index.isin(index), :])

# OK


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
4532,2013-10-23 22:23:31,How to change type of id in Microsoft.AspNet.I...,"(ASP.NET MVC 5, EF6, VS2013)\nI'm trying to fi...","['asp.net-mvc', 'entity-framework', 'asp.net-m...","[change, type, aspnet, identity, entityframewo...","[change, type, aspnet, identity, entityframewo...","[change, type, identity]","[try, figure, change, type, field, string, int..."
8280,2014-06-20 18:46:09,"Bootstrap form input: prevent submit, but allo...",I've got the following problem: \nI use bootst...,"['javascript', 'jquery', 'html', 'forms', 'twi...","[bootstrap, form, input, submit, allow, checking]","[bootstrap, form, input, submit, allow, check,...","[bootstrap, form, input, prevent, submit, allo...","[problem, use, bootstrap, form, input, user, p..."
12992,2017-08-21 19:46:31,PySpark: org.apache.spark.sql.AnalysisExceptio...,"I'm trying to load Parquet data into PySpark, ...","['python', 'apache-spark', 'pyspark', 'apache-...","[pyspark, apache, spark, attribute, name, cont...","[pyspark, apache, spark, attribute, name, cont...","[org.apache.spark.sql, analysisexception, cont...","[try, load, datum, column, space, aliase, erro..."
14957,2018-08-08 12:58:34,How to break ForEach Loop in TypeScript,"I have a the below code, on which i am unable ...","['javascript', 'angular', 'typescript', 'forea...","[break, loop, typescript]","[break, loop, typescript, code, condition, fun...",[break],"[code, break, loop, condition, function, let, ..."
22934,2014-11-26 18:26:05,Python: Creating a 2D histogram from a numpy m...,"I'm new to python.\nI have a numpy matrix, of ...","['python', 'numpy', 'matrix', 'matplotlib', 'h...","[python, create, histogram, matrix]","[python, create, histogram, matrix, dimension,...","[python, create, histogram, matrix]","[python, matrix, dimension, value, range, want..."
24964,2011-11-28 02:41:21,SSRS line chart not connecting data points,I've looked high and low and can't seem to fin...,"['join', 'reporting-services', 'graph', 'chart...","[line, chart, connect, data, point]","[line, chart, connect, data, point, look, seem...","[line, chart, connect, datum, point]","[look, find, answer, appear, issue, think, lin..."
25950,2014-08-21 15:58:49,GS1 barcode parsing,We need to parse the GS1 datamatrix barcode wh...,"['parsing', 'barcode', 'datamatrix', 'gs1-data...","[barcode, parse]","[barcode, parse, need, provide, party, know, l...","[barcode, parse]","[need, parse, barcode, provide, party, know, u..."


## LDA


In [139]:
# add random state
# add grid search cv
# add other score ? silhouette ? ...

def suggest_topics_using_LDA(df, feature):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = gensim_dictionary[0]  # This is only to "load" the dictionary.
    id2word = gensim_dictionary.id2token

    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

    top_topics = model.top_topics(corpus, topn=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    # = umass if same topn (default 20)
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    # Compute Coherence Score (Umass)
    coherence_umass = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='u_mass')
    coherence_lda_umass = coherence_umass.get_coherence()
    print('u_mass Coherence Score: %.4f.' % coherence_lda_umass)

    # Compute Coherence Score (cv)
    coherence_cv = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_v')
    coherence_lda_cv = coherence_cv.get_coherence()
    print('c_v Coherence Score: %.4f.' % coherence_lda_cv)

    # Compute Coherence Score (npmi)
    coherence_npmi = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_npmi')
    coherence_lda_npmi = coherence_npmi.get_coherence()
    print('c_npmi Coherence Score: %.4f.' % coherence_lda_npmi)

    # Perplexity is not a coherence score but a measure of how well the model predicts a sample.
    # A lower perplexity indicates better model performance.
    perplexity = model.log_perplexity(corpus)
    print('Perplexity: %.4f.' % perplexity)

    # Visualize the topics
    vis_data = gensimvis.prepare(model, corpus, gensim_dictionary)
    display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'lda_vis.html')

    pprint(top_topics)
    # to print all topics
    # pprint(model.print_topics())

    return model, corpus, gensim_dictionary

lda_test, corpus_test, dict_test = suggest_topics_using_LDA(train, 'title_nltk')


Average topic coherence: -8.3586.
u_mass Coherence Score: -8.3586.
c_v Coherence Score: 0.3042.
c_npmi Coherence Score: -0.0949.
Perplexity: -7.4200.


[([(0.07220536, 'error'),
   (0.058530744, 'get'),
   (0.040402133, 'find'),
   (0.03056213, 'create'),
   (0.028382905, 'run'),
   (0.025845522, 'app'),
   (0.022576876, 'fail'),
   (0.02179621, 'code'),
   (0.02045016, 'load'),
   (0.02014837, 'studio'),
   (0.020137051, 'server'),
   (0.018212207, 'project'),
   (0.018207887, 'application'),
   (0.017559664, 'access'),
   (0.016100815, 'build'),
   (0.014055084, 'version'),
   (0.013547914, 'support'),
   (0.011427413, 'system'),
   (0.011082655, 'user'),
   (0.010155425, 'miss')],
  -4.520601458275859),
 ([(0.058215145, 'value'),
   (0.054584067, 'java'),
   (0.043896865, 'type'),
   (0.04307946, 'array'),
   (0.040760152, 'class'),
   (0.032852013, 'return'),
   (0.032461934, 'name'),
   (0.022594512, 'property'),
   (0.020810073, 'web'),
   (0.02048269, 'parameter'),
   (0.019913206, 'field'),
   (0.019595291, 'html'),
   (0.019425811, 'mvc'),
   (0.018358838, 'default'),
   (0.01631978, 'exist'),
   (0.015777424, 'model'),
   (0

In [107]:
print(len(corpus_test)) # = nb documents (questions)
print(len(dict_test)) # = nb tokens uniques

# Get the Document-Topic Matrix
document_topic_matrix = lda_test.get_document_topics(corpus_test)

# Print the Document-Topic Matrix
for i, document_topics in enumerate(document_topic_matrix[:12]):
    print(f"Document {i + 1} Topics: {document_topics}")
print('\n')

# Get the Topic-Word Matrix
topic_word_matrix = lda_test.get_topics()

# Print the Topic-Word Matrix
for topic_id, topic_terms in enumerate(topic_word_matrix):
    term_weights = [(term_id, weight) for term_id, weight in enumerate(topic_terms)]
    terms_with_names = [(dict_test.id2token[term_id], weight) for term_id, weight in term_weights]
    print(f"Topic {topic_id + 1} Terms: {terms_with_names}")


42898
5869
Document 1 Topics: [(0, 0.075997844), (1, 0.07792867), (2, 0.15136762), (3, 0.084512234), (4, 0.119959995), (5, 0.12976977), (6, 0.0847143), (7, 0.057120122), (8, 0.15500535), (9, 0.06362413)]
Document 2 Topics: [(0, 0.061556272), (1, 0.063120194), (2, 0.059650153), (3, 0.19516957), (4, 0.097164474), (5, 0.10507323), (6, 0.19497249), (7, 0.04626581), (8, 0.1254939), (9, 0.05153388)]
Document 3 Topics: [(0, 0.075963065), (1, 0.07789302), (2, 0.22994411), (3, 0.08447358), (4, 0.119905196), (5, 0.051523104), (6, 0.08467556), (7, 0.057093974), (8, 0.07674269), (9, 0.14178568)]
Document 4 Topics: [(0, 0.07597034), (1, 0.23415965), (2, 0.073617876), (3, 0.084481664), (4, 0.119916655), (5, 0.051528037), (6, 0.08468366), (7, 0.057099443), (8, 0.15494159), (9, 0.06360109)]
Document 5 Topics: [(0, 0.11742597), (1, 0.059336707), (2, 0.11563816), (3, 0.0643496), (4, 0.27001745), (5, 0.098797075), (6, 0.06450346), (7, 0.10302624), (8, 0.058460418), (9, 0.048444882)]
Document 6 Topics: [(

In [109]:
# Du point de vue metier, des topics tres proches peuvent etre consideres
# comme etant un seul topic. Il est + interessant d'avoir des topics bien differents les uns des autres.

# Thought about using clustering scores like silhouette, davies-bouldin, etc...
# but lda is different from clustering, it returns not 1 label, but a distribution of probabilities over topics.
# so it's not possible to use davies-bouldin or silhouette directly to check if the topics are well separated.
# Instead, we can use a metric that evaluates the overlap between topics, such as the Jaccard similarity

topn = 1000

def jaccard_similarity(topic1, topic2):
    set1 = set(topic1)
    set2 = set(topic2)
    return len(set1.intersection(set2)) / len(set1.union(set2))

# Example
topic1 = lda_test.show_topic(0, topn=topn)
topn_tokens_topic1 = [token for token, proba in topic1]

print(topic1)
print(len(topic1))
print(topn_tokens_topic1)

topic2 = lda_test.show_topic(1, topn=topn)
topn_tokens_topic2 = [token for token, proba in topic2]

print(topic2)
print(len(topic2))
print(topn_tokens_topic2)

print('Jaccard Similarity between Topics 0 and 1:', jaccard_similarity(topn_tokens_topic1, topn_tokens_topic2))

# Pour avoir une idée sur l'ensemble des topics
def average_jaccard_similarity(lda_model):
    num_topics = lda_model.num_topics
    total_similarity = 0.0
    pair_count = 0

    # Iterate over all pairs of topics
    for i in range(num_topics):
        for j in range(i + 1, num_topics):
            # Get the word distributions for the two topics
            topic1 = lda_model.show_topic(i, topn=topn)
            topn_tokens_topic1 = [token for token, proba in topic1]
            topic2 = lda_model.show_topic(j, topn=topn)
            topn_tokens_topic2 = [token for token, proba in topic2]

            # Calculate Jaccard similarity and update total_similarity
            similarity = jaccard_similarity(topn_tokens_topic1, topn_tokens_topic2)
            total_similarity += similarity
            pair_count += 1

    # Calculate the average Jaccard similarity
    average_similarity = total_similarity / pair_count
    return average_similarity

# Example
average_jaccard = average_jaccard_similarity(lda_test)
print('Average Jaccard Similarity between Topics:', average_jaccard)


[('add', 0.053117048), ('type', 0.048620824), ('way', 0.037812106), ('time', 0.028602812), ('view', 0.02851899), ('check', 0.026769746), ('property', 0.02502605), ('framework', 0.022677254), ('date', 0.022103328), ('read', 0.021943085), ('number', 0.020790033), ('exist', 0.018076101), ('store', 0.017680496), ('rail', 0.017406613), ('entity', 0.017317098), ('controller', 0.016017385), ('format', 0.015952276), ('reference', 0.014051829), ('container', 0.014007286), ('permission', 0.013618819), ('result', 0.013099933), ('task', 0.012965313), ('try', 0.011593878), ('configure', 0.011477574), ('include', 0.011111038), ('loop', 0.011061658), ('order', 0.009542006), ('session', 0.009508294), ('context', 0.008687983), ('deny', 0.008362487), ('postgresql', 0.0077250963), ('shell', 0.007547215), ('await', 0.0072368146), ('layout', 0.0070470665), ('async', 0.0070025544), ('initialize', 0.0069130403), ('datetime', 0.006302098), ('collection', 0.006107371), ('take', 0.006029645), ('constraint', 0.0

In [110]:
suggest_topics_using_LDA(train, 'title_spacy')


Average topic coherence: -6.750.
u_mass Coherence Score:  -10.16633824377779
c_v Coherence Score:  0.3706316567879477
c_npmi Coherence Score:  -0.15556503451384734
Perplexity:  -7.238181704353146


[([(0.09738355, 'error'),
   (0.06117362, 'find'),
   (0.041564025, 'run'),
   (0.038138967, 'fail'),
   (0.0375891, 'image'),
   (0.037310176, 'app'),
   (0.023743771, 'module'),
   (0.023466282, 'project'),
   (0.023415556, 'build'),
   (0.02155394, 'install')],
  -3.7955337931138344),
 ([(0.067816064, 'add'),
   (0.0677302, 'create'),
   (0.047050953, 'request'),
   (0.034949705, 'application'),
   (0.03417181, 'api'),
   (0.031159606, 'web'),
   (0.029467322, 'custom'),
   (0.026690599, 'ios'),
   (0.024985425, 'send'),
   (0.024789847, 'header')],
  -4.2101548036221015),
 ([(0.07102129, 'value'),
   (0.06570805, 'string'),
   (0.05285111, 'datum'),
   (0.05014124, 'array'),
   (0.043080803, 'convert'),
   (0.027112655, 'access'),
   (0.025102159, 'user'),
   (0.024977805, 'pass'),
   (0.024706002, 'form'),
   (0.024397347, 'update')],
  -5.318303782452968),
 ([(0.066774145, 'change'),
   (0.05008423, 'way'),
   (0.040715273, 'text'),
   (0.038419962, 'element'),
   (0.032871775, '

(<gensim.models.ldamodel.LdaModel at 0x7f181afbd390>,
 [[(0, 1), (1, 1), (2, 1), (3, 1)],
  [(4, 1), (5, 1), (6, 1), (7, 1)],
  [(8, 1), (9, 1), (10, 1)],
  [(11, 1)],
  [(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
  [(19, 1), (20, 1), (21, 1)],
  [(22, 1)],
  [(23, 1), (24, 1), (25, 1)],
  [(26, 1), (27, 1), (28, 1)],
  [(29, 1), (30, 1), (31, 1), (32, 1)],
  [(33, 1), (34, 1)],
  [(35, 1)],
  [(36, 1), (37, 1), (38, 1), (39, 1)],
  [(40, 1), (41, 1), (42, 1)],
  [(12, 1), (43, 1)],
  [(44, 1), (45, 1)],
  [(27, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)],
  [(51, 1), (52, 1), (53, 1), (54, 1)],
  [(55, 1), (56, 1), (57, 1), (58, 1), (59, 1)],
  [(9, 1), (13, 1), (26, 1), (28, 1), (49, 1), (60, 1), (61, 1), (62, 1)],
  [(12, 1), (63, 1), (64, 1)],
  [(65, 1), (66, 1), (67, 1), (68, 1), (69, 1)],
  [(70, 1), (71, 1)],
  [(33, 1), (60, 1), (64, 1), (72, 1)],
  [(73, 1)],
  [(74, 1)],
  [(75, 1), (76, 1), (77, 1), (78, 1), (79, 1)],
  [(80, 1), (81, 1)],
  [(27, 

In [111]:
lda_test, corpus_test, dict_test = suggest_topics_using_LDA(train, 'body_nltk')

# Score un peu meilleur
# (sauf perplexité, similaire)
# mais les topics sont bien mieux différentiés !

# analyse a developper


Average topic coherence: -1.693.
u_mass Coherence Score:  -1.9204137041631686
c_v Coherence Score:  0.6585732750447372
c_npmi Coherence Score:  0.059319199656616486
Perplexity:  -7.232457244775426


[([(0.023325562, 'get'),
   (0.023011025, 'return'),
   (0.020105002, 'string'),
   (0.020003214, 'value'),
   (0.018746082, 'try'),
   (0.01786467, 'code'),
   (0.016653446, 'class'),
   (0.016168797, 'data'),
   (0.014330446, 'name'),
   (0.01328787, 'function')],
  -1.2575322340034594),
 ([(0.036608372, 'file'),
   (0.026445713, 'error'),
   (0.022631582, 'run'),
   (0.019928072, 'version'),
   (0.017414253, 'try'),
   (0.017098943, 'project'),
   (0.016724398, 'get'),
   (0.01670465, 'command'),
   (0.016152998, 'build'),
   (0.014413662, 'package')],
  -1.381295950820217),
 ([(0.027367882, 'java'),
   (0.023377953, 'spring'),
   (0.018243967, 'org'),
   (0.016142398, 'xml'),
   (0.015233912, 'lang'),
   (0.014754045, 'class'),
   (0.012200265, 'annotation'),
   (0.011746639, 'boot'),
   (0.011067106, 'com'),
   (0.00997688, 'exception')],
  -1.4050787530187625),
 ([(0.01710359, 'work'),
   (0.015046704, 'way'),
   (0.01481001, 'need'),
   (0.014791683, 'find'),
   (0.013810537, 'm

In [112]:
average_jaccard = average_jaccard_similarity(lda_test)
print('Average Jaccard Similarity between Topics:', average_jaccard)

# OK


Average Jaccard Similarity between Topics: 0.07379607191091762


In [113]:
suggest_topics_using_LDA(train, 'body_spacy')


Average topic coherence: -1.954.
u_mass Coherence Score:  -2.3550459429632187
c_v Coherence Score:  0.7128007392311121
c_npmi Coherence Score:  0.07440045842700452
Perplexity:  -7.062042962005138


[([(0.03573449, 'use'),
   (0.025354477, 'work'),
   (0.025089238, 'way'),
   (0.024468334, 'need'),
   (0.021146214, 'find'),
   (0.02104318, 'want'),
   (0.020052418, 'know'),
   (0.017162958, 'look'),
   (0.01581032, 'problem'),
   (0.015306427, 'thank')],
  -1.521364369254603),
 ([(0.040403295, 'file'),
   (0.036238458, 'error'),
   (0.035855368, 'run'),
   (0.023430556, 'try'),
   (0.020638153, 'project'),
   (0.019044356, 'version'),
   (0.018535534, 'build'),
   (0.015798865, 'fail'),
   (0.014345089, 'follow'),
   (0.01429377, 'command')],
  -1.5306544989523536),
 ([(0.02451313, 'spring'),
   (0.02346848, 'annotation'),
   (0.020245597, 'configuration'),
   (0.015179677, 'boot'),
   (0.014448585, 'dependency'),
   (0.013920053, 'bean'),
   (0.013344332, 'class'),
   (0.013338306, 'property'),
   (0.012909644, 'exception'),
   (0.011625976, 'xml')],
  -1.6129435893050572),
 ([(0.042881142, 'try'),
   (0.035093907, 'code'),
   (0.03288134, 'return'),
   (0.032835472, 'function'),

(<gensim.models.ldamodel.LdaModel at 0x7f17ea97d610>,
 [[(0, 1),
   (1, 1),
   (2, 1),
   (3, 1),
   (4, 1),
   (5, 1),
   (6, 1),
   (7, 1),
   (8, 1),
   (9, 1),
   (10, 1),
   (11, 1),
   (12, 1),
   (13, 1),
   (14, 1),
   (15, 1),
   (16, 1),
   (17, 1),
   (18, 1),
   (19, 1),
   (20, 1),
   (21, 1),
   (22, 1),
   (23, 1),
   (24, 1),
   (25, 1),
   (26, 1),
   (27, 1),
   (28, 1),
   (29, 1),
   (30, 1),
   (31, 1),
   (32, 1),
   (33, 1),
   (34, 1),
   (35, 1),
   (36, 1),
   (37, 1)],
  [(38, 1),
   (39, 1),
   (40, 1),
   (41, 1),
   (42, 1),
   (43, 1),
   (44, 1),
   (45, 1),
   (46, 1),
   (47, 1),
   (48, 1),
   (49, 1),
   (50, 1)],
  [(50, 1),
   (51, 1),
   (52, 1),
   (53, 1),
   (54, 1),
   (55, 1),
   (56, 1),
   (57, 1),
   (58, 1)],
  [(13, 1),
   (16, 1),
   (29, 1),
   (31, 1),
   (50, 1),
   (55, 1),
   (57, 1),
   (58, 1),
   (59, 1),
   (60, 1),
   (61, 1),
   (62, 1),
   (63, 1),
   (64, 1),
   (65, 1),
   (66, 1),
   (67, 1),
   (68, 1),
   (69, 1),
   (7

### Multicore


In [116]:
# add random state
# add grid search cv

def suggest_topics_using_LDA(df, feature):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    num_topics = 10
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = gensim_dictionary[0]  # This is only to "load" the dictionary.
    id2word = gensim_dictionary.id2token

    model = LdaMulticore(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

    top_topics = model.top_topics(corpus, topn=num_topics)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    # = umass ?
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.10f.' % avg_topic_coherence)

    # Compute Coherence Score (cv)
    coherence_cv = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_v')
    coherence_lda_cv = coherence_cv.get_coherence()
    print('Coherence Score: ', coherence_lda_cv)

    # Compute Coherence Score (Umass)
    coherence_umass = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='u_mass')
    coherence_lda_umass = coherence_umass.get_coherence()
    print('u_mass Coherence Score: ', coherence_lda_umass)

    # Compute Coherence Score (npmi)
    coherence_npmi = CoherenceModel(model=model, texts=documents, dictionary=gensim_dictionary, coherence='c_npmi')
    coherence_lda_npmi = coherence_npmi.get_coherence()
    print('c_npmi Coherence Score: ', coherence_lda_npmi)

    # Perplexity is not a coherence score but a measure of how well the model predicts a sample.
    # A lower perplexity indicates better model performance.
    perplexity = model.log_perplexity(corpus)
    print('Perplexity: ', perplexity)

    # Visualize the topics
    vis_data = gensimvis.prepare(model, corpus, gensim_dictionary)
    display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'lda_vis.html')

    pprint(top_topics)
    # to print all topics
    # pprint(model.print_topics())


suggest_topics_using_LDA(train, 'body_nltk')
# On gagne + d'1 min


Average topic coherence: -1.3483269850.
Coherence Score:  0.5441994616136396
u_mass Coherence Score:  -1.6622678814430216
c_npmi Coherence Score:  0.026879235151347554
Perplexity:  -7.116280366257143


KeyboardInterrupt: 

### Hyperparameters tuning


#### number of topics, alpha, beta


In [115]:
def train_lda_model(num_topics, alpha, beta, corpus, id2word, chunksize, iterations, passes, eval_every):
    return LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha=alpha,
        eta=beta,
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )

def suggest_topics_using_LDA(df, feature):
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index to word dictionary.
    temp = gensim_dictionary[0]  # This is only to "load" the dictionary.
    id2word = gensim_dictionary.id2token

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'num_topics': [5, 10, 15],  # Example values, adjust as needed
        'alpha': ['auto', 'symmetric', 'asymmetric'],
        'eta': ['auto', 'symmetric', 'asymmetric']
    }

    # Placeholder LdaModel instantiation
    lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                     iterations=iterations, passes=passes, eval_every=eval_every)

    grid_search = GridSearchCV(
        estimator=lda_model,
        param_grid=param_grid,
        scoring='c_v',  # You can use other scoring metrics
        cv=3  # Adjust the number of cross-validation folds as needed
    )

    grid_search.fit(corpus)

    # Get the best parameters
    best_params = grid_search.best_params_

    print('Best Parameters:', best_params)

    # Train the LDA model with the best parameters
    best_lda_model = train_lda_model(**best_params, corpus=corpus, id2word=id2word,
                                     chunksize=chunksize, iterations=iterations,
                                     passes=passes, eval_every=eval_every)

    # Rest of your code remains the same
    top_topics = best_lda_model.top_topics(corpus, topn=20)
    # ... (the rest of your code)

    # Visualize the topics
    vis_data = gensimvis.prepare(best_lda_model, corpus, gensim_dictionary)
    # display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'lda_vis.html')

    # pprint(top_topics)
    # to print all topics
    # pprint(best_lda_model.print_topics())

# Example usage
suggest_topics_using_LDA(train, 'title_nltk')


InvalidParameterError: The 'estimator' parameter of GridSearchCV must be an object implementing 'fit'. Got <gensim.models.ldamodel.LdaModel object at 0x7f17e554c810> instead.

## tfidf + nmf


## Word2Vec + K-Means ?


## doc2vec


## LSA ? = tsidf + svd
