# **Catégorisez automatiquement des questions**

## partie 3/8 : Prédiction de tags, approche non-supervisée

### <br> Proposition de mots clés, de type LDA avec visualisation en 2D des topics

<br>


## Importation des librairies, réglages


In [19]:
import os, sys, random
import ast
# from zipfile import ZipFile
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from pprint import pprint

# Visualisation
import matplotlib.pyplot as plt
# import seaborn as sns
import plotly.express as px

# Feature engineering
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models.ldamulticore import LdaMulticore


from sklearn.model_selection import GridSearchCV

# Report the interpreter and pyLDAvis versions used for this run.
print('\nPython version ' + sys.version)
print('pyLDAvis version ' + pyLDAvis.__version__)

# Modify if necessary: nb_workers below is hard-coded to 6 and is NOT derived
# from num_cores, which is only printed for information.
num_cores = os.cpu_count()
print(f"\nNumber of CPU cores: {num_cores}")
pandarallel.initialize(progress_bar=False, nb_workers=6)



Python version 3.11.4 (main, Jul  5 2023, 14:15:25) [GCC 11.2.0]
pyLDAvis version 3.4.0

Number of CPU cores: 8
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Fonctions


In [3]:
def get_missing_values(df):
    """Summarise missing data column by column.

    Args:
        df (pandas.DataFrame): The frame to inspect.

    Returns:
        pandas.DataFrame: One row per column of ``df`` (indexed by column
        name) with the fields ``column_name``, ``missing`` (NaN count),
        ``present`` (non-NaN count), ``percent_missing`` (rounded to two
        decimals) and ``type`` (dtype), sorted by ascending ``missing``.
    """
    # Build the NaN mask once and derive both statistics from it.
    na_mask = df.isna()
    na_counts = na_mask.sum()
    na_percent = (na_mask.mean() * 100).round(2)

    summary = pd.DataFrame({
        'column_name': df.columns,
        'missing': na_counts,
        'present': len(df) - na_counts,
        'percent_missing': na_percent,
        'type': df.dtypes,
    })

    # Columns with the fewest missing values come first.
    return summary.sort_values('missing')

# with pd.option_context('display.max_rows', 1000):
#   display(get_missing_values(df))


def quick_look(df, miss=True):
    """Display a quick overview of a DataFrame.

    Prints the shape, shows the first and last rows, the number of unique
    values per column and the number of fully duplicated rows. Output goes
    through the notebook's ``display`` helper, so this is meant to be called
    from a Jupyter/IPython session.

    Args:
        df (pandas.DataFrame): The input DataFrame to inspect.
        miss (bool, optional): Whether to also display the missing-value
            summary produced by ``get_missing_values`` (default is True).

    Returns:
        None
    """
    print(f'shape : {df.shape}')

    display(df.head())
    display(df.tail())

    print('uniques :')
    display(df.nunique())

    # Rows identical to an earlier row are counted; the first occurrence is not.
    print('Doublons ? ', df.duplicated(keep='first').sum(), '\n')

    if miss:
        display(get_missing_values(df))



### import


In [4]:
# Load the train and test CSV files from the cleaned_data folder.

train = pd.read_csv('./../data/cleaned_data/train_bow_uniques.csv', sep=',')
test = pd.read_csv('./../data/cleaned_data/test_bow_uniques.csv', sep=',')

# Overview of the training set (shape, head/tail, uniques, duplicates, missing values).
quick_look(train)


shape : (42898, 8)


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
0,2019-06-05 15:13:02,How to use memset while handling strings in C++?,I am from Python background and recently learn...,"['c++', 'initialization', 'c-strings', 'string...","['memset', 'handle', 'string']","['memset', 'handle', 'string', 'python', 'back...","['use', 'memset', 'handle', 'string']","['background', 'learn', 'function', 'memset', ..."
1,2018-10-31 12:35:02,How to correct spelling in google docs using k...,I would like to be able to replace a misspelle...,"['gmail', 'keyboard-shortcuts', 'google-docs',...","['correct', 'spell', 'google', 'doc', 'keyboar...","['correct', 'spell', 'google', 'doc', 'shortcu...","['correct', 'spelling', 'keyboard', 'shortcut']","['like', 'replace', 'word', 'recommend', 'corr..."
2,2020-09-19 10:40:23,live server vscode on another computer,I have 2 computers. when I open the project wi...,"['visual-studio-code', 'server', 'localhost', ...","['server', 'vscode', 'computer']","['server', 'vscode', 'computer', 'open', 'proj...","['server', 'vscode', 'computer']","['computer', 'open', 'project', 'server', 'url..."
3,2012-10-23 16:47:04,django ajax post 403 forbidden,using django 1.4 im getting a 403 error when i...,"['javascript', 'ajax', 'django', 'http-post', ...","['django', 'ajax', 'forbidden']","['django', 'ajax', 'get', 'error', 'try', 'pos...",['forbid'],"['django', 'error', 'try', 'post', 'javascript..."
4,2019-04-21 16:10:24,Listen to changes and reload container on code...,I am using docker-compose in visual studio 201...,"['angular', 'visual-studio', 'docker', 'docker...","['listen', 'change', 'reload', 'container', 'c...","['listen', 'change', 'reload', 'container', 'c...","['listen', 'change', 'reload', 'container', 'c...","['docker', 'compose', 'studio', 'window', 'run..."


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
42893,2017-02-23 11:34:31,Do we need clear MDC after HTTP request in Spring,According to this answer thread local variable...,"['java', 'spring', 'logging', 'log4j', 'logback']","['need', 'mdc', 'request', 'spring']","['need', 'mdc', 'request', 'spring', 'accord',...","['need', 'request']","['accord', 'answer', 'thread', 'variable', 'us..."
42894,2011-10-13 20:57:32,How to make i18n with Handlebars.js (mustache ...,I'm currently using Handlebars.js (associated ...,"['javascript', 'jquery', 'internationalization...","['make', 'i18n', 'handlebar', 'template']","['make', 'i18n', 'handlebar', 'template', 'ass...",['template'],"['associate', 'web', 'app', 'client', 'render'..."
42895,2012-09-06 00:16:46,How can I make R read my environmental variables?,I am running R on EC2 spot instances and I nee...,"['linux', 'r', 'ubuntu', 'amazon-ec2', 'enviro...","['make', 'read', 'variable']","['make', 'read', 'variable', 'run', 'spot', 'i...","['read', 'variable']","['run', 'spot', 'instance', 'need', 'terminate..."
42896,2021-03-23 03:50:50,How to prevent react-query from fetching initi...,I'm using react-query v3.13 to fetch data from...,"['javascript', 'reactjs', 'fetch', 'react-quer...","['prevent', 'query', 'fetch', 'enable']","['prevent', 'query', 'fetch', 'enable', 'data'...","['prevent', 'react', 'query', 'fetch', 'enable']","['react', 'query', 'fetch', 'datum', 'want', '..."
42897,2016-03-17 04:19:15,Inserting into table with an Identity column w...,I have a table A_tbl in my database. I have cr...,"['sql', 'sql-server', 'database', 'ssms', 'dat...","['insert', 'table', 'identity', 'column', 'rep...","['insert', 'table', 'identity', 'column', 'rep...","['insert', 'table', 'column', 'replication', '...","['table', 'database', 'create', 'trigger', 'ca..."


uniques :


CreationDate    42893
title           42897
body            42898
all_tags        41513
title_nltk      42171
body_nltk       42898
title_spacy     37346
body_spacy      42891
dtype: int64

Doublons ?  0 



Unnamed: 0,column_name,missing,present,percent_missing,type
CreationDate,CreationDate,0,42898,0.0,object
title,title,0,42898,0.0,object
body,body,0,42898,0.0,object
all_tags,all_tags,0,42898,0.0,object
title_nltk,title_nltk,0,42898,0.0,object
body_nltk,body_nltk,0,42898,0.0,object
title_spacy,title_spacy,0,42898,0.0,object
body_spacy,body_spacy,0,42898,0.0,object


In [5]:
# Everything looks fine except the types: exporting the data to .csv turned
# our token lists into their string representation.

def turn_str_back_into_list(df):
    """Restore the token-list columns that the .csv round-trip turned into strings.

    The columns ``title_nltk``, ``body_nltk``, ``title_spacy`` and
    ``body_spacy`` are parsed with ``ast.literal_eval`` and written back into
    ``df`` in place.

    Only string cells are parsed; any other value is left untouched. This
    makes the function safe to call again on an already-converted frame
    (e.g. when the notebook cell is re-run), where ``ast.literal_eval`` would
    otherwise raise ``ValueError`` on a list.

    Args:
        df (pandas.DataFrame): Frame containing the four token columns.

    Returns:
        None: ``df`` is modified in place.
    """
    def _parse_cell(cell):
        # literal_eval only accepts source text (or AST nodes), not real lists.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    for column in ('title_nltk', 'body_nltk', 'title_spacy', 'body_spacy'):
        df[column] = df[column].apply(_parse_cell)


# Convert the token columns of both data sets in place.
turn_str_back_into_list(train)
turn_str_back_into_list(test)


In [6]:
# Check: summary statistics of the number of tokens per cell in each token column.
train[['title_nltk', 'body_nltk', 'title_spacy', 'body_spacy']].map(len).describe()

# OK, no empty list (min = 1)


Unnamed: 0,title_nltk,body_nltk,title_spacy,body_spacy
count,42898.0,42898.0,42898.0,42898.0
mean,4.409903,39.608653,3.311413,29.030444
std,1.732934,27.99795,1.619366,20.274162
min,1.0,2.0,1.0,1.0
25%,3.0,21.0,2.0,16.0
50%,4.0,32.0,3.0,24.0
75%,5.0,50.0,4.0,37.0
max,14.0,368.0,12.0,307.0


In [7]:
# quick_look(test)


In [8]:
# Same token-count check on the test set.
test[['title_nltk', 'body_nltk', 'title_spacy', 'body_spacy']].map(len).describe()
# OK


Unnamed: 0,title_nltk,body_nltk,title_spacy,body_spacy
count,4767.0,4767.0,4767.0,4767.0
mean,4.395427,39.790854,3.284665,29.20516
std,1.716085,27.418228,1.620959,19.973006
min,1.0,3.0,1.0,1.0
25%,3.0,21.0,2.0,16.0
50%,4.0,32.0,3.0,24.0
75%,5.0,51.0,4.0,37.0
max,13.0,268.0,12.0,194.0


In [9]:
# Useful if our inputs are strings,
# but it looks like we will keep the token lists in the end.

def fix_false_null_values(df):
    """
    Overwrite NaN cells of the title token columns with the string 'null'.

    Operates in place on ``title_nltk`` and ``title_spacy`` and returns nothing.
    ! USE ONLY AFTER VERIFYING that every NaN really stands for the "null" string
    """
    for column in ('title_nltk', 'title_spacy'):
        is_missing = df[column].isna()
        df.loc[is_missing, column] = 'null'


# fix_false_null_values(train)
# fix_false_null_values(test)

# Check for null values in the entire DataFrame
# null_values = train[train.isnull().any(axis=1)]

# Print the rows with null values
# print(null_values)


In [10]:
# quick_look(train)
# quick_look(test)


In [11]:
# Spot-check a handful of rows, selected by index label.
index = [4532, 8280, 12992, 14957, 22934, 24964, 25950]

display(train.loc[train.index.isin(index), :])

# OK


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
4532,2013-10-23 22:23:31,How to change type of id in Microsoft.AspNet.I...,"(ASP.NET MVC 5, EF6, VS2013)\nI'm trying to fi...","['asp.net-mvc', 'entity-framework', 'asp.net-m...","[change, type, aspnet, identity, entityframewo...","[change, type, aspnet, identity, entityframewo...","[change, type, identity]","[try, figure, change, type, field, string, int..."
8280,2014-06-20 18:46:09,"Bootstrap form input: prevent submit, but allo...",I've got the following problem: \nI use bootst...,"['javascript', 'jquery', 'html', 'forms', 'twi...","[bootstrap, form, input, submit, allow, checking]","[bootstrap, form, input, submit, allow, check,...","[bootstrap, form, input, prevent, submit, allo...","[problem, use, bootstrap, form, input, user, p..."
12992,2017-08-21 19:46:31,PySpark: org.apache.spark.sql.AnalysisExceptio...,"I'm trying to load Parquet data into PySpark, ...","['python', 'apache-spark', 'pyspark', 'apache-...","[pyspark, apache, spark, attribute, name, cont...","[pyspark, apache, spark, attribute, name, cont...","[org.apache.spark.sql, analysisexception, cont...","[try, load, datum, column, space, aliase, erro..."
14957,2018-08-08 12:58:34,How to break ForEach Loop in TypeScript,"I have a the below code, on which i am unable ...","['javascript', 'angular', 'typescript', 'forea...","[break, loop, typescript]","[break, loop, typescript, code, condition, fun...",[break],"[code, break, loop, condition, function, let, ..."
22934,2014-11-26 18:26:05,Python: Creating a 2D histogram from a numpy m...,"I'm new to python.\nI have a numpy matrix, of ...","['python', 'numpy', 'matrix', 'matplotlib', 'h...","[python, create, histogram, matrix]","[python, create, histogram, matrix, dimension,...","[python, create, histogram, matrix]","[python, matrix, dimension, value, range, want..."
24964,2011-11-28 02:41:21,SSRS line chart not connecting data points,I've looked high and low and can't seem to fin...,"['join', 'reporting-services', 'graph', 'chart...","[line, chart, connect, data, point]","[line, chart, connect, data, point, look, seem...","[line, chart, connect, datum, point]","[look, find, answer, appear, issue, think, lin..."
25950,2014-08-21 15:58:49,GS1 barcode parsing,We need to parse the GS1 datamatrix barcode wh...,"['parsing', 'barcode', 'datamatrix', 'gs1-data...","[barcode, parse]","[barcode, parse, need, provide, party, know, l...","[barcode, parse]","[need, parse, barcode, provide, party, know, u..."


## LDA


In [13]:
# TODO: add grid search cv
# TODO: add other score ? umap ? ...

def suggest_topics_using_LDA(df, feature, num_topics=10, random_state=None):
    """Fit a gensim LDA model on one token column and report its quality.

    Builds a dictionary and a bag-of-words corpus from ``df[feature]``, trains
    an ``LdaModel``, prints several coherence scores and the per-word
    likelihood bound, shows the pyLDAvis panel and pretty-prints the top words
    of every topic. Meant to be called from a Jupyter/IPython session
    (relies on ``display``).

    Args:
        df (pandas.DataFrame): Data holding the tokenised documents.
        feature (str): Name of the column whose cells are lists of tokens.
        num_topics (int, optional): Number of topics to fit (default 10).
        random_state (int | None, optional): Seed forwarded to ``LdaModel``
            for reproducible runs. The default ``None`` leaves the model
            unseeded, as before.

    Returns:
        None
    """
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary. id2token is filled lazily, so one
    # lookup is needed to "load" it before it can be handed to the model.
    _ = gensim_dictionary[0]
    id2word = gensim_dictionary.id2token

    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
        random_state=random_state,
    )

    top_topics = model.top_topics(corpus, topn=20)

    # Average topic coherence is the sum of the coherences of all topics
    # divided by the number of topics. top_topics() scores with u_mass by
    # default, so this matches the u_mass score printed below.
    avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
    print(f'Average topic coherence: {avg_topic_coherence:.4f}.')

    # Coherence scores: c_v, then u_mass, then c_npmi.
    coherence_reports = (
        ('c_v', 'Coherence Score: '),
        ('u_mass', 'u_mass Coherence Score: '),
        ('c_npmi', 'c_npmi Coherence Score: '),
    )
    for measure, label in coherence_reports:
        scorer = CoherenceModel(
            model=model,
            texts=documents,
            dictionary=gensim_dictionary,
            coherence=measure,
        )
        print(label, scorer.get_coherence())

    # Not a coherence score: log_perplexity() returns the per-word likelihood
    # bound (a negative number), not the perplexity itself. Perplexity is
    # 2 ** (-bound), so a HIGHER (less negative) bound means a better fit.
    perplexity = model.log_perplexity(corpus)
    print('Perplexity: ', perplexity)

    # Visualize the topics
    vis_data = gensimvis.prepare(model, corpus, gensim_dictionary)
    display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'lda_vis.html')

    pprint(top_topics)
    # to print all topics
    # pprint(model.print_topics())


# LDA on the 'title_nltk' token column of the training set.
suggest_topics_using_LDA(train, 'title_nltk')


Average topic coherence: -9.0413.
Coherence Score:  0.3156975486869463
u_mass Coherence Score:  -9.041277400135566
c_npmi Coherence Score:  -0.12231254910631777
Perplexity:  -7.425965028504642


[([(0.06164836, 'file'),
   (0.060272973, 'error'),
   (0.048858188, 'get'),
   (0.040725674, 'python'),
   (0.03372545, 'find'),
   (0.025303006, 'add'),
   (0.01884588, 'fail'),
   (0.01819421, 'code'),
   (0.018018255, 'make'),
   (0.018012268, 'way'),
   (0.017127857, 'name'),
   (0.01707067, 'load'),
   (0.016818706, 'studio'),
   (0.01608306, 'column'),
   (0.015327221, 'module'),
   (0.015202516, 'project'),
   (0.013440033, 'build'),
   (0.012972178, 'panda'),
   (0.011732383, 'version'),
   (0.011239447, 'query')],
  -5.705465652836063),
 ([(0.053527463, 'data'),
   (0.052379236, 'spring'),
   (0.052367333, 'work'),
   (0.031535074, 'request'),
   (0.026446184, 'api'),
   (0.026381176, 'application'),
   (0.02544197, 'access'),
   (0.022134105, 'check'),
   (0.02053432, 'http'),
   (0.019226894, 'boot'),
   (0.019058391, 'web'),
   (0.016057568, 'user'),
   (0.015959533, 'send'),
   (0.015672248, 'post'),
   (0.014618788, 'store'),
   (0.013739766, 'laravel'),
   (0.013565418,

In [14]:
# LDA on the 'title_spacy' token column of the training set.
suggest_topics_using_LDA(train, 'title_spacy')


Average topic coherence: -9.7766.
Coherence Score:  0.3477691256833958
u_mass Coherence Score:  -9.776588721067267
c_npmi Coherence Score:  -0.1431025797275312
Perplexity:  -7.238395265620693


[([(0.10835973, 'file'),
   (0.09688435, 'error'),
   (0.06086005, 'find'),
   (0.041350953, 'run'),
   (0.03794346, 'fail'),
   (0.02807944, 'load'),
   (0.023924937, 'code'),
   (0.02362204, 'module'),
   (0.023345983, 'project'),
   (0.018413885, 'react'),
   (0.01729035, 'exist'),
   (0.013807625, 'library'),
   (0.0132520525, 'open'),
   (0.013233036, 'message'),
   (0.0130719645, 'path'),
   (0.01305269, 'system'),
   (0.012768261, 'exception'),
   (0.012706764, 'link'),
   (0.011910851, 'content'),
   (0.011687631, 'generate')],
  -5.913179643642695),
 ([(0.083013006, 'add'),
   (0.06678199, 'app'),
   (0.041911792, 'build'),
   (0.038679093, 'line'),
   (0.038579706, 'install'),
   (0.036985874, 'command'),
   (0.030889582, 'package'),
   (0.027188307, 'instal'),
   (0.019048074, 'dependency'),
   (0.018930402, 'connect'),
   (0.017880494, 'start'),
   (0.017822098, 'handle'),
   (0.016720975, 'include'),
   (0.016132517, 'debug'),
   (0.014940408, 'option'),
   (0.014921869, '

In [15]:
# LDA on the 'body_nltk' token column of the training set.
suggest_topics_using_LDA(train, 'body_nltk')

# Slightly better scores
# (except perplexity, which is similar)
# but the topics are much better differentiated!

# TODO: develop this analysis further


Average topic coherence: -1.8575.
Coherence Score:  0.6704824475777355
u_mass Coherence Score:  -1.8574802310874177
c_npmi Coherence Score:  0.060063639448599826
Perplexity:  -7.236792701322948


[([(0.028580386, 'return'),
   (0.027526842, 'get'),
   (0.024561737, 'string'),
   (0.023785021, 'value'),
   (0.022422098, 'class'),
   (0.02027174, 'try'),
   (0.018268216, 'method'),
   (0.01818943, 'data'),
   (0.017974768, 'name'),
   (0.01739205, 'code'),
   (0.016207874, 'type'),
   (0.015446656, 'create'),
   (0.01480724, 'object'),
   (0.012654083, 'add'),
   (0.012251138, 'function'),
   (0.011793631, 'list'),
   (0.010523391, 'set'),
   (0.010403511, 'call'),
   (0.010036116, 'int'),
   (0.009996053, 'want')],
  -1.4982551234231725),
 ([(0.034917835, 'file'),
   (0.026049733, 'error'),
   (0.025548255, 'run'),
   (0.023229988, 'try'),
   (0.020742258, 'get'),
   (0.018449737, 'version'),
   (0.018261964, 'project'),
   (0.016092231, 'build'),
   (0.015200402, 'find'),
   (0.01413247, 'work'),
   (0.012126904, 'app'),
   (0.011132713, 'issue'),
   (0.010455218, 'fail'),
   (0.009996293, 'add'),
   (0.009521758, 'problem'),
   (0.009459579, 'application'),
   (0.009339218, 'w

In [16]:
# LDA on the 'body_spacy' token column of the training set.
suggest_topics_using_LDA(train, 'body_spacy')


Average topic coherence: -2.9474.
Coherence Score:  0.670408527880029
u_mass Coherence Score:  -2.947368562178917
c_npmi Coherence Score:  0.03925316361246643
Perplexity:  -7.073896586051424


[([(0.032774147, 'want'),
   (0.029003568, 'try'),
   (0.027987499, 'code'),
   (0.027289934, 'use'),
   (0.02697759, 'way'),
   (0.02643112, 'work'),
   (0.020954074, 'need'),
   (0.017425137, 'example'),
   (0.01689944, 'create'),
   (0.016653217, 'know'),
   (0.016572604, 'look'),
   (0.015997691, 'value'),
   (0.014462381, 'datum'),
   (0.01402019, 'set'),
   (0.013615186, 'thank'),
   (0.013399625, 'like'),
   (0.013350118, 'function'),
   (0.012765998, 'find'),
   (0.012573221, 'add'),
   (0.012512829, 'result')],
  -1.6818401550611617),
 ([(0.041345816, 'file'),
   (0.038735077, 'error'),
   (0.03418608, 'run'),
   (0.025356077, 'try'),
   (0.01938011, 'project'),
   (0.018320397, 'version'),
   (0.017578678, 'build'),
   (0.01604562, 'find'),
   (0.01543717, 'follow'),
   (0.0141661735, 'fail'),
   (0.01377141, 'command'),
   (0.011738914, 'line'),
   (0.010996138, 'package'),
   (0.01087333, 'work'),
   (0.010870417, 'instal'),
   (0.010634141, 'include'),
   (0.009727899, 'mo

### Multicore


In [21]:
# TODO: add grid search cv

def suggest_topics_using_LDA(df, feature, num_topics=10, random_state=None):
    """Fit a multicore gensim LDA model on one token column and report its quality.

    Same workflow as the single-core version, but training uses
    ``LdaMulticore``; ``alpha`` and ``eta`` are left at the library defaults
    here. Prints coherence scores and the per-word likelihood bound, shows the
    pyLDAvis panel and pretty-prints the top words of every topic. Meant to be
    called from a Jupyter/IPython session (relies on ``display``).

    Args:
        df (pandas.DataFrame): Data holding the tokenised documents.
        feature (str): Name of the column whose cells are lists of tokens.
        num_topics (int, optional): Number of topics to fit (default 10).
        random_state (int | None, optional): Seed forwarded to
            ``LdaMulticore``. The default ``None`` leaves the model unseeded,
            as before.

    Returns:
        None
    """
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary. id2token is filled lazily, so one
    # lookup is needed to "load" it before it can be handed to the model.
    _ = gensim_dictionary[0]
    id2word = gensim_dictionary.id2token

    model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
        random_state=random_state,
    )

    top_topics = model.top_topics(corpus, topn=20)

    # Average topic coherence is the sum of the coherences of all topics
    # divided by the number of topics. top_topics() scores with u_mass by
    # default, so this matches the u_mass score printed below.
    avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
    print(f'Average topic coherence: {avg_topic_coherence:.4f}.')

    # Coherence scores: c_v, then u_mass, then c_npmi.
    coherence_reports = (
        ('c_v', 'Coherence Score: '),
        ('u_mass', 'u_mass Coherence Score: '),
        ('c_npmi', 'c_npmi Coherence Score: '),
    )
    for measure, label in coherence_reports:
        scorer = CoherenceModel(
            model=model,
            texts=documents,
            dictionary=gensim_dictionary,
            coherence=measure,
        )
        print(label, scorer.get_coherence())

    # Not a coherence score: log_perplexity() returns the per-word likelihood
    # bound (a negative number), not the perplexity itself. Perplexity is
    # 2 ** (-bound), so a HIGHER (less negative) bound means a better fit.
    perplexity = model.log_perplexity(corpus)
    print('Perplexity: ', perplexity)

    # Visualize the topics
    vis_data = gensimvis.prepare(model, corpus, gensim_dictionary)
    display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'lda_vis.html')

    pprint(top_topics)
    # to print all topics
    # pprint(model.print_topics())


# Multicore LDA on the 'body_nltk' token column of the training set.
suggest_topics_using_LDA(train, 'body_nltk')
# We save more than 1 minute


Average topic coherence: -1.7252.
Coherence Score:  0.5572270427065924
u_mass Coherence Score:  -1.725248065492907
c_npmi Coherence Score:  0.027866784285247092
Perplexity:  -7.114902869659541


[([(0.013761543, 'get'),
   (0.011229076, 'request'),
   (0.00923494, 'try'),
   (0.009193462, 'server'),
   (0.0088907955, 'http'),
   (0.0087220175, 'data'),
   (0.008517268, 'user'),
   (0.00823555, 'error'),
   (0.007928533, 'work'),
   (0.00789033, 'database'),
   (0.0075831953, 'return'),
   (0.007399121, 'application'),
   (0.007390872, 'set'),
   (0.0072521204, 'create'),
   (0.0070944596, 'code'),
   (0.0068964823, 'string'),
   (0.0067934482, 'add'),
   (0.006642835, 'response'),
   (0.0065974733, 'name'),
   (0.006444389, 'json')],
  -1.5135221843496538),
 ([(0.01043858, 'code'),
   (0.010092093, 'way'),
   (0.008909372, 'time'),
   (0.008476667, 'question'),
   (0.008305336, 'work'),
   (0.008140982, 'make'),
   (0.0074289376, 'need'),
   (0.0070785005, 'get'),
   (0.0070003, 'call'),
   (0.0068747094, 'want'),
   (0.0068150484, 'know'),
   (0.0066110617, 'run'),
   (0.0062169298, 'seem'),
   (0.006144893, 'write'),
   (0.0059292163, 'see'),
   (0.0058878125, 'example'),
  

### Hyperparameters tuning


#### number of topics, alpha, beta


In [None]:
def train_lda_model(num_topics, alpha, beta, corpus, id2word, chunksize, iterations, passes, eval_every):
    """Train and return a gensim ``LdaModel`` from explicit hyperparameters.

    Every argument is forwarded unchanged to ``LdaModel``; the only renaming
    is ``beta``, which is passed as gensim's ``eta`` (topic-word prior).

    Returns:
        LdaModel: The model built from the given corpus and settings.
    """
    lda_settings = {
        'corpus': corpus,
        'id2word': id2word,
        'chunksize': chunksize,
        'alpha': alpha,
        'eta': beta,
        'iterations': iterations,
        'num_topics': num_topics,
        'passes': passes,
        'eval_every': eval_every,
    }
    return LdaModel(**lda_settings)

def suggest_topics_using_LDA(df, feature):
    """Grid-search LDA hyperparameters on one token column and keep the best model.

    Every combination of ``num_topics``, ``alpha`` and ``eta`` in the grid is
    trained with ``train_lda_model`` and scored with the c_v coherence; the
    highest-scoring combination is printed and its model is used for the
    top-topics / pyLDAvis preparation.

    The search is an explicit loop rather than scikit-learn's
    ``GridSearchCV``: a gensim ``LdaModel`` does not implement the
    scikit-learn estimator interface, and ``'c_v'`` is not a scikit-learn
    scoring name.

    Args:
        df (pandas.DataFrame): Data holding the tokenised documents.
        feature (str): Name of the column whose cells are lists of tokens.

    Returns:
        None
    """
    documents = df[feature].tolist()
    gensim_dictionary = Dictionary(documents)
    corpus = [gensim_dictionary.doc2bow(doc) for doc in documents]

    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary. id2token is filled lazily, so one
    # lookup is needed to "load" it before it can be handed to the model.
    _ = gensim_dictionary[0]
    id2word = gensim_dictionary.id2token

    # Hyperparameter grid (example values, adjust as needed).
    # 'asymmetric' is deliberately absent from 'eta': gensim rejects it for
    # the topic-word prior.
    param_grid = {
        'num_topics': [5, 10, 15],
        'alpha': ['auto', 'symmetric', 'asymmetric'],
        'eta': ['auto', 'symmetric'],
    }
    candidates = [
        {'num_topics': num_topics, 'alpha': alpha, 'eta': eta}
        for num_topics in param_grid['num_topics']
        for alpha in param_grid['alpha']
        for eta in param_grid['eta']
    ]

    best_score = float('-inf')
    best_params = None
    best_lda_model = None
    for params in candidates:
        # train_lda_model names the topic-word prior 'beta' (gensim's 'eta').
        candidate_model = train_lda_model(
            num_topics=params['num_topics'],
            alpha=params['alpha'],
            beta=params['eta'],
            corpus=corpus,
            id2word=id2word,
            chunksize=chunksize,
            iterations=iterations,
            passes=passes,
            eval_every=eval_every,
        )
        score = CoherenceModel(
            model=candidate_model,
            texts=documents,
            dictionary=gensim_dictionary,
            coherence='c_v',
        ).get_coherence()

        # The first candidate is always kept so that a model exists even if
        # every score compares false (e.g. NaN).
        if best_lda_model is None or score > best_score:
            best_score = score
            best_params = params
            best_lda_model = candidate_model

    print('Best Parameters:', best_params)
    print('Best c_v coherence:', best_score)

    # The best model was already trained during the search; reuse it.
    top_topics = best_lda_model.top_topics(corpus, topn=20)

    # Visualize the topics
    vis_data = gensimvis.prepare(best_lda_model, corpus, gensim_dictionary)
    # display(pyLDAvis.display(vis_data))

    # Uncomment the next line if you want to save the plot to a file
    # pyLDAvis.save_html(vis_data, 'lda_vis.html')

    # pprint(top_topics)
    # to print all topics
    # pprint(best_lda_model.print_topics())

# Example usage: hyperparameter search on the 'title_nltk' token column.
suggest_topics_using_LDA(train, 'title_nltk')


## tfidf + nmf


## Word2Vec + K-Means ?


## doc2vec


## LSA ? = tfidf + svd
