# **Catégorisez automatiquement des questions**

## partie 3/8 : Prédiction de tags, approche non-supervisée

### <br> Proposition de mots clés, de type LDA avec visualisation en 2D des topics

<br>


## Importation des librairies, réglages


In [28]:
import os, sys, random
import ast
from pprint import pprint
# from zipfile import ZipFile
import numpy as np
import pandas as pd
from pandarallel import pandarallel

# Visualisation
import matplotlib.pyplot as plt
# import seaborn as sns
import plotly.express as px

# Feature engineering
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
from gensim import matutils
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

# Parallelism settings (modify MAX_WORKERS if necessary).
MAX_WORKERS = 6
num_cores = os.cpu_count()  # may be None when the core count cannot be determined
print(f"\nNumber of CPU cores: {num_cores}")
# Never ask for more workers than there are cores; fall back to a single worker if unknown.
pandarallel.initialize(progress_bar=False, nb_workers=min(MAX_WORKERS, num_cores or 1))



Number of CPU cores: 8
INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Fonctions


In [29]:
def get_missing_values(df):
    """Summarise missing data, one row per column of `df`.

    Args:
        df (pandas.DataFrame): The DataFrame to analyse.

    Returns:
        pandas.DataFrame: Indexed by column name, sorted by ascending number of
        missing values, with the columns:
            - 'column_name': name of the column
            - 'missing': number of missing (NaN) values
            - 'present': number of non-missing values
            - 'percent_missing': percentage (0-100) of missing values, rounded to 2 decimals
            - 'type': dtype of the column
    """
    # Boolean mask of missing cells, computed once and reused for counts and rates
    na_mask = df.isna()
    n_missing = na_mask.sum()
    pct_missing = (na_mask.mean() * 100).round(2)

    summary = pd.DataFrame({
        'column_name': df.columns,
        'missing': n_missing,
        'present': len(df) - n_missing,
        'percent_missing': pct_missing,
        'type': df.dtypes,
    })

    # Columns with the fewest missing values come first
    return summary.sort_values('missing')

# with pd.option_context('display.max_rows', 1000):
#   display(get_missing_values(df))


def quick_look(df, miss=True):
    """
    Print and display a quick overview of a DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to inspect.
        miss (bool, optional): When True (the default), also display the per-column
            missing-value summary produced by `get_missing_values`.

    Shown, in order: the shape, the first rows, the last rows, the number of unique
    values per column, the number of duplicated rows and, if `miss` is True,
    the missing-value summary. Nothing is returned.
    """
    print(f'shape : {df.shape}')

    # First and last rows
    for preview in (df.head(), df.tail()):
        display(preview)

    print('uniques :')
    display(df.nunique())

    # Rows identical to an earlier row (the first occurrence itself is not counted)
    n_duplicates = df.duplicated(keep='first').sum()
    print('Doublons ? ', n_duplicates, '\n')

    if not miss:
        return
    display(get_missing_values(df))



### import


In [30]:
# Load the cleaned train / test splits (CSV exports)
TRAIN_CSV = './../data/cleaned_data/train_bow_uniques.csv'
TEST_CSV = './../data/cleaned_data/test_bow_uniques.csv'

train = pd.read_csv(TRAIN_CSV)  # ',' is already read_csv's default separator
test = pd.read_csv(TEST_CSV)

# Overview of the training split
quick_look(train)


shape : (42898, 8)


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
0,2019-06-05 15:13:02,How to use memset while handling strings in C++?,I am from Python background and recently learn...,"['c++', 'initialization', 'c-strings', 'string...","['memset', 'handle', 'string']","['memset', 'handle', 'string', 'python', 'back...","['use', 'memset', 'handle', 'string']","['background', 'learn', 'function', 'memset', ..."
1,2018-10-31 12:35:02,How to correct spelling in google docs using k...,I would like to be able to replace a misspelle...,"['gmail', 'keyboard-shortcuts', 'google-docs',...","['correct', 'spell', 'google', 'doc', 'keyboar...","['correct', 'spell', 'google', 'doc', 'shortcu...","['correct', 'spelling', 'keyboard', 'shortcut']","['like', 'replace', 'word', 'recommend', 'corr..."
2,2020-09-19 10:40:23,live server vscode on another computer,I have 2 computers. when I open the project wi...,"['visual-studio-code', 'server', 'localhost', ...","['server', 'vscode', 'computer']","['server', 'vscode', 'computer', 'open', 'proj...","['server', 'vscode', 'computer']","['computer', 'open', 'project', 'server', 'url..."
3,2012-10-23 16:47:04,django ajax post 403 forbidden,using django 1.4 im getting a 403 error when i...,"['javascript', 'ajax', 'django', 'http-post', ...","['django', 'ajax', 'forbidden']","['django', 'ajax', 'get', 'error', 'try', 'pos...",['forbid'],"['django', 'error', 'try', 'post', 'javascript..."
4,2019-04-21 16:10:24,Listen to changes and reload container on code...,I am using docker-compose in visual studio 201...,"['angular', 'visual-studio', 'docker', 'docker...","['listen', 'change', 'reload', 'container', 'c...","['listen', 'change', 'reload', 'container', 'c...","['listen', 'change', 'reload', 'container', 'c...","['docker', 'compose', 'studio', 'window', 'run..."


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
42893,2017-02-23 11:34:31,Do we need clear MDC after HTTP request in Spring,According to this answer thread local variable...,"['java', 'spring', 'logging', 'log4j', 'logback']","['need', 'mdc', 'request', 'spring']","['need', 'mdc', 'request', 'spring', 'accord',...","['need', 'request']","['accord', 'answer', 'thread', 'variable', 'us..."
42894,2011-10-13 20:57:32,How to make i18n with Handlebars.js (mustache ...,I'm currently using Handlebars.js (associated ...,"['javascript', 'jquery', 'internationalization...","['make', 'i18n', 'handlebar', 'template']","['make', 'i18n', 'handlebar', 'template', 'ass...",['template'],"['associate', 'web', 'app', 'client', 'render'..."
42895,2012-09-06 00:16:46,How can I make R read my environmental variables?,I am running R on EC2 spot instances and I nee...,"['linux', 'r', 'ubuntu', 'amazon-ec2', 'enviro...","['make', 'read', 'variable']","['make', 'read', 'variable', 'run', 'spot', 'i...","['read', 'variable']","['run', 'spot', 'instance', 'need', 'terminate..."
42896,2021-03-23 03:50:50,How to prevent react-query from fetching initi...,I'm using react-query v3.13 to fetch data from...,"['javascript', 'reactjs', 'fetch', 'react-quer...","['prevent', 'query', 'fetch', 'enable']","['prevent', 'query', 'fetch', 'enable', 'data'...","['prevent', 'react', 'query', 'fetch', 'enable']","['react', 'query', 'fetch', 'datum', 'want', '..."
42897,2016-03-17 04:19:15,Inserting into table with an Identity column w...,I have a table A_tbl in my database. I have cr...,"['sql', 'sql-server', 'database', 'ssms', 'dat...","['insert', 'table', 'identity', 'column', 'rep...","['insert', 'table', 'identity', 'column', 'rep...","['insert', 'table', 'column', 'replication', '...","['table', 'database', 'create', 'trigger', 'ca..."


uniques :


CreationDate    42893
title           42897
body            42898
all_tags        41513
title_nltk      42171
body_nltk       42898
title_spacy     37346
body_spacy      42891
dtype: int64

Doublons ?  0 



Unnamed: 0,column_name,missing,present,percent_missing,type
CreationDate,CreationDate,0,42898,0.0,object
title,title,0,42898,0.0,object
body,body,0,42898,0.0,object
all_tags,all_tags,0,42898,0.0,object
title_nltk,title_nltk,0,42898,0.0,object
body_nltk,body_nltk,0,42898,0.0,object
title_spacy,title_spacy,0,42898,0.0,object
body_spacy,body_spacy,0,42898,0.0,object


In [31]:
# Everything looks fine except the types: exporting the data to .csv turned our token lists into strings

def turn_str_back_into_list(df):
    """Undo the type change caused by the .csv export: parse stringified token lists.

    The columns 'title_nltk', 'body_nltk', 'title_spacy' and 'body_spacy' are
    converted in place from the textual form of a Python list (e.g. "['a', 'b']")
    back to real lists, using `ast.literal_eval` (which only evaluates literals).

    The conversion is idempotent: cells that are not strings (for instance lists
    already parsed by a previous run of the cell) are left untouched, so the
    function can safely be called more than once on the same DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame holding the four token columns; modified in place.

    Returns:
        None
    """
    def _parse(cell):
        # Only strings need parsing; anything else is returned unchanged
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    for col in ('title_nltk', 'body_nltk', 'title_spacy', 'body_spacy'):
        df[col] = df[col].apply(_parse)


# Restore the token lists on both splits (each DataFrame is modified in place)
for split_df in (train, test):
    turn_str_back_into_list(split_df)


In [32]:
# Check: distribution of the number of tokens per document, for each tokenised column
token_columns = ['title_nltk', 'body_nltk', 'title_spacy', 'body_spacy']
token_counts = train[token_columns].map(len)
token_counts.describe()

# OK: no empty list (min = 1)


Unnamed: 0,title_nltk,body_nltk,title_spacy,body_spacy
count,42898.0,42898.0,42898.0,42898.0
mean,4.409903,39.608653,3.311413,29.030444
std,1.732934,27.99795,1.619366,20.274162
min,1.0,2.0,1.0,1.0
25%,3.0,21.0,2.0,16.0
50%,4.0,32.0,3.0,24.0
75%,5.0,50.0,4.0,37.0
max,14.0,368.0,12.0,307.0


In [26]:
# Same overview for the test split: shape, head/tail, uniques, duplicates, missing values
quick_look(test)


shape : (4767, 8)


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
0,2016-10-01 23:46:51,"Didn't find class ""com.google.firebase.provide...","Before, my program run well. But When I just u...","['java', 'android', 'android-studio', 'error-h...","['find', 'class', 'com', 'google', 'firebase',...","['find', 'class', 'com', 'google', 'firebase',...","['find', 'class']","['program', 'run', 'update', 'studio', 'versio..."
1,2014-06-25 18:14:35,Why do I get `java.lang.NoClassDefFoundError: ...,Here is a simple test I'm using to invoke a Sc...,"['java', 'scala', 'maven', 'noclassdeffounderr...","['get', 'lang', 'noclassdeffounderror', 'scala...","['get', 'lang', 'noclassdeffounderror', 'scala...","['run', 'code']","['test', 'invoke', 'method', 'main(string', 'a..."
2,2013-06-26 03:29:49,django bulk create ignore duplicates,I'm trying to bulk insert into a MySQL db for ...,"['python', 'mysql', 'django', 'bulkinsert', 'b...","['django', 'bulk', 'create', 'ignore', 'duplic...","['django', 'bulk', 'create', 'ignore', 'duplic...","['bulk', 'create', 'ignore', 'duplicate']","['try', 'bulk', 'insert', 'mysql', 'dataset', ..."
3,2018-08-23 05:20:56,What is the difference between PyCharm Virtual...,"When I create a new project in PyCharm, it cre...","['python', 'pycharm', 'anaconda', 'environment...","['difference', 'environment', 'anaconda']","['difference', 'environment', 'anaconda', 'cre...",['difference'],"['create', 'project', 'read', 'execute', 'scri..."
4,2013-04-28 12:20:24,HTML form action and onsubmit issues,I want to run JavaScript user validation on so...,"['javascript', 'html', 'forms', 'action', 'ons...","['form', 'action', 'onsubmit', 'issue']","['form', 'action', 'onsubmit', 'issue', 'want'...","['html', 'form', 'action', 'onsubmit', 'issue']","['want', 'run', 'user', 'validation', 'textbox..."


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
4762,2018-07-06 04:09:12,Execute task every second using Work Manager API,Work Manager is a new API and I try to execute...,"['android', 'kotlin', 'background-process', 'b...","['execute', 'task', 'work', 'manager', 'api']","['execute', 'task', 'work', 'manager', 'api', ...","['execute', 'task']","['api', 'try', 'execute', 'task', 'second', 'w..."
4763,2013-08-12 14:24:04,Exploiting JavaScript's eval() method,Many developers believe that JavaScript's eval...,"['javascript', 'security', 'eval', 'client-sid...","['javascript', 'eval', 'method']","['javascript', 'developer', 'believe', 'method...","['exploit', 'eval', 'method']","['developer', 'believe', 'eval', 'method', 'av..."
4764,2013-03-20 11:00:25,Stored procedure using SP_SEND_DBMAIL sending ...,I have a stored procedure that is run every ni...,"['sql', 'sql-server', 'email', 'duplicates', '...","['procedure', 'send', 'duplicate', 'email', 'r...","['procedure', 'send', 'duplicate', 'email', 'r...","['store', 'procedure', 'send', 'email', 'recip...","['store', 'procedure', 'run', 'night', 'suppos..."
4765,2011-11-22 22:12:48,Node.js doesn't have a good ORM for managing M...,"I need to use Node.js, but it doesn't have a g...","['mysql', 'database', 'orm', 'node.js', 'module']","['orm', 'manage', 'schema', 'migration', 'sqla...","['orm', 'manage', 'schema', 'migration', 'sqla...","['manage', 'migration', 'use']","['need', 'use', 'plan', 'define', 'schema', 'n..."
4766,2018-03-05 14:11:34,Django redirecting http -> https,I am running:\npython manage.py runserver loca...,"['python', 'django', 'http', 'redirect', 'https']","['django', 'redirect', 'http']","['django', 'redirect', 'http', 'run', 'python'...","['redirect', 'http', 'https']","['run', 'python', 'runserver', 'redirect', 'ht..."


uniques :


CreationDate    4767
title           4767
body            4767
all_tags        4737
title_nltk      4749
body_nltk       4767
title_spacy     4467
body_spacy      4767
dtype: int64

Doublons ?  0 



Unnamed: 0,column_name,missing,present,percent_missing,type
CreationDate,CreationDate,0,4767,0.0,object
title,title,0,4767,0.0,object
body,body,0,4767,0.0,object
all_tags,all_tags,0,4767,0.0,object
title_nltk,title_nltk,0,4767,0.0,object
body_nltk,body_nltk,0,4767,0.0,object
title_spacy,title_spacy,0,4767,0.0,object
body_spacy,body_spacy,0,4767,0.0,object


In [33]:
# Same check on the test split: number of tokens per document in each tokenised column
test_token_counts = test[['title_nltk', 'body_nltk', 'title_spacy', 'body_spacy']].map(len)
test_token_counts.describe()
# OK


Unnamed: 0,title_nltk,body_nltk,title_spacy,body_spacy
count,4767.0,4767.0,4767.0,4767.0
mean,4.395427,39.790854,3.284665,29.20516
std,1.716085,27.418228,1.620959,19.973006
min,1.0,3.0,1.0,1.0
25%,3.0,21.0,2.0,16.0
50%,4.0,32.0,3.0,24.0
75%,5.0,51.0,4.0,37.0
max,13.0,268.0,12.0,194.0


In [6]:
# Utile si nos inputs st sous forme de string
# mais il semble qu'on va plutôt conserver la liste de tokens au final

def fix_false_null_values(df):
    """
    Put the string 'null' back wherever 'title_nltk' or 'title_spacy' is NaN (in place).

    ! USE ONLY AFTER VERIFYING that every NaN in these columns really stands for
    the text "null" (presumably a token that was read back from the .csv as a
    missing value - confirm before use).

    Args:
        df (pandas.DataFrame): DataFrame with 'title_nltk' and 'title_spacy' columns.

    Returns:
        None
    """
    for col in ('title_nltk', 'title_spacy'):
        is_missing = df[col].isna()
        df.loc[is_missing, col] = 'null'


# fix_false_null_values(train)
# fix_false_null_values(test)

# Check for null values in the entire DataFrame
# null_values = train[train.isnull().any(axis=1)]

# Print the rows with null values
# print(null_values)


Empty DataFrame
Columns: [CreationDate, title, body, all_tags, title_nltk, body_nltk, title_spacy, body_spacy, __attribute__, __bridge, __call__, __declspec, __dict__, __dirname, __file__, __getitem__, __init__, __m128, __new__, __str__, __unicode__, __webpack_require__, _auth, _blank, _files, _get, _id, _layout, _libs, _main, _mysql, _next, _objc_class_, _post, _session, _ssl, _start, _tp, _x, a2dp, a4, a9, aa, aac, aapt, aapt2, aar, ab, abc, abi, ability, abort, absent, absolute, abspath, abstract, abstraction, abstractprotocol, acceleration, accelerometer, accent, accept, access, access_fine_location, access_token, accessdenied, accessdeniedexception, accessibility, accessor, accesstoken, accomplish, accordion, account, accountcontroller, accuracy, ace, achieve, ack, acl, acquire, across, act, action, actionbar, actionbaractivity, actionbarsherlock, actioncontroller, actionlink, actionmailer, actionresult, actionsheet, actionview, activate, activatedroute, activation, activator, act

In [7]:
# quick_look(train)
# quick_look(test)


shape : (43016, 6753)


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy,__attribute__,__bridge,...,zone,zoneddatetime,zoneid,zookeeper,zoom,zooming,zsh,zshrc,zuul,zxing
0,2013-08-23 23:28:22,How to implement a ViewPager with different Fr...,When I start an activity which implements view...,"['android', 'android-layout', 'android-fragmen...",implement viewpager fragment layout,implement viewpager fragment layouts start act...,implement,start activity implement viewpager create frag...,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2015-04-26 06:13:36,Cannot subscript a value of [AnyObject]? with ...,This is in a class extending PFQueryTableViewC...,"['ios', 'xcode', 'swift', 'parse-platform', 'x...",subscript value anyobject index type int,subscript value anyobject index type int class...,subscript value index type,class extend follow error row cast way subscri...,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2014-08-06 12:33:53,Equivalent to java packages in C#,"I have been looking for a way to make a ""packa...","['java', 'c#', 'eclipse', 'visual-studio-2013'...",equivalent java package c,equivalent java package c look way make folder...,package c,look way package folder studio express know pr...,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2014-06-05 18:35:37,How to use UIVisualEffectView to Blur Image?,Could someone give a small example of applying...,"['ios', 'objective-c', 'uiview', 'uikit', 'uiv...",use blur image,use blur image someone give example apply try ...,use uivisualeffectview,example apply blur image try figure code uivis...,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2013-06-28 11:53:56,How can I sort arrays and data in PHP?,\nThis question is intended as a reference for...,"['php', 'arrays', 'sorting', 'object', 'spl']",sort array data php,sort array data php question intend reference ...,sort array datum,question intend reference sort array think cas...,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy,__attribute__,__bridge,...,zone,zoneddatetime,zoneid,zookeeper,zoom,zooming,zsh,zshrc,zuul,zxing
43011,2017-02-24 13:38:36,How to fully dump / print variable to console ...,Hey there I am searching for a function which ...,"['javascript', 'dart', 'debugging', 'console',...",dump print console dart language,dump print console dart language hey search fu...,dump print variable console language,search function print variable console languag...,0,0,...,0,0,0,0,0,0,0,0,0,0
43012,2011-10-20 07:21:34,Is there a way to make a method which is not a...,Is there any way of forcing child classes to o...,"['java', 'inheritance', 'overriding', 'abstrac...",way make method,way make method override force child class nee...,way method,way force child class override method need cre...,0,0,...,0,0,0,0,0,0,0,0,0,0
43013,2012-09-11 11:34:25,Can I incorporate both SignalR and a RESTful API?,I have a single page web app developed using A...,"['asp.net', 'rest', 'web-applications', 'asp.n...",incorporate signalr api,incorporate signalr api page web app develop u...,incorporate signalr api,page web app develop convert method push base ...,0,0,...,0,0,0,0,0,0,0,0,0,0
43014,2021-03-23 19:24:04,How can i use php8 attributes instead of annot...,This is what I would like to use:\n#[ORM\Colum...,"['php', 'symfony', 'doctrine-orm', 'doctrine',...",use attribute annotation doctrine,use attribute annotation doctrine like column ...,use attribute annotation doctrine,like use string error annotate support miss,0,0,...,0,0,0,0,0,0,0,0,0,0
43015,2016-03-19 18:27:38,Localizing string resources added via build.gr...,This is in continuation to an answer which hel...,"['android', 'android-studio', 'android-gradle-...",localize string resource add build gradle use,localize string resource add build gradle use ...,localize string resource add build.gradle,continuation answer help post add string resou...,0,0,...,0,0,0,0,0,0,0,0,0,0


uniques :


CreationDate    43012
title           43015
body            43016
all_tags        41627
title_nltk      42538
                ...  
zooming             2
zsh                 2
zshrc               2
zuul                2
zxing               2
Length: 6753, dtype: int64

Doublons ?  0 



Unnamed: 0,column_name,missing,present,percent_missing,type
CreationDate,CreationDate,0,43016,0.0,object
priority,priority,0,43016,0.0,int64
printwriter,printwriter,0,43016,0.0,int64
printstacktrace,printstacktrace,0,43016,0.0,int64
println,println,0,43016,0.0,int64
...,...,...,...,...,...
functools,functools,0,43016,0.0,int64
functionality,functionality,0,43016,0.0,int64
function,function,0,43016,0.0,int64
gauge,gauge,0,43016,0.0,int64


In [34]:
# Spot-check a handful of rows, selected by index label
index = [4532, 8280, 12992, 14957, 22934, 24964, 25950]

is_selected = train.index.isin(index)
display(train[is_selected])

# OK


Unnamed: 0,CreationDate,title,body,all_tags,title_nltk,body_nltk,title_spacy,body_spacy
4532,2013-10-23 22:23:31,How to change type of id in Microsoft.AspNet.I...,"(ASP.NET MVC 5, EF6, VS2013)\nI'm trying to fi...","['asp.net-mvc', 'entity-framework', 'asp.net-m...","[change, type, aspnet, identity, entityframewo...","[change, type, aspnet, identity, entityframewo...","[change, type, identity]","[try, figure, change, type, field, string, int..."
8280,2014-06-20 18:46:09,"Bootstrap form input: prevent submit, but allo...",I've got the following problem: \nI use bootst...,"['javascript', 'jquery', 'html', 'forms', 'twi...","[bootstrap, form, input, submit, allow, checking]","[bootstrap, form, input, submit, allow, check,...","[bootstrap, form, input, prevent, submit, allo...","[problem, use, bootstrap, form, input, user, p..."
12992,2017-08-21 19:46:31,PySpark: org.apache.spark.sql.AnalysisExceptio...,"I'm trying to load Parquet data into PySpark, ...","['python', 'apache-spark', 'pyspark', 'apache-...","[pyspark, apache, spark, attribute, name, cont...","[pyspark, apache, spark, attribute, name, cont...","[org.apache.spark.sql, analysisexception, cont...","[try, load, datum, column, space, aliase, erro..."
14957,2018-08-08 12:58:34,How to break ForEach Loop in TypeScript,"I have a the below code, on which i am unable ...","['javascript', 'angular', 'typescript', 'forea...","[break, loop, typescript]","[break, loop, typescript, code, condition, fun...",[break],"[code, break, loop, condition, function, let, ..."
22934,2014-11-26 18:26:05,Python: Creating a 2D histogram from a numpy m...,"I'm new to python.\nI have a numpy matrix, of ...","['python', 'numpy', 'matrix', 'matplotlib', 'h...","[python, create, histogram, matrix]","[python, create, histogram, matrix, dimension,...","[python, create, histogram, matrix]","[python, matrix, dimension, value, range, want..."
24964,2011-11-28 02:41:21,SSRS line chart not connecting data points,I've looked high and low and can't seem to fin...,"['join', 'reporting-services', 'graph', 'chart...","[line, chart, connect, data, point]","[line, chart, connect, data, point, look, seem...","[line, chart, connect, datum, point]","[look, find, answer, appear, issue, think, lin..."
25950,2014-08-21 15:58:49,GS1 barcode parsing,We need to parse the GS1 datamatrix barcode wh...,"['parsing', 'barcode', 'datamatrix', 'gs1-data...","[barcode, parse]","[barcode, parse, need, provide, party, know, l...","[barcode, parse]","[need, parse, barcode, provide, party, know, u..."


## LDA


### fin du preprocessing


In [38]:
# First try the model on the question titles only
# (far less data than the bodies)
docs = train['title_nltk'].tolist()

# Create a dictionary representation of the documents.
gensim_dictionary = Dictionary(docs)

# Filter out extreme cases (optional)
# ...in 1 line of code !
# gensim_dictionary.filter_extremes(no_below=5, no_above=0.5)


In [39]:
# Turn every document (list of tokens) into its Gensim bag-of-words form
corpus = list(map(gensim_dictionary.doc2bow, docs))


In [42]:
# Show the dictionary summary, then the first 100 bag-of-words documents
print("Gensim Dictionary:", gensim_dictionary, sep="\n")
print("\nCorpus:", corpus[:100], sep="\n")


Gensim Dictionary:
Dictionary<5869 unique tokens: ['handle', 'memset', 'string', 'correct', 'doc']...>

Corpus:
[[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1), (14, 1)], [(15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)], [(22, 1), (23, 1), (24, 1)], [(25, 1), (26, 1), (27, 1), (28, 1)], [(29, 1), (30, 1), (31, 1), (32, 1)], [(33, 1), (34, 1), (35, 1), (36, 1), (37, 1)], [(38, 1), (39, 1), (40, 1), (41, 1), (42, 1)], [(43, 1), (44, 1), (45, 1)], [(46, 1), (47, 1), (48, 1)], [(49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)], [(55, 1), (56, 1), (57, 1)], [(15, 1), (58, 1), (59, 1), (60, 1)], [(61, 1)], [(36, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1)], [(68, 1), (69, 1), (70, 1), (71, 1), (72, 1)], [(73, 1), (74, 1), (75, 1), (76, 1), (77, 1)], [(10, 1), (13, 1), (16, 1), (35, 1), (37, 1), (66, 1), (78, 1)], [(15, 1), (79, 1), (80, 1), (81, 1)], [(82, 1), (83, 1), (84, 1), (85, 1)]

In [43]:
# Set training parameters.
SEED = 42  # fixed seed so that re-running the cell yields the same topics
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = gensim_dictionary[0]  # This is only to "load" the dictionary.
id2word = gensim_dictionary.id2token

# alpha / eta set to 'auto' are learned from the corpus during training
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=SEED,
)


CPU times: user 38 s, sys: 0 ns, total: 38 s
Wall time: 38 s


In [45]:
# Each entry pairs a topic's top 20 (probability, word) tuples with that topic's coherence score
top_topics = model.top_topics(corpus, topn=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The divisor is the number of topics actually returned, so a stale `num_topics` variable cannot skew it.
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)


Average topic coherence: -7.7799.
[([(0.07404588, 'file'),
   (0.07239393, 'error'),
   (0.040507667, 'find'),
   (0.034591738, 'java'),
   (0.030641936, 'create'),
   (0.028457064, 'run'),
   (0.025241299, 'window'),
   (0.02263584, 'fail'),
   (0.021853121, 'code'),
   (0.020503577, 'load'),
   (0.020200983, 'studio'),
   (0.01825978, 'project'),
   (0.01614287, 'build'),
   (0.015767269, 'command'),
   (0.014091775, 'version'),
   (0.013092868, 'docker'),
   (0.0112879565, 'script'),
   (0.010357371, 'path'),
   (0.010225537, 'exception'),
   (0.010181927, 'miss')],
  -4.578558126839868),
 ([(0.08578691, 'get'),
   (0.05414772, 'data'),
   (0.052974146, 'work'),
   (0.040667232, 'type'),
   (0.03223976, 'method'),
   (0.031900488, 'request'),
   (0.031637225, 'make'),
   (0.030435052, 'return'),
   (0.026752584, 'api'),
   (0.020772258, 'http'),
   (0.019734818, 'query'),
   (0.019279169, 'web'),
   (0.018830128, 'key'),
   (0.01817439, 'service'),
   (0.017704422, 'header'),
   (0.

In [9]:
# Keep only the numeric columns (assuming only/all numeric columns represent BoW)
# NOTE(review): the missing-value summary printed for the currently loaded `train` lists only
# object columns, so this selection may come back empty - confirm this cell still applies.
bow_train = train.select_dtypes(include=np.number)
numeric_columns = bow_train.columns

print(bow_train.shape)


(43016, 6745)


In [None]:
# Create a dictionary representation of the documents
# (`docs` is the list of token lists built from train['title_nltk']).
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
# filter_extremes modifies `dictionary` in place.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [None]:
# Convert DataFrame to Gensim Corpus.
# bow_train holds one row per document and one column per word. Sparse2Corpus only accepts
# scipy sparse matrices, so the dense numpy array is streamed through Dense2Corpus instead.
corpus_train = matutils.Dense2Corpus(bow_train.values, documents_columns=False)

# Convert DataFrame to Gensim Dictionary.
# Dictionary(...) expects documents made of string tokens, not (word, count) pairs, so the
# dictionary is built from the corpus: word id i is the i-th BoW column.
dictionary = corpora.Dictionary.from_corpus(corpus_train, id2word=dict(enumerate(numeric_columns)))


In [None]:
# Numeric columns of the training split (assuming only/all numeric columns represent BoW)
bow_train = train.select_dtypes(include='number')
numeric_columns = bow_train.columns

# Same columns, taken from the test split
bow_test = test[numeric_columns]

for bow in (bow_train, bow_test):
    print(bow.shape)

# Convert DataFrame to Gensim Corpus and Dictionary
# corpus_train = corpora.MmCorpus(bow_train.values)


: 

In [None]:
# MmCorpus reads a Matrix Market file from disk (its argument is a file name), so it cannot
# wrap an in-memory array. Dense2Corpus streams the dense matrix directly; rows are documents.
corpus_test = matutils.Dense2Corpus(bow_test.values, documents_columns=False)


In [None]:
# NOTE(review): `vectorizer`, `preprocess_document`, `X_test` and `corpus_train` are not defined
# in this cell; they must already exist in the kernel - confirm where they come from.

# Convert to Gensim Corpus and Dictionary
# corpus_train = corpora.MmCorpus(matutils.Sparse2Corpus(X_train_bow, documents_columns=False))
# Map each column id of the vectorizer's matrix back to its word
# (vocabulary_ is read here as a word -> id mapping and inverted).
id2word_train = {word_id: word for word, word_id in vectorizer.vocabulary_.items()}
dictionary_train = corpora.Dictionary.from_corpus(corpus_train, id2word=id2word_train)

# LDA Model training on the training set
lda_model = LdaModel(corpus=corpus_train, id2word=dictionary_train, num_topics=5)  # You can replace 'num_topics' with your desired value

# Preprocess test set
X_test_processed = [preprocess_document(doc) for doc in X_test]

# Transform the test set to Bag-of-Words using the same vectorizer
X_test_bow = vectorizer.transform(X_test_processed)

# Convert to Gensim Corpus.
# Sparse2Corpus is already a usable corpus; MmCorpus expects the name of a Matrix Market
# file on disk and must not be wrapped around it.
# documents_columns=False: rows of X_test_bow are the documents.
corpus_test = matutils.Sparse2Corpus(X_test_bow, documents_columns=False)

# Evaluate on the test set
test_log_likelihood = lda_model.log_perplexity(corpus_test)
print(f"Log Likelihood on Test Set: {test_log_likelihood}")

# Get Topics
topics = lda_model.print_topics()

# Print the topics
for topic in topics:
    print(topic)