In [1]:
# Install the plugin in your virtual environment by running:
#   pip install git+https://github.com/DHARPA-Project/kiara_plugin.topic_modelling
# Alternatively, uncomment and run the line below (%pip installs into the environment of the running kernel).
# %pip install git+https://github.com/DHARPA-Project/kiara_plugin.topic_modelling

In [1]:
from kiara.api import KiaraAPI
# Obtain the KiaraAPI instance; every `kiara.run_job(...)` call in this notebook goes through it.
kiara = KiaraAPI.instance()

## 1. Data onboarding

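Choose one of the three options below (1.1, 1.2 or 1.3) to onboard data.
Option 1.3 uses example data present in the topic modelling plugin. Note that the later cells of this notebook read `import_table_from_local_folder_results['table']`, i.e. the table produced by option 1.3; if you onboard data with option 1.1 or 1.2, substitute the corresponding result in those cells.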

### 1.1. Get files from Zenodo

In [23]:
! kiara operation explain topic_modelling.create_table_from_zenodo


╭─ Operation: [1;3mtopic_modelling.create_table_from_zenodo[0m ────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module retrieves text files from a specified folder   │
│  [3m               [0m  hosted on Zenodo.                                          │
│  [3m               [0m                                                             │
│  [3m               [0m  It takes the DOI and the name of the file as inputs. It    │
│  [3m               [0m  outputs a table with two columns: one for the file names   │
│  [3m               [0m  and the other for the content of these files.              │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m     

In [24]:
# Inputs for the Zenodo onboarding operation: the record DOI and the
# name of the file to retrieve from that record.
create_table_from_zenodo_inputs = dict(
    doi="4596345",
    file_name="ChroniclItaly_3.0_original.zip",
)

In [25]:
# Run the Zenodo onboarding operation with the inputs defined in the previous cell.
create_table_from_zenodo_results = kiara.run_job(
    "topic_modelling.create_table_from_zenodo",
    inputs=create_table_from_zenodo_inputs,
    comment=" ",
)

In [26]:
# Keep a handle on the 'corpus_table' output of the Zenodo job.
# NOTE(review): this variable is not referenced by the later cells visible in this notebook.
corpus_table_zenodo = create_table_from_zenodo_results['corpus_table']

In [27]:
create_table_from_zenodo_results

### 1.2. Get files from GitHub

In [28]:
! kiara operation explain create.table_from_github_files


╭─ Operation: [1;3mcreate.table_from_github_files[0m ──────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Onboards text files from a Github repository.              │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mdownload_[0m[3m [0m  string   The        [1myes

In [29]:
# Inputs for the GitHub onboarding operation: repository owner, repository
# name, the sub-path inside the downloaded archive, and the files to include.
# NOTE(review): "txt" presumably restricts the download to text files -- confirm
# against the operation's documentation.
create_table_from_github_files_inputs = dict(
    download_github_files__user="DHARPA-Project",
    download_github_files__repo="kiara.examples",
    download_github_files__sub_path="kiara.examples-main/examples/workshops/dh_benelux_2023/data",
    download_github_files__include_files=["txt"],
)

In [30]:
# Run the GitHub onboarding operation with the inputs defined in the previous cell.
create_table_from_github_files_results = kiara.run_job(
    "create.table_from_github_files",
    inputs=create_table_from_github_files_inputs,
    comment=" ",
)

In [31]:
create_table_from_github_files_results

In [32]:
# Keep a handle on the 'download_github_files__file_bundle' output of the GitHub job.
# NOTE(review): this variable is not referenced by the later cells visible in this notebook.
dl_file_bundle_gh = create_table_from_github_files_results['download_github_files__file_bundle']

### 1.3. Get table from local files

In [2]:
! kiara operation explain import.table.from.local_folder_path


╭─ Operation: [1;3mimport.table.from.local_folder_path[0m ─────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Import a table from a local folder containing text         │
│  [3m               [0m  files.                                                     │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  

In [3]:
# Folder containing the text files to import as a table.
#
# To run this notebook on another machine without editing the code, set the
# KIARA_TM_DATA_DIR environment variable to your local path to the data folder.
# When the variable is not set, the original hard-coded path is used, so the
# behaviour is unchanged by default; replace it with your own path if needed.
import os

import_table_from_local_folder_inputs = {
    "path": os.environ.get(
        "KIARA_TM_DATA_DIR",
        "/Users/mariella.decrouychan/Documents/GitHub/kiara_plugin.topic_modelling/tests/resources/data/text_corpus/data",
    )
}

In [4]:
# Import the local folder of text files as a table.
import_table_from_local_folder_results = kiara.run_job(
    "import.table.from.local_folder_path",
    inputs=import_table_from_local_folder_inputs,
    comment=" ",
)

In [5]:
import_table_from_local_folder_results

## 2. Subset creation

### 2.1. Get metadata from file names

To visualize the distribution of the corpus, we start by extracting the metadata from the file names.

In [6]:
! kiara operation explain topic_modelling.lccn_metadata


╭─ Operation: [1;3mtopic_modelling.lccn_metadata[0m ───────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module will get metadata from strings that comply     │
│  [3m               [0m  with LCCN pattern: '/sn86069873/1900-01-05/' to get the    │
│  [3m               [0m  publication references and the dates and add those         │
│  [3m               [0m  informations as two new columns.                           │
│  [3m               [0m                                                             │
│  [3m               [0m  In addition, if a mapping scheme is provided between       │
│  [3m               [0m  publication references and publication names, it will      │
│  [3m               [0m  add a column with the publication names. Such a map is     │
│  [3m               [0m  provided in the form of a list of lists with publication   │
│  [3m   

In [7]:
# Inputs for the LCCN metadata extraction:
#   corpus_table - the table produced by the local-folder import
#   column_name  - the column whose strings are parsed for metadata
#   map          - list of lists relating publication references to publication
#                  names (presumably [references], [names] in matching order --
#                  confirm against the operation's documentation)
lccn_metadata_inputs = dict(
    corpus_table=import_table_from_local_folder_results["table"],
    column_name="file_name",
    map=[["sn84037024", "sn84037025"], ["La Ragione", "La Rassegna"]],
)

In [8]:
# Extract publication references and dates from the file names.
lccn_metadata_results = kiara.run_job(
    "topic_modelling.lccn_metadata",
    inputs=lccn_metadata_inputs,
    comment=" ",
)

In [9]:
lccn_metadata_results

### 2.2. Visualize corpus distribution

We start by getting the distribution data.

In [10]:
! kiara operation explain topic_modelling.corpus_distribution


╭─ Operation: [1;3mtopic_modelling.corpus_distribution[0m ─────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module aggregates a table by day, month or year       │
│  [3m               [0m  from a corpus table that contains a date column. It        │
│  [3m               [0m  returns the distribution over time, which can be used      │
│  [3m               [0m  for display purposes, such as visualization.               │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescrip…[0m

In [11]:
# Inputs for the corpus distribution: the table enriched with LCCN metadata,
# the aggregation period (the operation aggregates by day, month or year),
# and the names of the date and publication columns to use.
corpus_dist_inputs = dict(
    corpus_table=lccn_metadata_results["corpus_table"],
    periodicity="month",
    date_col="date",
    publication_ref_col="publication_name",
)

In [12]:
# Aggregate the corpus over time to obtain the distribution data.
corpus_dist_results = kiara.run_job(
    "topic_modelling.corpus_distribution",
    inputs=corpus_dist_inputs,
    comment=" ",
)

In [13]:
corpus_dist_results

## 3. Tokenize corpus

In [6]:
! kiara operation explain topic_modelling.tokenize_array


╭─ Operation: [1;3mtopic_modelling.tokenize_array[0m ──────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module creates tokens from an array or from a         │
│  [3m               [0m  table.                                                     │
│  [3m               [0m                                                             │
│  [3m               [0m  It returns a table containing the initial array or         │
│  [3m               [0m  table, and the tokens as a new column. It is possible to   │
│  [3m               [0m  tokenize by word or by character. If not specified,        │
│  [3m               [0m  tokenization is done by word.                              │
│  [3m               [0m                                                             │
│  [3m               [0m  Dependencies:                                              │
│  [3m   

We start by getting an array from the table that contains the content to tokenize.

In [7]:
! kiara operation explain table.pick.column


╭─ Operation: [1;3mtable.pick.column[0m ───────────────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Pick one column from a table, returning an array.          │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m           [0m [1m          [0m [1m          [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescript…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mtable    [0m[3m [0m  table    A table.    [1mye

In [8]:
# Select the "content" column of the imported table; the operation returns it as an array.
pick_column_inputs = dict(
    table=import_table_from_local_folder_results["table"],
    column_name="content",
)

In [9]:
# Pick the column, producing the array that will be tokenized.
pick_column_results = kiara.run_job(
    "table.pick.column",
    inputs=pick_column_inputs,
    comment=" ",
)

We can now use the array as input for the tokenize_array operation.

In [10]:
# Inputs for tokenization: the array picked from the table.
# NOTE(review): "column_name" is supplied even though the input is an array
# rather than a table -- confirm whether the operation needs it in this case.
tokenize_array_inputs = dict(
    corpus_array=pick_column_results["array"],
    column_name="content",
)

In [11]:
# Tokenize the array (by word unless specified otherwise, per the operation's documentation).
tokenize_array_results = kiara.run_job(
    "topic_modelling.tokenize_array",
    inputs=tokenize_array_inputs,
    comment=" ",
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariella.decrouychan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
tokenize_array_results

In [13]:
! kiara operation explain topic_modelling.preprocess_tokens


╭─ Operation: [1;3mtopic_modelling.preprocess_tokens[0m ───────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module offers pre-processing options for an array     │
│  [3m               [0m  of tokens.                                                 │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield   [0m[1m [0m [1m         [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname    [0m[1m [0m [1m [0m[1mtype   [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  

In [14]:
# Pre-processing options applied to the tokens array.
# NOTE(review): presumably lowercases tokens, keeps alphabetic tokens only and
# drops tokens shorter than 3 characters -- confirm against the operation's docs.
preprocess_tokens_inputs = dict(
    tokens_array=tokenize_array_results["tokens_array"],
    lowercase=True,
    isalpha=True,
    min_length=3,
)

In [15]:
# Apply the pre-processing options to the tokens.
preprocess_tokens_results = kiara.run_job(
    "topic_modelling.preprocess_tokens",
    inputs=preprocess_tokens_inputs,
    comment=" ",
)

In [16]:
preprocess_tokens_results

## 4. Remove stopwords

### 4.1. Create stopwords list

In [17]:
! kiara operation explain topic_modelling.stopwords_list


╭─ Operation: [1;3mtopic_modelling.stopwords_list[0m ──────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module creates a stop words list and enables to       │
│  [3m               [0m  combine predefined stop words lists from nltk and/or a     │
│  [3m               [0m  custom additional stop words list.                         │
│  [3m               [0m                                                             │
│  [3m               [0m  Dependencies:                                              │
│  [3m               [0m                                                             │
│  [3m               [0m  [1;33m • [0mNLTK: https://www.nltk.org/                             │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                      

In [18]:
# Stop words configuration: predefined nltk lists for the given languages,
# combined with a custom additional list (placeholder values here).
stopwords_list_inputs = dict(
    languages=["english", "italian"],
    stopwords_list=["test", "test"],
)

In [19]:
# Build the combined stop words list.
stopwords_list_results = kiara.run_job(
    "topic_modelling.stopwords_list",
    inputs=stopwords_list_inputs,
    comment=" ",
)

In [20]:
stopwords_list_results

### 4.2. Remove stopwords

In [21]:
! kiara operation explain topic_modelling.remove_stopwords


╭─ Operation: [1;3mtopic_modelling.remove_stopwords[0m ────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module removes stop words from an array of tokens.    │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m       [0m [1m           [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype [0m[1m [0m [1m [0m[1mdescript…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mstopwords[0m[3m [0m  list    A list of   [1myes

In [22]:
# Feed the pre-processed tokens and the stop words list built in section 4.1
# into the stop word removal operation.
remove_stopwords_inputs = dict(
    tokens_array=preprocess_tokens_results["tokens_array"],
    stopwords_list=stopwords_list_results["stopwords_list"],
)

In [23]:
# Remove the stop words from the tokens array.
remove_stopwords_results = kiara.run_job(
    "topic_modelling.remove_stopwords",
    inputs=remove_stopwords_inputs,
    comment=" ",
)

In [24]:
remove_stopwords_results

## 5. LDA

In [25]:
! kiara operation explain topic_modelling.lda 


╭─ Operation: [1;3mtopic_modelling.lda[0m ─────────────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  https://radimrehurek.com/gensim/models/ldamulticore.html   │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield   [0m[1m [0m [1m         [0m [1m          [0m [1m          [0m [1m           [0m    │
│  [3m               [0m   [1m [0m[1mname    [0m[1m [0m [1m [0m[1mtype   [0m[1m [0m [1m [0m[1mdescrip…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault  [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mtokens_a[0m[3m [0m  array     Array      [1myes

In [26]:
# LDA inputs: the stop-word-free tokens plus the model settings used here.
# NOTE(review): no random seed is passed; if reproducible topics are needed,
# check whether the operation exposes a seed input.
lda_inputs = dict(
    tokens_array=remove_stopwords_results["tokens_array"],
    num_topics=3,
    passes=20,
    chunksize=30,
)

In [27]:
# Run the LDA topic modelling operation.
lda_results = kiara.run_job(
    "topic_modelling.lda",
    inputs=lda_inputs,
    comment=" ",
)

In [28]:
lda_results