In [None]:
# Install the topic modelling plugin before running this notebook, either by running
#   pip install git+https://github.com/DHARPA-Project/kiara_plugin.topic_modelling
# in the virtual environment used by this notebook's kernel, or by uncommenting and
# running the line below (`%pip` installs into the environment of the running kernel).
# %pip install git+https://github.com/DHARPA-Project/kiara_plugin.topic_modelling

: 

In [1]:
# Obtain the kiara API object; the jobs in this notebook are run through it
# with `kiara.run_job(...)`.
from kiara.api import KiaraAPI
kiara = KiaraAPI.instance()

## 1. Data onboarding

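Choose one of the two options below to onboard data: option 1.1 (Zenodo), or option 1.2 followed by step 1.3 (GitHub).
Option 1.2 downloads example data from the DHARPA-Project `kiara.examples` repository, and step 1.3 turns the downloaded file bundle into a table, so 1.3 must be run after 1.2. Section 2 of this notebook reads the table created in step 1.3.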

### 1.1. Get files from Zenodo

In [4]:
# Print kiara's description of the Zenodo onboarding operation before using it.
! kiara operation explain topic_modelling.create_table_from_zenodo


╭─ Operation: [1;3mtopic_modelling.create_table_from_zenodo[0m ────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module retrieves text files from a specified folder   │
│  [3m               [0m  hosted on Zenodo.                                          │
│  [3m               [0m                                                             │
│  [3m               [0m  It takes the DOI and the name of the file as inputs. It    │
│  [3m               [0m  outputs a table with two columns: one for the file names   │
│  [3m               [0m  and the other for the content of these files.              │
│  [3m               [0m                                                             │
│  [3m               [0m  Dependencies:                                              │
│  [3m               [0m                                                             │
│  [3m   

In [5]:
# Inputs for the Zenodo onboarding operation: which record to query and which
# archive of that record to fetch.
# NOTE(review): `doi` holds only a numeric identifier here, presumably the Zenodo
# record number rather than a full DOI string; confirm against the operation's inputs.
create_table_from_zenodo_inputs = dict(
    doi="4596345",
    file_name="ChroniclItaly_3.0_original.zip",
)

In [6]:
# Run the Zenodo onboarding operation with the inputs defined in the previous cell.
# A `comment` is passed, as in every other `run_job` call of this notebook, so
# that this job is submitted the same way as the others.
create_table_from_zenodo_results = kiara.run_job(
    'topic_modelling.create_table_from_zenodo',
    inputs=create_table_from_zenodo_inputs,
    comment="Onboard the corpus from Zenodo",
)

In [7]:
# Display the results returned by the Zenodo onboarding job.
create_table_from_zenodo_results

In [8]:
# Keep a handle on the 'corpus_table' output of the Zenodo job.
# NOTE(review): this variable is not referenced again in the visible part of the
# notebook; section 2 reads the table created in step 1.3 instead.
corpus_table_zenodo = create_table_from_zenodo_results['corpus_table']

### 1.2. Get files from GitHub

In [2]:
# Print kiara's description of the GitHub download operation, including its inputs.
! kiara operation explain download.file_bundle.from.github


╭─ Operation: [1;3mdownload.file_bundle.from.github[0m ────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Download a file bundle from a remote github repository.    │
│  [3m               [0m                                                             │
│  [3m               [0m  If 'sub_path' is not specified, the whole repo will be     │
│  [3m               [0m  used.                                                      │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m           [0m [1m          [0m [1m          [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescript…[0m

In [3]:
# Inputs for the GitHub download: the repository owner and name, the folder
# inside the repository to take, and the file filter to apply.
dl_file_bundle_gh_inputs = dict(
    user="DHARPA-Project",
    repo="kiara.examples",
    sub_path="kiara.examples-main/examples/workshops/dh_benelux_2023/data",
    include_files=["txt"],
)

In [5]:
# Download the file bundle described by `dl_file_bundle_gh_inputs`.
dl_file_bundle_gh_results = kiara.run_job(
    "download.file_bundle.from.github",
    inputs=dl_file_bundle_gh_inputs,
    comment=" ",
)

In [6]:
# Display the results returned by the GitHub download job.
dl_file_bundle_gh_results

In [7]:
# Extract the 'file_bundle' output of the download job; it is the input of step 1.3.
dl_file_bundle_gh = dl_file_bundle_gh_results['file_bundle']

### 1.3. Create a table from the file bundle

In [8]:
# Print kiara's description of the operation that turns a file bundle into a table.
! kiara operation explain create.table.from.file_bundle


╭─ Operation: [1;3mcreate.table.from.file_bundle[0m ───────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Create a table value from a text file_bundle.              │
│  [3m               [0m                                                             │
│  [3m               [0m  The resulting table will have (at a minimum) the           │
│  [3m               [0m  following columns:                                         │
│  [3m               [0m                                                             │
│  [3m               [0m  [1;33m • [0mid: an auto-assigned index                              │
│  [3m               [0m  [1;33m • [0mrel_path: the relative path of the file (from the       │
│  [3m               [0m  [1;33m   [0mprovided base path)                                     │
│  [3m               [0m  [1;33m • [0mcontent: the text file c

In [9]:
# The file bundle downloaded from GitHub in step 1.2 is the only input;
# no local path is set in this cell.
create_table_from_bundle_inputs = dict(file_bundle=dl_file_bundle_gh)

In [12]:
# Build the corpus table from the file bundle.
create_table_from_bundle_results = kiara.run_job(
    "create.table.from.file_bundle",
    inputs=create_table_from_bundle_inputs,
    comment=" ",
)

In [13]:
# Display the results returned by the table-creation job.
create_table_from_bundle_results

## 2. Tokenize corpus

In [14]:
# Print kiara's description of the tokenization operation before using it.
! kiara operation explain topic_modelling.tokenize_array


╭─ Operation: [1;3mtopic_modelling.tokenize_array[0m ──────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  This module creates tokens from an array or from a         │
│  [3m               [0m  table.                                                     │
│  [3m               [0m                                                             │
│  [3m               [0m  It returns a table containing the initial array or         │
│  [3m               [0m  table, and the tokens as a new column. It is possible to   │
│  [3m               [0m  tokenize by word or by character. If not specified,        │
│  [3m               [0m  tokenization is done by word.                              │
│  [3m               [0m                                                             │
│  [3m               [0m  Dependencies:                                              │
│  [3m   

We start by getting an array from the table that contains the content to tokenize.

In [15]:
# Print kiara's description of the column-picking operation, including its inputs.
! kiara operation explain table.pick.column


╭─ Operation: [1;3mtable.pick.column[0m ───────────────────────────────────────────────╮
│                                                                              │
│  [3m [0m[3mDocumentation[0m[3m [0m  Pick one column from a table, returning an array.          │
│                                                                              │
│  [3m [0m[3mInputs       [0m[3m [0m                                                             │
│  [3m               [0m   [1m [0m[1mfield    [0m[1m [0m [1m        [0m [1m           [0m [1m          [0m [1m          [0m    │
│  [3m               [0m   [1m [0m[1mname     [0m[1m [0m [1m [0m[1mtype  [0m[1m [0m [1m [0m[1mdescript…[0m[1m [0m [1m [0m[1mRequired[0m[1m [0m [1m [0m[1mDefault [0m[1m [0m    │
│  [3m               [0m   ──────────────────────────────────────────────────────    │
│  [3m               [0m   [3m [0m[3mtable    [0m[3m [0m  table    A table.    [1mye

In [16]:
# Select the "content" column of the table built in step 1.3.
pick_column_inputs = dict(
    table=create_table_from_bundle_results["table"],
    column_name="content",
)

In [17]:
# Extract the selected column from the table as an array.
pick_column_results = kiara.run_job(
    "table.pick.column",
    inputs=pick_column_inputs,
    comment=" ",
)

We can now use the array as input for the tokenize_array operation.

In [18]:
# Feed the array picked from the table into the tokenizer.
# NOTE(review): `column_name` is passed alongside an array input; confirm whether
# the operation uses it when no table is supplied.
tokenize_array_inputs = dict(
    corpus_array=pick_column_results["array"],
    column_name="content",
)

In [19]:
# Tokenize the corpus array.
tokenize_array_results = kiara.run_job(
    "topic_modelling.tokenize_array",
    inputs=tokenize_array_inputs,
    comment=" ",
)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariella.decrouychan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Display the results returned by the tokenization job.
tokenize_array_results