In [25]:
import json
from json import JSONDecoder

import sys
sys.path.append('../')
sys.path.append('../andre')
import andre.utils as utils

# json extraction

In [26]:
# Specify the path to the JSON file
file_path = '../book_themes_alignment_data/thema_v1.5_fr.json'


class CustomDecoder(JSONDecoder):
    def decode(self, s):
        s = s.replace('\xa0', ' ')
        return super().decode(s)

# Read the JSON file
with open(file_path, 'r', encoding="utf8") as file:
    data = json.load(file, cls=CustomDecoder)

print(data.keys())
# Access the extracted data
thema_codes = data["CodeList"]["ThemaCodes"]["Code"]

# Print the extracted data
print(len(thema_codes))


dict_keys(['CodeList'])
8645


In [20]:
thema_codes

[{'CodeValue': 'A',
  'CodeDescription': 'Arts',
  'CodeNotes': 'Utilisez tous les codes A* pour les ouvrages spécialisés et généraux, qu’ils soient richement illustrés ou majoritairement textuel. Préférez les codes WF* pour les ouvrages liés à un loisir ou un passe-temps, en le complétant au besoin par le(s) code(s) A*. Utilisez au besoin tous les codes A* avec d’autres codes et qualificateurs, particulièrement avec les qualificateurs de style 6*, de lieux 1* et historiques 3*',
  'CodeParent': '',
  'IssueNumber': 1,
  'Modified': 1.4},
 {'CodeValue': 'AB',
  'CodeDescription': 'Arts\xa0: généralités',
  'CodeNotes': '',
  'CodeParent': 'A',
  'IssueNumber': 1,
  'Modified': 1.5},
 {'CodeValue': 'ABA',
  'CodeDescription': 'Théorie de l’art',
  'CodeNotes': 'Lié\xa0: QDTN',
  'CodeParent': 'AB',
  'IssueNumber': 1,
  'Modified': ''},
 {'CodeValue': 'ABC',
  'CodeDescription': 'Conservation, restauration et entretien d’œuvres d’art',
  'CodeNotes': 'Utilisez tous les codes A* pour les

In [5]:
thema_dict = {}
for node in thema_codes:
    thema_dict[node["CodeValue"]] = node
thema_dict

{'A': {'CodeValue': 'A',
  'CodeDescription': 'The Arts',
  'CodeNotes': 'Use all A* codes for: specialist and general adult titles, including both highly illustrated and more text-based works. For a hobby or recreational approach, prefer a WF* code as the main subject and supplement with A* code(s) where appropriate. Use all A* codes with: other subject categories and qualifiers as appropriate, in particular STYLE 6*, plus PLACE 1* and TIME PERIOD 3* Qualifiers',
  'CodeParent': '',
  'IssueNumber': 1,
  'Modified': 1.4},
 'AB': {'CodeValue': 'AB',
  'CodeDescription': 'The arts: general topics',
  'CodeNotes': '',
  'CodeParent': 'A',
  'IssueNumber': 1,
  'Modified': 1.5},
 'ABA': {'CodeValue': 'ABA',
  'CodeDescription': 'Theory of art',
  'CodeNotes': 'See also: QDTN',
  'CodeParent': 'AB',
  'IssueNumber': 1,
  'Modified': ''},
 'ABC': {'CodeValue': 'ABC',
  'CodeDescription': 'Conservation, restoration and care of artworks',
  'CodeNotes': 'Use with: other A* codes for works abo

In [6]:
len(thema_dict)

8645

In [7]:
import re
from collections import defaultdict

depth_dict = defaultdict(list)
for code in thema_dict:
    code_temp = str(code)
    
    if "-" in code_temp:
        # print(code)
        code_temp = re.sub(r'-.*-', 'Z', code_temp)  # replace "-*-" pattern with "Z"
        # print(code_temp)
        # print(len(code_temp)-1)
    else:
        code_length = len(code_temp) - 1 # root has depth 0

    depth_dict[code_length].append(code)
    
depth_dict

defaultdict(list,
            {0: ['A',
              'C',
              'D',
              'F',
              'G',
              'J',
              'K',
              'L',
              'M',
              'N',
              'P',
              'Q',
              'R',
              'S',
              'T',
              'U',
              'V',
              'W',
              'X',
              'Y',
              1,
              2,
              3,
              4,
              5,
              6],
             1: ['AB',
              'AF',
              'AG',
              'AJ',
              'AK',
              'AM',
              'AT',
              'AV',
              'CB',
              'CF',
              'CJ',
              'DB',
              'DC',
              'DD',
              'DN',
              'DS',
              'FB',
              'FC',
              'FD',
              'FF',
              'FG',
              'FH',
              'FJ',
              'FK',
             

In [8]:
from tabulate import tabulate
for key in depth_dict:
    print(key, len(depth_dict[key]))    
    table = []
    for key in depth_dict:
        table.append([key, len(depth_dict[key])])

    table_headers = ["Depth", "Count"]
    table_str = tabulate(table, headers=table_headers, tablefmt="grid")

print(table_str)


0 26
1 861
2 1468
3 4594
4 1456
5 213
6 27
+---------+---------+
|   Depth |   Count |
|       0 |      26 |
+---------+---------+
|       1 |     861 |
+---------+---------+
|       2 |    1468 |
+---------+---------+
|       3 |    4594 |
+---------+---------+
|       4 |    1456 |
+---------+---------+
|       5 |     213 |
+---------+---------+
|       6 |      27 |
+---------+---------+


# all nodes

## description

In [9]:
all_nodes_descriptions = [node["CodeDescription"] for node in thema_codes]
all_nodes_descriptions

['The Arts',
 'The arts: general topics',
 'Theory of art',
 'Conservation, restoration and care of artworks',
 'Forgery, falsification and theft of artworks',
 'Art: financial aspects',
 'The Arts: art forms',
 'Paintings and painting',
 'Paintings and painting in watercolours or pastels',
 'Paintings and painting in oils',
 'Murals and wall paintings',
 'Paintings and painting in ink',
 'Drawing and drawings',
 'Drawing and drawings in pencil, charcoal, crayon or pastel',
 'Drawing and drawings in pen or brush and ink',
 'Prints and printmaking',
 'Other graphic or visual art forms',
 'Body art and tattoos',
 'Non-graphic and electronic art forms',
 'Sculpture',
 'Carvings, masks, reliefs',
 'Precious metal, precious stones and jewellery: artworks and design',
 'Installation art',
 'Performance art',
 'Digital, video and new media arts',
 'Ceramics, mosaic and glass: artworks',
 'Decorative arts',
 'Textile artworks',
 'The Arts: treatments and subjects',
 'History of art',
 'Individ

In [10]:
all_nodes_descriptions_preprocessed = utils.preprocess_theme_list(all_nodes_descriptions)
all_nodes_descriptions_preprocessed = list(set(all_nodes_descriptions_preprocessed))
print(all_nodes_descriptions_preprocessed)

['nuxalk', 'hydroponics', 'haute-normandie', 'morelos', 'slovenian', 'wildlife:', 'comfort', 'beheira', 'nyankore', 'breda', 'lozère', 'bourgogne-franche-comté', 'signal', 'durham', 'nuremburg', 'biobío', 'communications', 'nicosia', 'lusaka', 'irregular', 'sanctioning', 'babylonia', 'lunigiana', 'macgillycuddy’s', 'nagasaki', 'vire-normandie', 'montenegrin', '13th', 'istria', 'realism', 'jharkhand', 'violin', '711–1492', 'apiculture', 'epic', 'wallonian', 'alsace', 'nepal', 'mallorca', 'tahitian', 'project-based', 'tremiti', 'media', 'revolutions', 'mathematics', 'fauske', 'pomeranian', 'hebei', 'archaeological', 'cardiothoracic', 'republic', 'holocaust', 'tokyo', '1945–1989', 'dublin', '1861–1877', 'spanish-based', 'ithaca', 'hokkaido', 'santo', 'mathematical', 'thionville', 'tagalog', 'montevideo', 'military', 'windsor', 'cooking', 'mauritania', 'arachnids', 'development', 'blues', 'dunkirk', 'kursk', 'lent', 'mumbai', 'kendo', 'bergisches', 'hospitality', 'manufacturing', 'vegan', 

In [11]:
len(all_nodes_descriptions_preprocessed)

4914

## description and notes

In [29]:
all_nodes_descriptions_notes_raw = {}
for node in thema_codes:
    all_nodes_descriptions_notes_raw[node["CodeDescription"]] = node["CodeNotes"]

In [30]:
all_nodes_descriptions_notes_raw

{'Arts': 'Utilisez tous les codes A* pour les ouvrages spécialisés et généraux, qu’ils soient richement illustrés ou majoritairement textuel. Préférez les codes WF* pour les ouvrages liés à un loisir ou un passe-temps, en le complétant au besoin par le(s) code(s) A*. Utilisez au besoin tous les codes A* avec d’autres codes et qualificateurs, particulièrement avec les qualificateurs de style 6*, de lieux 1* et historiques 3*',
 'Arts : généralités': '',
 'Théorie de l’art': 'Lié : QDTN',
 'Conservation, restauration et entretien d’œuvres d’art': 'Utilisez tous les codes A* pour les ouvrages sur la conservation, la préservation, la rénovation, la restauration ou l’entretien de tout type d’art, y compris les bâtiments, les structures, les sculptures, les photos, les peintures, les arts décoratifs, etc.',
 'Contrefaçon, falsification et vol d’œuvres d’art': '',
 'Art : aspects financiers': 'Classez ici : les ventes d’art et vente aux enchères, le financement, le parrainage et le sponsoring

In [31]:
import json

# Specify the path to the JSON file
file_path = 'thema_all_nodes_descriptions_notes_raw_fr.json'

# Create a dictionary

# Dump the dictionary to JSON
with open(file_path, 'w') as file:
    json.dump(all_nodes_descriptions_notes_raw, file)


# root nodes

In [4]:
root_nodes = [node for node in thema_codes if node["CodeParent"] == ""]
root_nodes

[{'CodeValue': 'A',
  'CodeDescription': 'The Arts',
  'CodeNotes': 'Use all A* codes for: specialist and general adult titles, including both highly illustrated and more text-based works. For a hobby or recreational approach, prefer a WF* code as the main subject and supplement with A* code(s) where appropriate. Use all A* codes with: other subject categories and qualifiers as appropriate, in particular STYLE 6*, plus PLACE 1* and TIME PERIOD 3* Qualifiers',
  'CodeParent': '',
  'IssueNumber': 1,
  'Modified': 1.4},
 {'CodeValue': 'C',
  'CodeDescription': 'Language and Linguistics',
  'CodeNotes': 'Use all C* codes for: specialist and general adult titles. DO NOT USE: code ‘C’ itself, but select specific categories from section C*. Use all C* codes with: other subject categories and qualifiers where appropriate, in particular LANGUAGE 2* Qualifiers plus PLACE 1*, TIME PERIOD 3* and EDUCATIONAL PURPOSE 4* Qualifiers',
  'CodeParent': '',
  'IssueNumber': 1,
  'Modified': 1.4},
 {'Cod

In [5]:
root_nodes_descriptions = [node["CodeDescription"] for node in root_nodes]
root_nodes_descriptions

['The Arts',
 'Language and Linguistics',
 'Biography, Literature and Literary studies',
 'Fiction and Related items',
 'Reference, Information and Interdisciplinary subjects',
 'Society and Social Sciences',
 'Economics, Finance, Business and Management',
 'Law',
 'Medicine and Nursing',
 'History and Archaeology',
 'Mathematics and Science',
 'Philosophy and Religion',
 'Earth Sciences, Geography, Environment, Planning',
 'Sports and Active outdoor recreation',
 'Technology, Engineering, Agriculture, Industrial processes',
 'Computing and Information Technology',
 'Health, Relationships and Personal development',
 'Lifestyle, Hobbies and Leisure',
 'Graphic novels, Comic books, Manga, Cartoons',
 'Children’s, Teenage and Educational',
 'Place qualifiers',
 'Language qualifiers',
 'Time period qualifiers',
 'Educational purpose qualifiers',
 'Interest qualifiers',
 'Style qualifiers']

In [6]:
utils.preprocess_theme_list(root_nodes_descriptions)

['arts',
 'language',
 'biography',
 'fiction',
 'reference',
 'society',
 'economics',
 'law',
 'medicine',
 'history',
 'mathematics',
 'philosophy',
 'earth',
 'sports',
 'technology',
 'computing',
 'health',
 'lifestyle',
 'graphic',
 'children’s',
 'place',
 'language',
 'time',
 'educational',
 'interest',
 'style']

In [7]:
node_descriptions = [node["CodeDescription"] for node in thema_codes]
node_descriptions

['The Arts',
 'The arts: general topics',
 'Theory of art',
 'Conservation, restoration and care of artworks',
 'Forgery, falsification and theft of artworks',
 'Art: financial aspects',
 'The Arts: art forms',
 'Paintings and painting',
 'Paintings and painting in watercolours or pastels',
 'Paintings and painting in oils',
 'Murals and wall paintings',
 'Paintings and painting in ink',
 'Drawing and drawings',
 'Drawing and drawings in pencil, charcoal, crayon or pastel',
 'Drawing and drawings in pen or brush and ink',
 'Prints and printmaking',
 'Other graphic or visual art forms',
 'Body art and tattoos',
 'Non-graphic and electronic art forms',
 'Sculpture',
 'Carvings, masks, reliefs',
 'Precious metal, precious stones and jewellery: artworks and design',
 'Installation art',
 'Performance art',
 'Digital, video and new media arts',
 'Ceramics, mosaic and glass: artworks',
 'Decorative arts',
 'Textile artworks',
 'The Arts: treatments and subjects',
 'History of art',
 'Individ