In [188]:
import json
import os
import pandas as pd

In [189]:
# Folder that receives every CSV written by this notebook.
# Keep the trailing slash: file paths are later built by plain string concatenation.
output_dir = 'src/data/csv/'

# Make sure the folder is there; exist_ok=True lets the cell be re-run without error
os.makedirs(name=output_dir, exist_ok=True)

In [190]:
# Read both JSON sources as UTF-8 text and parse them.
# For the characters file only element 0 of the parsed document is kept.
with open('src/data/json/characters.json', mode='r', encoding='utf-8') as characters_file:
    characters_data = json.loads(characters_file.read())[0]

# The chapters document is kept whole
with open('src/data/json/chapters.json', mode='r', encoding='utf-8') as chapters_file:
    chapters_data = json.loads(chapters_file.read())

In [191]:
# One record per entry under 'humans': the entry's key becomes the 'id' field,
# followed by the entry's own attributes
characters = [
    {'id': character_id, **attributes}
    for character_id, attributes in characters_data.get('humans', {}).items()
]

# Tabular view of the records
characters_df = pd.DataFrame(characters)

# Persist the untransformed table, then preview its first rows
characters_df.to_csv(output_dir + 'characters_test.csv', index=False)
characters_df.head()

Unnamed: 0,id,name,alias(es),gender,house(s),allegiance(s),title(s),family,race,culture(s),religion,physical description,birth,death,first appearance,last appearance
0,Aegon I Targaryen,Aegon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 3, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",{'father': 'Aerion Targaryen (son of Daemion)'...,Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Purple', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'Crownland...","{'continent': 'Westeros', 'region': 'The Crown...",,
1,Aegon II Targaryen,Aegon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","{'father': 'Viserys I Targaryen', 'mother': 'A...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Violet', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'The Crown...","{'continent': 'Westeros', 'region': 'The Crown...",,
2,Aegon Targaryen (son of Rhaegar),Aegon Targaryen,"[{'book': 'A Clash of Kings', 'chapter': 48, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","{'father': 'Rhaegar Targaryen', 'mother': 'Eli...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",,"{'eye color': 'Purple', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'Crownland...","{'continent': 'Westeros', 'region': 'The Crown...",,
3,Aemon Targaryen (son of Maekar I),Aemon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': None, 'chapter': None, 'title': 'Mae...","{'father': 'Maekar I Targaryen', 'mother': 'Dy...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Milk white', 'hair color': 'Whi...","{'continent': 'Westeros', 'region': 'The Crown...","{'continent': 'Westeros', 'region': 'The Crown...",,
4,Aemon Targaryen (son of Viserys II),Aemon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","{'father': 'Viserys II Targaryen', 'mother': '...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Purple', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'The Crown...","{'continent': None, 'region': None, 'location'...",,


In [192]:
def flatten_or_none(nested_lists):
    """
    Flattens arbitrarily nested lists into a single list, dropping None entries.

    Parameters:
    - nested_lists: A (possibly nested) list of values; a falsy input yields nothing.

    Returns:
    - The flat list of non-None leaves in their original order,
      or None when no value is left.
    """
    def walk(node):
        # Falsy nodes (None, empty list, ...) contribute nothing
        if not node:
            return
        for entry in node:
            if isinstance(entry, list):
                # Only real lists are descended into
                yield from walk(entry)
            elif entry is not None:
                yield entry

    collected = list(walk(nested_lists))
    return collected or None

def extract_values(item, key):
    """
    Pulls the values stored under `key` out of a dictionary or a list of dictionaries.

    Parameters:
    - item: The input data: a dictionary, a list (of dictionaries), or anything else.
    - key: Describes what to extract:
        * str   -> the value stored under that key.
        * tuple -> a path: the first element is looked up, then the remainder is applied
                   to whatever was found. The flat form ('a', 'b', 'c') and the nested
                   form ('a', ('b', 'c')) are equivalent.
        * list  -> several alternative keys; each one is extracted from `item` and the
                   results are merged into one flat list.

    Returns:
    - None if `item` is falsy (None, empty list/dict/string, 0).
    - For a dictionary: whatever the key/path resolves to (None if absent).
    - For a list: a flat list of the values found in its dictionary elements, with None
      entries dropped; elements that are not dictionaries are kept as they are.
      None if nothing is left.
    - For a list-type key: a flat list of everything found, or None if nothing was found.
    - None if `item` is neither a list nor a dictionary, or `key` has an unsupported type.
    """
    if not item:
        return None

    if isinstance(key, tuple):
        if len(key) > 2:
            # Flat path: re-nest it so the remainder travels down as a single sub-key.
            # Without this, everything after the second element would be ignored.
            key = (key[0], key[1:])

        if isinstance(item, list):
            return flatten_or_none([
                extract_values(element.get(key[0]), key[1]) if isinstance(element, dict) else element
                for element in item
            ])

        if isinstance(item, dict):
            return extract_values(item.get(key[0]), key[1])

    if isinstance(key, str):
        if isinstance(item, list):
            return flatten_or_none([
                element.get(key) if isinstance(element, dict) else element
                for element in item
            ])

        if isinstance(item, dict):
            return item.get(key)

    if isinstance(key, list):
        # Each alternative is resolved independently against the same item
        return flatten_or_none([extract_values(item, alternative) for alternative in key])

    # item is neither a list nor a dictionary, or key has an unsupported type
    return None

def replace_columns(df, old_col, new_cols):
    """
    Swaps one column of a DataFrame for several new ones, placed where the old column stood.

    Parameters:
    - df: The pandas DataFrame to modify.
    - old_col: Name of the column to remove.
    - new_cols: Dictionary mapping each new column name to that column's data;
      the columns are inserted in the dictionary's order.

    Returns:
    - None. `df` itself is changed; nothing happens when `old_col` is absent.
    """
    if old_col in df.columns:
        # Remember the slot before the column disappears
        first_slot = df.columns.get_loc(old_col)
        df.drop(columns=[old_col], inplace=True)
        # Fill the freed slot from left to right
        for offset, name in enumerate(new_cols):
            df.insert(first_slot + offset, name, new_cols[name])

# Preview the first rows of characters_df
display(characters_df.head())

Unnamed: 0,id,name,alias(es),gender,house(s),allegiance(s),title(s),family,race,culture(s),religion,physical description,birth,death,first appearance,last appearance
0,Aegon I Targaryen,Aegon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 3, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",{'father': 'Aerion Targaryen (son of Daemion)'...,Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Purple', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'Crownland...","{'continent': 'Westeros', 'region': 'The Crown...",,
1,Aegon II Targaryen,Aegon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","{'father': 'Viserys I Targaryen', 'mother': 'A...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Violet', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'The Crown...","{'continent': 'Westeros', 'region': 'The Crown...",,
2,Aegon Targaryen (son of Rhaegar),Aegon Targaryen,"[{'book': 'A Clash of Kings', 'chapter': 48, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","{'father': 'Rhaegar Targaryen', 'mother': 'Eli...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",,"{'eye color': 'Purple', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'Crownland...","{'continent': 'Westeros', 'region': 'The Crown...",,
3,Aemon Targaryen (son of Maekar I),Aemon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': None, 'chapter': None, 'title': 'Mae...","{'father': 'Maekar I Targaryen', 'mother': 'Dy...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Milk white', 'hair color': 'Whi...","{'continent': 'Westeros', 'region': 'The Crown...","{'continent': 'Westeros', 'region': 'The Crown...",,
4,Aemon Targaryen (son of Viserys II),Aemon Targaryen,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Male,"[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","[{'book': 'A Game of Thrones', 'chapter': 0, '...","{'father': 'Viserys II Targaryen', 'mother': '...",Valyrian,"[{'book': 'A Game of Thrones', 'chapter': 0, '...",Faith of the Seven,"{'eye color': 'Purple', 'hair color': 'Silver-...","{'continent': 'Westeros', 'region': 'The Crown...","{'continent': None, 'region': None, 'location'...",,


In [None]:
# Keys shared by the 'birth' and 'death' entries
_place_fields = ('continent', 'region', 'location', 'date')

# Describes how each source column of characters_df is reduced:
# - a string value is a single key to extract (the column keeps its name);
# - a dictionary value maps each new column name to the key (or key path) that feeds it.
characters_mapping = {
    'title(s)': 'title',
    'house(s)': 'house',
    'culture(s)': 'culture',
    'allegiance(s)': 'allegiance',
    'alias(es)': 'alias',
    'family': {
        'father': 'father',
        'mother': 'mother',
        # Plural relations: look up the plural key, then the singular key inside it
        **{
            plural: (plural, singular)
            for plural, singular in (
                ('sibling(s)', 'sibling'),
                ('child(ren)', 'child'),
                ('spouse(s)', 'spouse'),
                ('lover(s)', 'lover'),
            )
        },
    },
    'birth': {f'birth_{field}': field for field in _place_fields},
    'death': {
        **{f'death_{field}': field for field in _place_fields},
        'death_reason': ('by', 'type'),
        'killed_by': ('by', 'name'),
    },
    # 'first appearance' and 'last appearance' share the same layout
    **{
        f'{which} appearance': {
            f'{which}_appearance_{field}': field for field in ('book', 'chapter')
        }
        for which in ('first', 'last')
    },
    'physical description': {
        'eye color': 'eye color',
        'hair color': 'hair color',
    },
}

# Apply characters_mapping to characters_df, one source column at a time
for old_col, mappings in characters_mapping.items():
    if old_col not in characters_df.columns:
        # Nothing to do for columns that are absent (or were already replaced)
        continue

    if isinstance(mappings, str):
        # Single key: the column keeps its name, each cell is reduced through extract_values
        characters_df[old_col] = characters_df[old_col].apply(extract_values, args=(mappings,))
    else:
        # Several keys: the column is replaced, in place, by one new column per entry
        new_cols = {
            new_col: characters_df[old_col].apply(extract_values, args=(key,))
            for new_col, key in mappings.items()
        }
        replace_columns(characters_df, old_col, new_cols)

# Save the transformed data to a CSV file
characters_df.to_csv(output_dir + 'characters_test.csv', index=False)

# Print all the values of the first row.
# .iloc[0] is positional, so it does not depend on the index containing the label 0.
for col in characters_df.columns:
    print(f'{col}: {characters_df[col].iloc[0]}')

id: Aegon I Targaryen
name: Aegon Targaryen
alias(es): ['Aegon the Conqueror', 'Aegon the Dragonlord', 'Aegon Dragonlord', 'Aegon the Dragon']
gender: Male
house(s): ['Targaryen']
allegiance(s): None
title(s): ['Lord of Dragonstone', 'King of All Westeros', 'Shield of His People', 'King of the Andals, the Rhoynar, and the First Men', 'Lord of the Seven Kingdoms', 'Protector of the Realm']
father: Aerion Targaryen (son of Daemion)
mother: Valaena Velaryon
sibling(s): ['Rhaenys Targaryen', 'Visenya Targaryen']
child(ren): ['Aenys I Targaryen', 'Maegor I Targaryen']
spouse(s): ['Visenya Targaryen', 'Rhaenys Targaryen']
lover(s): None
race: Valyrian
culture(s): ['Crownlander']
religion: Faith of the Seven
eye color: Purple
hair color: Silver-gold
birth_continent: Westeros
birth_region: Crownlands
birth_location: Dragonstone
birth_date: 27 BC
death_continent: Westeros
death_region: The Crownlands
death_location: Dragonstone
death_date: 37 AC
death_reason: Nature
killed_by: Stroke
first_appe

In [194]:
chapters = []

# Extract the data for each chapter, prefixed with the fields of the book it belongs to
for book in chapters_data.get('books') or []:
    # Book-level fields are identical for every chapter of the book: build them once
    book_fields = {
        'book_title': book.get('title'),
        'book_author': book.get('author'),
        'original_version': book.get('original_version'),
        'folio_society_version': book.get('folio_society_version'),
    }
    for chapter in book.get('chapters') or []:
        # Chapter keys come last, so they win over a book field with the same name
        chapters.append({**book_fields, **chapter})

# Convert the data to a DataFrame
chapters_df = pd.DataFrame(chapters)

# Save the data to a CSV file
chapters_df.to_csv(output_dir + 'chapters_test.csv', index=False)
chapters_df.head()

Unnamed: 0,book_tite,book_author,original_version,folio_society_version,number,title,id,pov,original_counts,folio_society_counts,universe_year,summary,theme(s),characters,continents,groups,events
0,A Game of Thrones,George R. R. Martin,"{'publication_date': '1997-08-04', 'page_count...","{'publication_date': '2019-07-16', 'page_count...",0,Prologue,AGOT-0-0,Will,"{'page_count': 11, 'word_count': None}","{'page_count': None, 'word_count': None}",297,While tracking a group of wildling raiders bey...,,"{'direwolves': None, 'humans': {'present': ['W...","{'present': [{'name': 'Westeros', 'regions': {...","{'factions': None, 'houses': {'present': ['Hou...",
1,A Game of Thrones,George R. R. Martin,"{'publication_date': '1997-08-04', 'page_count...","{'publication_date': '2019-07-16', 'page_count...",1,Bran I,AGOT-0-1,Bran Stark,"{'page_count': 9, 'word_count': None}","{'page_count': None, 'word_count': None}",298,Lord Eddard Stark dispenses justice to a Night...,,"{'direwolves': {'present': ['Ghost', 'Grey Win...","{'present': [{'name': 'Westeros', 'regions': {...","{'factions': None, 'houses': {'present': ['Hou...","{'present': None, 'mentioned': [{'type': 'Peri..."
2,A Game of Thrones,George R. R. Martin,"{'publication_date': '1997-08-04', 'page_count...","{'publication_date': '2019-07-16', 'page_count...",2,Catelyn I,AGOT-0-2,Catelyn Stark,"{'page_count': 6, 'word_count': None}","{'page_count': None, 'word_count': None}",298,"Lady Catelyn Tully seeks out her husband, Lord...",,"{'direwolves': None, 'humans': {'present': ['C...","{'present': [{'name': 'Westeros', 'regions': {...","{'factions': None, 'houses': {'present': ['Hou...","{'present': None, 'mentioned': [{'type': 'Cata..."
3,A Game of Thrones,George R. R. Martin,"{'publication_date': '1997-08-04', 'page_count...","{'publication_date': '2019-07-16', 'page_count...",3,Daenerys I,AGOT-0-3,Daenerys Targaryen,"{'page_count': 11, 'word_count': None}","{'page_count': None, 'word_count': None}",298,The exiled Princess Daenerys Targaryen prepare...,,"{'direwolves': None, 'humans': {'present': ['C...","{'present': [{'name': 'Essos', 'regions': {'pr...","{'factions': None, 'houses': {'present': ['Hou...","{'present': None, 'mentioned': [{'type': 'Batt..."
4,A Game of Thrones,George R. R. Martin,"{'publication_date': '1997-08-04', 'page_count...","{'publication_date': '2019-07-16', 'page_count...",4,Eddard I,AGOT-0-4,Eddard Stark,"{'page_count': 10, 'word_count': None}","{'page_count': None, 'word_count': None}",298,King Robert I Baratheon arrives with his party...,,"{'direwolves': None, 'humans': {'present': ['C...","{'present': [{'name': 'Westeros', 'regions': {...","{'factions': None, 'houses': {'present': ['Hou...","{'present': None, 'mentioned': [{'type': 'War'..."


In [195]:
def _nest(*steps):
    """Folds steps a, b, c, d into the nested pair form (a, (b, (c, d))) used for multi-level keys."""
    path = steps[-1]
    for step in reversed(steps[:-1]):
        path = (step, path)
    return path


# For each source column of chapters_df: new column name -> key, nested key path,
# or list of alternative paths whose results are merged.
chapters_mapping = {
    # Book-level edition details
    **{
        f'{edition}_version': {
            f'{edition}_book_{field}': field
            for field in ('publication_date', 'page_count', 'edition', 'isbn', 'language')
        }
        for edition in ('original', 'folio_society')
    },
    # Chapter-level counts per edition
    **{
        f'{edition}_counts': {
            f'{edition}_chapter_{field}': field
            for field in ('page_count', 'word_count')
        }
        for edition in ('original', 'folio_society')
    },
    'characters': {
        f'{state}_{kind}': (kind, state)
        for kind in ('direwolves', 'humans')
        for state in ('present', 'mentioned')
    },
    'continents': {
        'present_continents': _nest('present', 'name'),
        'mentioned_continents': _nest('mentioned', 'name'),

        'present_regions': _nest('present', 'regions', 'present', 'name'),
        'mentioned_regions': [
            _nest('mentioned', 'regions', 'mentioned', 'name'),
            _nest('present', 'regions', 'mentioned', 'name'),
        ],

        'present_locations': _nest('present', 'regions', 'present', 'locations', 'present', 'name'),
        'mentioned_locations': [
            _nest('present', 'regions', 'present', 'locations', 'mentioned', 'name'),
            _nest('present', 'regions', 'mentioned', 'locations', 'mentioned', 'name'),
            _nest('mentioned', 'regions', 'mentioned', 'locations', 'mentioned', 'name'),
        ],
    },
    # Column names use underscores where the source keys contain spaces
    'groups': {
        f'{state}_{kind.replace(" ", "_")}': (kind, state)
        for kind in (
            'factions',
            'houses',
            'mercenary companies',
            'military orders',
            'other orders',
            'peoples',
            'religious orders',
        )
        for state in ('present', 'mentioned')
    },
    'events': {
        f'{state}_events_{field}s': (state, field)
        for state in ('present', 'mentioned')
        for field in ('type', 'name')
    },
}

# Expand each mapped column of chapters_df into the columns described by chapters_mapping
for old_col, mappings in chapters_mapping.items():
    if old_col not in chapters_df.columns:
        continue
    # One new column per mapping entry, computed cell by cell through extract_values
    new_cols = {
        new_col: chapters_df[old_col].apply(extract_values, args=(key,))
        for new_col, key in mappings.items()
    }
    # The new columns take the place of the source column (in-place change)
    replace_columns(chapters_df, old_col, new_cols)

# Write the expanded table to CSV and preview it
chapters_df.to_csv(output_dir + 'chapters_test.csv', index=False)
chapters_df.head(11)

Unnamed: 0,book_tite,book_author,original_book_publication_date,original_book_page_count,original_book_edition,original_book_isbn,original_book_language,folio_society_book_publication_date,folio_society_book_page_count,folio_society_book_edition,...,present_other_orders,mentioned_other_orders,present_peoples,mentioned_peoples,present_religious_orders,mentioned_religious_orders,present_events_types,present_events_names,mentioned_events_types,mentioned_events_names
0,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,[Maesters],[The Others],[Wildlings],,,,,,
1,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,,,"[The Andals, The First Men, The Others, The Rh...",,,,,[Period],[The Long Night]
2,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,[Maesters],,"[The children of the Forest, The Others, Wildl...",,[Septons],,,[Cataclysm],[The Doom of Valyria]
3,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,,[The Dothraki],"[The Andals, The First Men, The Rhoynar]",,[Red Priests],,,"[Battle, Cataclysm]","[The Battle of the Trident, The Doom of Valyria]"
4,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,,,[The Others],,,,,[War],[Greyjoy's Rebellion]
5,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,,,,,,,,[War],[The Conquest of Dorne]
6,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,[Maesters],,,[The Others],,,,,,
7,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,,,,,[Septas],,,,
8,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,[Maesters],,,,,,,[War],[The Dance of the Dragons]
9,A Game of Thrones,George R. R. Martin,1997-08-04,864,Unabridged Version,978-0-553-57340-4,English,2019-07-16,824,The Folio Society,...,,[Maesters],,,[Septons],,,,,


In [196]:
# Collect the human characters that are present or mentioned in at least one chapter
present_names = chapters_df['present_humans'].explode().dropna()
mentioned_names = chapters_df['mentioned_humans'].explode().dropna()
mentioned_or_present_humans = set(present_names) | set(mentioned_names)

# Collect the character ids listed in characters_df
characters_in_df = set(characters_df['id'].dropna())

# Characters referenced by the chapters but absent from characters_df
missing_characters = mentioned_or_present_humans.difference(characters_in_df)

# Show the missing characters
missing_characters

set()

In [197]:
# Characters listed in characters_df that no chapter marks as present or mentioned
additional_characters = characters_in_df.difference(mentioned_or_present_humans)

# Show the additional characters
additional_characters

{'Farlen', 'High Septon (fat one)'}