# Analyse a gedcom file
## 1. Install

In [394]:
# %pip install -U fastgedcom
# %pip install -U pandas
# %pip install -U plotly
# %pip install -U numpy
# %pip install -U treelib
%pip list | grep "fastgedcom\|pandas\|plotly\|numpy\|treelib"

fastgedcom                    0.0.4
numpy                         1.24.3
pandas                        2.0.1
pandas-gbq                    0.19.2
plotly                        5.14.1
treelib                       1.6.4
Note: you may need to restart the kernel to use updated packages.


## 2. Import Python functions

In [395]:
from datetime import datetime

from fastgedcom.family_aid import FamilyAid
from fastgedcom.base import FakeLine, TrueLine, is_true
from fastgedcom.parser import guess_encoding, parse
from plotly.express import bar, sunburst, violin
from plotly.graph_objects import Figure, Violin
from pandas import DataFrame, to_datetime
import plotly.io as pio
from numpy import timedelta64, nan


## 3. Input Gedcom file

In [396]:
GEDCOM_FILE = "/Users/vv/Downloads/VESSELINOV Family Tree-11.ged"

## 4. Load raw Gedcom file as a Dataframe

In [397]:
pio.templates.default = "plotly_dark"

with open(GEDCOM_FILE, "r", encoding=guess_encoding(GEDCOM_FILE)) as f:
    document, warnings = parse(f)

if warnings:
    print("Warnings: ", *warnings, sep="\n", end="---\n")

# booster = FamilyAid(gedcom)
# print(booster)
rows = []

for individual in document.get_records("INDI"):
    name = (document[individual.tag] > "NAME") >= "GIVN"
    surname = (document[individual.tag] > "NAME") >= "SURN"
    birth_date = (document[individual.tag] > "BIRT") >= "DATE"
    family_c = (document[individual.tag] > "FAMC").__str__().split(" ")[-1]
    family_p = (document[individual.tag] > "FAMS").__str__().split(" ")[-1]
    birth_place = (document[individual.tag] > "BIRT") >= "PLAC"
    death = is_true(document[individual.tag] > "DEAT")
    death_place = (document[individual.tag] > "DEAT") >= "PLAC"
    sex = (document[individual.tag] > "SEX").__str__()[-1]
    # sex = sex[-1]
    death_date = (document[individual.tag] > "DEAT") >= "DATE"

    rows.append(
        {
            "PersonID": individual.tag.replace("@", ""),
            "FirstName": name,
            "FamilyName": surname.upper(),
            "BirthDate": birth_date,
            "BirthPlace": birth_place,
            "DeathDate": death_date,
            "DeathPlace": death_place,
            "Dead": death,
            "Gender": sex,
            "FamilyID": family_c.replace("@", ""),
            "ParentFamilyID": family_p.replace("@", ""),
        }
    )

dtf = DataFrame.from_records(rows)
dtf.replace("<FakeLine>", nan, inplace=True)
dtf.set_index("PersonID", inplace=True)
dtf


Unnamed: 0_level_0,FirstName,FamilyName,BirthDate,BirthPlace,DeathDate,DeathPlace,Dead,Gender,FamilyID,ParentFamilyID
PersonID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
I432449656974,Velizar,VESSELINOV,8 Nov 1975,"Sofia, Sofia City, Bulgaria",,,False,M,F14,F56
I432449657093,Margarita Vladimirova,MATOVA,30 Apr 1938,"Svishtov, Veliko Turnovo, Bulgaria",,,False,F,F63,F14
I432449657204,Valentin,VESSELINOV,25 Sep 1938,"Vratsa, Vratsa, Bulgaria",2017,"Sofia, Sofia City, Bulgaria",True,M,F51,F14
I432449657296,Zoyka,MIHAILOVA,24 Feb 1906,"Ruse, Ruse, Bulgaria",21 Sep 1993,"Sofia, Sofia City, Bulgaria",True,F,F27,F63
I432449657316,Velimir (Vladimir-Vlado),MATOV,8 Apr 1897,"Babitsa, Pernik, Bulgaria",4 Jul 1963,"Sofia, Sofia City, Bulgaria",True,M,F7,F63
...,...,...,...,...,...,...,...,...,...,...
I432487335185,Iliana,STOYANOVA,22 Jul 1958,"Sofia, Sofia City, Bulgaria",,,False,F,,F29
I432487340580,Borislav,BATEMBERGSKI,,"Sofia, Sofia City, Bulgaria",,,False,M,F80,
I432487524615,Adam,MATOV,,,,,False,M,,F108
I432487656907,Ivan,MATOV,1890,"Babitsa, Pernik, Bulgaria",,"Sofia, Sofia City, Bulgaria",True,M,F7,


## 5. Add extra computations of the raw data model

In [398]:
dtf["BirthDT"] = to_datetime(dtf.BirthDate, errors="coerce", format=r"%d %b %Y")
dtf["BirthDay"] = dtf.BirthDT.dt.day.astype("Int64")
print(dtf["BirthDay"])
dtf["BirthMonth"] = dtf.BirthDT.dt.month_name()
dtf["BirthDT2"] = to_datetime(dtf.BirthDate, errors="coerce", format=r"%Y")
dtf.BirthDT = dtf.BirthDT.fillna(dtf.BirthDT2)

dtf["DeathDT"] = to_datetime(dtf.DeathDate, errors="coerce", format=r"%d %b %Y")

dtf["DeathDT2"] = to_datetime(dtf.DeathDate, errors="coerce", format=r"%Y")
dtf.DeathDT = dtf.DeathDT.fillna(dtf.DeathDT2)
dtf.loc[dtf.Dead == False, "DeathDT"] = dtf.DeathDT.fillna(datetime.today())
dtf["LifeDuration"] = (dtf.DeathDT - dtf.BirthDT) / timedelta64(1, "Y")

dtf["Name"] = (
    dtf.FirstName.replace(r"(\(.*\))", "", regex=True) + " " + dtf.FamilyName
)
dtf.loc[dtf.Dead == True, "Status"] = "Deceased"
dtf.loc[dtf.Dead == False, "Status"] = "Living"
dtf.loc[dtf.Dead.isnull(), "Status"] = "Unknown"
dtf.Gender.replace({"M": "Male", "F": "Female", "U": "Unknown"}, inplace=True)
dtf.drop(columns=["BirthDT2", "DeathDT2", "Dead"], inplace=True)

for i in ["Birth", "Death"]:
    dtf = dtf.join(
        dtf[f"{i}Place"]
        .str.split(", ", expand=True)
        .rename(columns={0: f"{i}City", 1: f"{i}Region", 2: f"{i}Country"})
    )

# print(dtf.loc[dtf.BirthPlace.str.contains(r",.*,", regex=True)])
# dtf.loc[dtf.BirthPlace.str.contains(r",.*,", regex=True), "BirthCountry"] = dtf.BirthPlace.str.split(",")[-1]
# dtf.loc[dtf.BirthPlace.str.contains(r",.*,", regex=True), "BirthRegion"] = dtf.BirthPlace.str.split(",")[-2]
# dtf.loc[dtf.BirthPlace.str.contains(r",.*,", regex=True), "BirthCity"] = dtf.BirthPlace.str.split(",")[-3]
dtf

PersonID
I432449656974       8
I432449657093      30
I432449657204      25
I432449657296      24
I432449657316       8
                 ... 
I432487335185      22
I432487340580    <NA>
I432487524615    <NA>
I432487656907    <NA>
I432491469571    <NA>
Name: BirthDay, Length: 299, dtype: Int64


Unnamed: 0_level_0,FirstName,FamilyName,BirthDate,BirthPlace,DeathDate,DeathPlace,Gender,FamilyID,ParentFamilyID,BirthDT,...,DeathDT,LifeDuration,Name,Status,BirthCity,BirthRegion,BirthCountry,DeathCity,DeathRegion,DeathCountry
PersonID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
I432449656974,Velizar,VESSELINOV,8 Nov 1975,"Sofia, Sofia City, Bulgaria",,,Male,F14,F56,1975-11-08,...,2023-05-27 20:27:08.343798,47.583157,Velizar VESSELINOV,Living,Sofia,Sofia City,Bulgaria,,,
I432449657093,Margarita Vladimirova,MATOVA,30 Apr 1938,"Svishtov, Veliko Turnovo, Bulgaria",,,Female,F63,F14,1938-04-30,...,2023-05-27 20:27:08.343798,85.133842,Margarita Vladimirova MATOVA,Living,Svishtov,Veliko Turnovo,Bulgaria,,,
I432449657204,Valentin,VESSELINOV,25 Sep 1938,"Vratsa, Vratsa, Bulgaria",2017,"Sofia, Sofia City, Bulgaria",Male,F51,F14,1938-09-25,...,2017-01-01 00:00:00.000000,78.323288,Valentin VESSELINOV,Deceased,Vratsa,Vratsa,Bulgaria,Sofia,Sofia City,Bulgaria
I432449657296,Zoyka,MIHAILOVA,24 Feb 1906,"Ruse, Ruse, Bulgaria",21 Sep 1993,"Sofia, Sofia City, Bulgaria",Female,F27,F63,1906-02-24,...,1993-09-21 00:00:00.000000,87.632877,Zoyka MIHAILOVA,Deceased,Ruse,Ruse,Bulgaria,Sofia,Sofia City,Bulgaria
I432449657316,Velimir (Vladimir-Vlado),MATOV,8 Apr 1897,"Babitsa, Pernik, Bulgaria",4 Jul 1963,"Sofia, Sofia City, Bulgaria",Male,F7,F63,1897-04-08,...,1963-07-04 00:00:00.000000,66.279452,Velimir MATOV,Deceased,Babitsa,Pernik,Bulgaria,Sofia,Sofia City,Bulgaria
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
I432487335185,Iliana,STOYANOVA,22 Jul 1958,"Sofia, Sofia City, Bulgaria",,,Female,,F29,1958-07-22,...,2023-05-27 20:27:08.343798,64.892746,Iliana STOYANOVA,Living,Sofia,Sofia City,Bulgaria,,,
I432487340580,Borislav,BATEMBERGSKI,,"Sofia, Sofia City, Bulgaria",,,Male,F80,,NaT,...,2023-05-27 20:27:08.343798,,Borislav BATEMBERGSKI,Living,Sofia,Sofia City,Bulgaria,,,
I432487524615,Adam,MATOV,,,,,Male,,F108,NaT,...,2023-05-27 20:27:08.343798,,Adam MATOV,Living,,,,,,
I432487656907,Ivan,MATOV,1890,"Babitsa, Pernik, Bulgaria",,"Sofia, Sofia City, Bulgaria",Male,F7,,1890-01-01,...,NaT,,Ivan MATOV,Deceased,Babitsa,Pernik,Bulgaria,Sofia,Sofia City,Bulgaria


In [399]:
# from treelib import Node, Tree

# tree = Tree()

# def per_row(row):
#     tree.create_node(row["Name"], row.name, parent=row["ParentFamilyID"])

# dtf.apply(per_row, axis=1)

# # tree.create_node("Jane", "jane", parent="harry")
# tree.show()

## 6. Plot life duration expectation

In [400]:
fig = bar(
    dtf[dtf.LifeDuration.notnull()],
    y="LifeDuration",
    x="Name",
    color="Status",
)

fig.update_layout(xaxis={"categoryorder": "total descending"})
fig.write_html("FamilyLiveExpectationHistogram.html")
fig

In [401]:
# fig = violin(
#     dtf,
#     y="LifeDuration",
#     x="Status",
#     color="Gender",
#     box=True,
#     points="all",
#     hover_data=dtf.columns,
# )

fig = Figure()

fig.add_trace(
    Violin(
        x=dtf["Status"][dtf["Gender"] == "Male"],
        y=dtf["LifeDuration"][dtf["Gender"] == "Male"],
        # legendgroup="Gender",
        # scalegroup="Yes",
        name="Male",
        side="negative",
        line_color="blue",
        box_visible=True,
    )
)

fig.add_trace(
    Violin(
        x=dtf["Status"][dtf["Gender"] == "Female"],
        y=dtf["LifeDuration"][dtf["Gender"] == "Female"],
        # legendgroup="Gender",
        # scalegroup="Yes",
        name="Female",
        side="positive",
        line_color="pink",
    )
)

fig.update_yaxes(range=[0, 100], title="Age (years)")
fig.update_traces(meanline_visible=True, box_visible=True, points="all")
fig.update_layout(
    violingap=0, violinmode="overlay", title_text="Life duration expectation"
)
fig.write_html("FamilyLiveExpectationViolin.html")
fig


In [402]:
dtf2 = (
    (
        dtf[["BirthCity"]]
        .groupby(["BirthCity"])
        .size()
        .to_frame("Count")
        .sort_values("Count", ascending=False)
    )
    .reset_index(drop=False)
    .replace("", "Unknown")
)
# print(dtf2)
# dtf2
# fig = bar(dtf2, y="Count", x="BirthPlace", title="My family tree origins")
fig = bar(dtf2, y="Count", x="BirthCity", title="My family tree origins")
fig.write_html("OriginHistogram.html")
fig


In [407]:
dtf3 = (
    (
        dtf[["BirthCity", "BirthRegion", "BirthCountry"]]
        # .fillna("Unknown")
        .groupby(["BirthCity", "BirthRegion", "BirthCountry"])
        .size()
        .to_frame("Count")
        .sort_values("Count", ascending=False)
    )
    .reset_index(drop=False)
    .replace("", "Unknown")
)
# print(dtf2)
# dtf2
# fig = bar(dtf2, y="Count", x="BirthPlace", title="My family tree origins")
fig = sunburst(
    dtf3,
    path=["BirthCountry", "BirthRegion", "BirthCity"],
    values="Count",
    title="My family tree origins",
)
fig.update_traces(textinfo="label+percent entry")
fig.write_html("OriginSunburst.html")
fig


In [404]:
dtf3 = (
    (
        dtf[["FamilyName", "BirthCity"]]
        .groupby(["FamilyName", "BirthCity"])
        .size()
        .to_frame("Count")
        .sort_values("Count", ascending=False)
    )
    .reset_index(drop=False)
    .replace("", "Unknown")
)

# Slavic name regroup:
dtf3.FamilyName = dtf3.FamilyName.str.replace("V$", "VI", regex=True)
dtf3.FamilyName = dtf3.FamilyName.str.replace("VA$", "VI", regex=True)
dtf3.FamilyName = dtf3.FamilyName.str.replace("SKA$", "SKI", regex=True)

fig = sunburst(
    dtf3,
    path=["FamilyName", "BirthCity"],
    values="Count",
    title="My family tree origins",
)
fig.update_traces(textinfo="label+percent entry")
fig.write_html("FamilyNameSunburst.html")
fig

In [405]:
dtf3 = (
    (
        dtf[["BirthMonth", "BirthDay"]]
        .groupby(["BirthMonth", "BirthDay"])
        .size()
        .to_frame("Count")
        .sort_values("Count", ascending=False)
    )
    .reset_index(drop=False)
    .replace("", "Unknown")
)

fig = sunburst(
    dtf3,
    path=["BirthMonth", "BirthDay"],
    values="Count",
    title="Family bitrth month",
)
fig.update_traces(textinfo="label+percent entry")
fig.write_html("BirthMonthSunburst.html")
fig

In [406]:
dtf3 = (
    (
        dtf[["BirthDay", "BirthMonth"]]
        .groupby(["BirthDay", "BirthMonth"])
        .size()
        .to_frame("Count")
        .sort_values("Count", ascending=False)
    )
    .reset_index(drop=False)
    .replace("", "Unknown")
)

fig = sunburst(
    dtf3,
    path=["BirthDay", "BirthMonth"],
    values="Count",
    title="Family birth day",
)
fig.update_traces(textinfo="label+percent entry")
fig.write_html("BirthDaySunburst.html")
fig