# Analyse a GEDCOM file
## 1. Install

In [None]:
# Uncomment the lines below to install or upgrade the packages this notebook uses.
# %pip install -U fastgedcom
# %pip install -U pandas
# %pip install -U plotly
# %pip install -U numpy
# %pip install -U treelib
# List the installed versions of those packages.
%pip list | grep "fastgedcom\|pandas\|plotly\|numpy\|treelib"

## 2. Import Python functions

In [None]:
from datetime import datetime
from os.path import exists

from fastgedcom.family_aid import FamilyAid
from fastgedcom.base import FakeLine, TrueLine, is_true
from fastgedcom.parser import guess_encoding, parse
from plotly.express import bar, sunburst, violin
from plotly.graph_objects import Figure, Violin
from pandas import DataFrame, to_datetime
import plotly.io as pio
from numpy import timedelta64, nan

## 3. Input Gedcom file

In [None]:
# Path of the GEDCOM file opened and parsed by the loading cell below;
# edit this to point at your own export.
GEDCOM_FILE = "/Users/vv/Downloads/VESSELINOV Family Tree-11.ged"


## 4. Load the raw GEDCOM file as a DataFrame

In [None]:
# Use Plotly's dark template for every figure created afterwards.
pio.templates.default = "plotly_dark"

# Parse the GEDCOM file with the encoding fastgedcom detects for it.
with open(GEDCOM_FILE, "r", encoding=guess_encoding(GEDCOM_FILE)) as f:
    document, warnings = parse(f)

if warnings:
    print("Warnings: ", *warnings, sep="\n", end="---\n")

# One dictionary per individual (INDI record); becomes one DataFrame row.
rows = []

for individual in document.get_records("INDI"):
    # Fetch the record once instead of once per extracted field.
    # fastgedcom shortcuts used below: `>` selects a sub-line, `>=` its payload.
    record = document[individual.tag]
    name_line = record > "NAME"
    birth_line = record > "BIRT"
    death_line = record > "DEAT"

    # The last space-separated token of the line's string form is the family
    # pointer. A missing line stringifies to the "<FakeLine>" placeholder,
    # which is converted to NaN after the DataFrame is built.
    family_c = str(record > "FAMC").split(" ")[-1]
    family_p = str(record > "FAMS").split(" ")[-1]

    # Read the SEX payload directly. Taking the last character of the string
    # form produced ">" (the tail of "<FakeLine>") for individuals without a
    # SEX line, which slipped past the NaN replacement below.
    sex = (record >= "SEX") or nan

    rows.append(
        {
            "PersonID": individual.tag.replace("@", ""),
            "FirstName": name_line >= "GIVN",
            "FamilyName": (name_line >= "SURN").upper(),
            "BirthDate": birth_line >= "DATE",
            "BirthPlace": birth_line >= "PLAC",
            "DeathDate": death_line >= "DATE",
            "DeathPlace": death_line >= "PLAC",
            "Dead": is_true(death_line),
            "Gender": sex,
            "FamilyID": family_c.replace("@", ""),
            "ParentFamilyID": family_p.replace("@", ""),
        }
    )

# Missing-line placeholders become NaN; the person identifier is the index.
dtf = (
    DataFrame.from_records(rows)
    .replace("<FakeLine>", nan)
    .set_index("PersonID")
)
dtf

## 5. Add derived columns to the raw data model

In [None]:
# Dates are parsed in two passes: first as a full "day month-abbreviation year"
# date, then as a bare year for the values the first pass could not parse.
# Anything matching neither format ends up as NaT (errors="coerce").
dtf["BirthDT"] = to_datetime(dtf.BirthDate, errors="coerce", format=r"%d %b %Y")
# Day and month are derived before the year-only fallback, so they are only
# set for fully specified birth dates.
dtf["BirthDay"] = dtf.BirthDT.dt.day.astype("Int64")
print(dtf["BirthDay"])
dtf["BirthMonth"] = dtf.BirthDT.dt.month_name()
dtf["BirthDT"] = dtf["BirthDT"].fillna(
    to_datetime(dtf.BirthDate, errors="coerce", format=r"%Y")
)

dtf["DeathDT"] = to_datetime(dtf.DeathDate, errors="coerce", format=r"%d %b %Y")
dtf["DeathDT"] = dtf["DeathDT"].fillna(
    to_datetime(dtf.DeathDate, errors="coerce", format=r"%Y")
)

is_dead = dtf["Dead"] == True  # noqa: E712 - element-wise comparison
is_living = dtf["Dead"] == False  # noqa: E712 - element-wise comparison

# People without a death record are measured up to today, so their
# LifeDuration is their current age.
dtf.loc[is_living, "DeathDT"] = dtf["DeathDT"].fillna(datetime.today())

# Life span in years. Dividing by timedelta64(1, "Y") is rejected by recent
# pandas releases because a year is not a fixed-length unit, so the division
# is done on seconds using the same year length (365.2425 days).
seconds_per_year = 365.2425 * 24 * 60 * 60
dtf["LifeDuration"] = (
    dtf["DeathDT"] - dtf["BirthDT"]
).dt.total_seconds() / seconds_per_year

# Display name: first name without any parenthesised part, plus family name.
dtf["Name"] = dtf.FirstName.replace(r"(\(.*\))", "", regex=True) + " " + dtf.FamilyName
dtf.loc[is_dead, "Status"] = "Deceased"
dtf.loc[is_living, "Status"] = "Living"
dtf.loc[dtf["Dead"].isnull(), "Status"] = "Unknown"

# Assign the result back: an in-place replace on the attribute-accessed column
# is a chained operation that does not reliably update the DataFrame.
dtf["Gender"] = dtf["Gender"].replace({"M": "Male", "F": "Female", "U": "Unknown"})
dtf = dtf.drop(columns=["Dead"])

# Split "City, Region, Country" places into three columns. reindex() pins the
# result to exactly three columns: missing parts become NaN and parts beyond
# the third are dropped, so the column names the later cells rely on always
# exist and the second join cannot collide on leftover integer-named columns.
for i in ["Birth", "Death"]:
    place_parts = (
        dtf[f"{i}Place"].str.split(", ", expand=True).reindex(columns=range(3))
    )
    place_parts.columns = [f"{i}City", f"{i}Region", f"{i}Country"]
    dtf = dtf.join(place_parts)

dtf


In [None]:
# from treelib import Node, Tree

# tree = Tree()

# def per_row(row):
#     tree.create_node(row["Name"], row.name, parent=row["ParentFamilyID"])

# dtf.apply(per_row, axis=1)

# # tree.create_node("Jane", "jane", parent="harry")
# tree.show()


## 6. Plot life expectancy

In [None]:
# Bar chart of life duration per person, coloured by status; only people
# with a computed LifeDuration are included.
with_duration = dtf.dropna(subset=["LifeDuration"])
fig = bar(with_duration, x="Name", y="LifeDuration", color="Status")

# Order the bars along the x axis using Plotly's "total descending" rule.
fig.update_layout(xaxis=dict(categoryorder="total descending"))

# The HTML export is skipped when the ../private directory does not exist.
if exists("../private"):
    fig.write_html("../private/FamilyLiveExpectationHistogram.html")

fig


In [None]:
# Split violin plot of life duration by status: males on the negative side,
# females on the positive side of each violin.
fig = Figure()

for gender, side, colour in (
    ("Male", "negative", "blue"),
    ("Female", "positive", "pink"),
):
    members = dtf[dtf["Gender"] == gender]
    fig.add_trace(
        Violin(
            x=members["Status"],
            y=members["LifeDuration"],
            name=gender,
            side=side,
            line_color=colour,
        )
    )

# Applied to every trace: mean line, inner box plot and all individual points.
fig.update_traces(meanline_visible=True, box_visible=True, points="all")
fig.update_yaxes(range=[0, 100], title="Age (years)")
fig.update_layout(
    violingap=0,
    violinmode="overlay",
    title_text="Life duration expectation",
)

# The HTML export is skipped when the ../private directory does not exist.
if exists("../private"):
    fig.write_html("../private/FamilyLiveExpectationViolin.html")

fig

In [None]:
# Number of individuals per birth city, most frequent first; an empty city
# name is shown as "Unknown".
city_counts = dtf.groupby("BirthCity").size().to_frame("Count")
dtf2 = (
    city_counts.sort_values("Count", ascending=False)
    .reset_index()
    .replace("", "Unknown")
)

fig = bar(dtf2, x="BirthCity", y="Count", title="My family tree origins")

# The HTML export is skipped when the ../private directory does not exist.
if exists("../private"):
    fig.write_html("../private/OriginHistogram.html")

fig

In [None]:
# Number of individuals per (city, region, country) combination, most
# frequent first; empty labels are shown as "Unknown".
place_columns = ["BirthCity", "BirthRegion", "BirthCountry"]
place_counts = dtf.groupby(place_columns).size().to_frame("Count")
dtf3 = (
    place_counts.sort_values("Count", ascending=False)
    .reset_index()
    .replace("", "Unknown")
)

# Sunburst hierarchy runs from country (centre) through region to city.
fig = sunburst(
    dtf3,
    path=["BirthCountry", "BirthRegion", "BirthCity"],
    values="Count",
    title="My family tree origins",
)
fig.update_traces(textinfo="label+percent entry")

# The HTML export is skipped when the ../private directory does not exist.
if exists("../private"):
    fig.write_html("../private/OriginSunburst.html")

fig

In [None]:
# Number of individuals per (family name, birth city), most frequent first;
# empty labels are shown as "Unknown".
name_city_counts = dtf.groupby(["FamilyName", "BirthCity"]).size().to_frame("Count")
dtf3 = (
    name_city_counts.sort_values("Count", ascending=False)
    .reset_index()
    .replace("", "Unknown")
)

# Slavic name regroup: rewrite the listed name endings to a shared form so
# the variants fall under one label. The rules are applied in this order.
for ending, shared_form in (("V$", "VI"), ("VA$", "VI"), ("SKA$", "SKI")):
    dtf3["FamilyName"] = dtf3["FamilyName"].str.replace(
        ending, shared_form, regex=True
    )

fig = sunburst(
    dtf3,
    path=["FamilyName", "BirthCity"],
    values="Count",
    title="My family tree origins",
)
fig.update_traces(textinfo="label+percent entry")

# The HTML export is skipped when the ../private directory does not exist.
if exists("../private"):
    fig.write_html("../private/FamilyNameSunburst.html")

fig


In [None]:
# Number of individuals per (birth month, birth day), most frequent first.
dtf3 = (
    (
        dtf[["BirthMonth", "BirthDay"]]
        .groupby(["BirthMonth", "BirthDay"])
        .size()
        .to_frame("Count")
        .sort_values("Count", ascending=False)
    )
    .reset_index(drop=False)
    .replace("", "Unknown")
)

# Sunburst with months in the inner ring and days of the month in the outer.
# The title previously read "Family bitrth month" (typo).
fig = sunburst(
    dtf3,
    path=["BirthMonth", "BirthDay"],
    values="Count",
    title="Family birth month",
)
fig.update_traces(textinfo="label+percent entry")

# The HTML export is skipped when the ../private directory does not exist.
if exists("../private"):
    fig.write_html("../private/BirthMonthSunburst.html")

fig


In [None]:
# Number of individuals per (day of month, birth month), most frequent first.
day_month_counts = dtf.groupby(["BirthDay", "BirthMonth"]).size().to_frame("Count")
dtf3 = (
    day_month_counts.sort_values("Count", ascending=False)
    .reset_index()
    .replace("", "Unknown")
)

# Sunburst with days of the month in the inner ring and months in the outer.
fig = sunburst(
    dtf3,
    path=["BirthDay", "BirthMonth"],
    values="Count",
    title="Family birth day",
)
fig.update_traces(textinfo="label+percent entry")

# The HTML export is skipped when the ../private directory does not exist.
if exists("../private"):
    fig.write_html("../private/BirthDaySunburst.html")

fig
