# Analyse a GEDCOM file
## 1. Install

In [None]:
# Uncomment the next line to install or upgrade the fastgedcom package.
# %pip install -U fastgedcom
# List the installed packages and keep only the fastgedcom entry, which shows
# whether the package is installed and in which version. The pipe to grep
# needs a shell that provides grep.
%pip list | grep fastgedcom

## 2. Import Python functions

In [None]:
from fastgedcom.family_aid import FamilyAid
from fastgedcom.base import FakeLine, is_true
from fastgedcom.parser import guess_encoding, parse
from fastgedcom.helpers import (
    extract_int_year,
    extract_year,
    format_name
)
from plotly.express import histogram, violin
from pandas import DataFrame, to_datetime
import plotly.io as pio
from numpy import timedelta64
from datetime import datetime


## 3. Input Gedcom file

In [None]:
import os

# Path of the GEDCOM export to analyse. Set the GEDCOM_FILE environment
# variable to point the notebook at another file without editing this cell;
# when it is unset or empty, the original hard-coded path is used.
GEDCOM_FILE = (
    os.environ.get("GEDCOM_FILE")
    or "/Users/vv/Downloads/VESSELINOV Family Tree-8.ged"
)

## 4. Load the raw GEDCOM file as a DataFrame

In [None]:
# Make "plotly_dark" the default Plotly template.
pio.templates.default = "plotly_dark"

# Parse the GEDCOM file, opened with the encoding reported by guess_encoding.
with open(GEDCOM_FILE, "r", encoding=guess_encoding(GEDCOM_FILE)) as f:
    document, warnings = parse(f)

if warnings:
    # One warning per line, followed by a separator line of its own.
    print("Warnings: ", *warnings, "---", sep="\n")

# One dictionary per INDI record; each becomes a row of the DataFrame.
rows = []

for individual in document.get_records("INDI"):
    # Resolve the record once and reuse it for every field below instead of
    # repeating the document lookup for each one.
    record = document[individual.tag]
    name_line = record > "NAME"
    birth_line = record > "BIRT"
    death_line = record > "DEAT"

    rows.append(
        {
            "FirstName": name_line >= "GIVN",
            "FamilyName": (name_line >= "SURN").upper(),
            "Birthday": birth_line >= "DATE",
            "Birthplace": birth_line >= "PLAC",
            "Deathday": death_line >= "DATE",
            # NOTE(review): is_true presumably reports whether a DEAT line
            # exists for the record - confirm against the fastgedcom docs.
            "Dead": is_true(death_line),
        }
    )

dtf = DataFrame.from_records(rows)
dtf


## 5. Add derived columns to the raw data

In [None]:
# Parse the birth date strings. Full dates ("%d %b %Y") are tried first;
# values that do not match become NaT and are filled from a year-only parse.
dtf["BirthDT"] = to_datetime(dtf["Birthday"], errors="coerce", format=r"%d %b %Y")
dtf["BirthDT2"] = to_datetime(dtf["Birthday"], errors="coerce", format=r"%Y")
dtf["BirthDT"] = dtf["BirthDT"].fillna(dtf["BirthDT2"])

# Same two-step parse for the death date strings.
dtf["DeathDT"] = to_datetime(dtf["Deathday"], errors="coerce", format=r"%d %b %Y")
dtf["DeathDT2"] = to_datetime(dtf["Deathday"], errors="coerce", format=r"%Y")
dtf["DeathDT"] = dtf["DeathDT"].fillna(dtf["DeathDT2"])

# Rows not flagged as dead get the current date and time as their end date,
# so that LiveDuration below is their age so far.
not_dead = ~dtf["Dead"].astype(bool)
dtf.loc[not_dead, "DeathDT"] = dtf["DeathDT"].fillna(datetime.today())

# Life span in years. Dividing directly by timedelta64(1, "Y") is rejected by
# recent pandas releases (year is not a fixed-length unit), so convert to days
# and divide by 365.2425, the year length numpy uses for its "Y" unit.
days_per_year = 365.2425
dtf["LiveDuration"] = (
    (dtf["DeathDT"] - dtf["BirthDT"]) / timedelta64(1, "D") / days_per_year
)

dtf["Name"] = dtf["FirstName"] + " " + dtf["FamilyName"]
dtf

## 6. Plot life expectancy

In [None]:
# Histogram of LiveDuration per Name, fed with the rows sorted by LiveDuration
# in descending order and coloured by the Dead flag. The figure is also saved
# as an HTML file.
by_duration = dtf.sort_values(by="LiveDuration", ascending=False)
fig = histogram(by_duration, x="Name", y="LiveDuration", color="Dead")
fig.write_html("FamilyLiveExpectationHistogram.html")
fig


In [None]:
# Violin plot of LiveDuration, one violin per value of the Dead flag. The
# figure is also saved as an HTML file.
fig = violin(data_frame=dtf, y="LiveDuration", color="Dead")
fig.write_html("FamilyLiveExpectationViolin.html")
fig


In [None]:
# Count the rows per Birthplace, order the counts from largest to smallest and
# show the empty birthplace as "Unknown".
birthplace_sizes = dtf[["Birthplace"]].groupby(["Birthplace"]).size()
dtf2 = birthplace_sizes.to_frame("Count").sort_values("Count", ascending=False)
dtf2 = dtf2.reset_index(drop=False).replace("", "Unknown")

# Plot the counts per birthplace and save the figure as an HTML file.
fig = histogram(dtf2, x="Birthplace", y="Count", title="My family tree origins")
fig.write_html("OriginHistogram.html")
fig
