In [1]:
%load_ext autoreload
%autoreload 2

# Imports and Constants

In [15]:
import pandas as pd
import random
import time
import datetime
import os
import re
import numpy as np
import uuid

In [3]:
pd.options.display.max_rows = 500

In [4]:
letters = "q w e r t y u i o p a s d f g h j k l z x c v b n m".split()

## Only need to do this once

### Combine baby name files into a single name file

Names taken from us census data baby names - https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-data

In [59]:
file_names = os.listdir("../data/inputs/names/")
df_name_list = []
for fl in file_names:
    df_tmp = pd.read_csv(os.path.join("../data/inputs/names", fl), header=None)
    df_tmp.columns = ["name", "sex", "count"]
    df_tmp["year"] = int(re.search("\d+", fl)[0])
    df_tmp = df_tmp.loc[df_tmp["count"] >= 100]
    df_name_list.append(df_tmp)
df_first_names = pd.concat(df_name_list)
df_first_names["name"] = df_first_names["name"].str.lower()
df_first_names["sex"] = df_first_names["sex"].str.lower()

In [61]:
df_first_names.to_csv("../data/inputs/first_names.csv", index=False)

## Load Names

In [4]:
df_first_names = pd.read_csv("../data/inputs/first_names.csv")
df_first_names.head()

Unnamed: 0,name,sex,count,year
0,mary,f,56214,1940
1,barbara,f,36734,1940
2,patricia,f,32667,1940
3,judith,f,22385,1940
4,betty,f,22076,1940


In [5]:
# Pulled from https://genealogy.org.il/pop/
df_hebrew_names = pd.read_csv("../data/inputs/israeli_first_names.csv")
df_hebrew_names.head()

Unnamed: 0,name,first_name_count,middle_name_count,sex
0,דוד,90557,37681,m
1,יוסף,83090,33602,m
2,משה,79034,31286,m
3,אברהם,71590,22834,m
4,יעקב,66878,25450,m


In [6]:
# Pulled from https://en.wikipedia.org/wiki/Category:Surnames_of_Jewish_origin
df_jewish_surnames = pd.read_csv("../data/inputs/jewish_surnames.csv")
df_jewish_surnames.head()

Unnamed: 0,surname
0,aaron
1,aarons
2,aaronson
3,abraham
4,abramczyk


# Functions

In [7]:
def generate_person(birth_year: int = None, gender: str = None, sex: str = None):
    if gender is None:
        gender = random.sample(
            ["male", "female", "nonbinary", "other"], k=1, counts=[10, 10, 1, 1]
        )[0]
    elif gender not in ["male", "female", "nonbinary", "other"]:
        raise ValueError("gender must be one of male, female, nonbinary, or other")

    if sex is None:
        if gender == "male":
            sex = random.sample(["male", "female"], k=1, counts=[20, 1])[0]
        elif gender == "female":
            sex = random.sample(["female", "male"], k=1, counts=[20, 1])[0]
        else:
            sex = random.sample(["female", "male"], k=1, counts=[1, 1])[0]
    elif sex not in ["male", "female", "intersex"]:
        raise ValueError("sex must be one of male, female, or intersex")

    if birth_year is None:
        birth_year = random.sample(
            population=range(2025, 1880, -1),
            k=1,
            counts=[x for x in range(1, 2025 - 1880 + 1)],
        )[0]
    birth_datetime = (
        datetime.datetime.strptime(str(birth_year), "%Y")
        + datetime.timedelta(days=random.randint(0, 365))
    ).date()
    # formula based off of life expectancies from CDC (https://healthdata.gov/dataset/NCHS-Death-rates-and-life-expectancy-at-birth/4r8i-dqgb/about_data)
    life_expectancy = int(521.64 * np.log(birth_year) - 3887)
    lifespan = (
        life_expectancy + int(random.normalvariate(0, 15))
    ) * 365 + random.randint(1, 365)
    death_datetime = birth_datetime + datetime.timedelta(days=lifespan)

    if gender == "male":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "m")
        ]
        df_hebrew_names_small = df_hebrew_names.loc[df_hebrew_names["sex"] == "m"]
    elif gender == "female":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "f")
        ]
        df_hebrew_names_small = df_hebrew_names.loc[df_hebrew_names["sex"] == "f"]
    elif gender == "nonbinary":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
        ]
        df_hebrew_names_small = df_hebrew_names
    else:
        df_first_names_small = df_first_names
        df_hebrew_names_small = df_hebrew_names
    names = random.sample(
        df_first_names_small["name"].tolist(),
        k=5,
        counts=df_first_names_small["count"].tolist(),
    )
    first_name = names[0]
    middle_names_count = random.sample(range(3), 1)[0]
    middle_name = " ".join(names[1 : 1 + middle_names_count])
    hebrew_name_list = random.sample(
        df_hebrew_names_small["name"].tolist(),
        k=3,
        counts=df_hebrew_names_small["first_name_count"].tolist(),
    )
    non_english_name = " ".join(hebrew_name_list[0 : 1 + middle_names_count])
    last_name = df_jewish_surnames["surname"].sample().iloc[0]

    person_dict = {
        "first_name": first_name,
        "middle_name": middle_name,
        "last_name": last_name,
        "maiden_name": last_name,
        "preferred_name": first_name,
        "nickname": first_name,
        "non_english_name": non_english_name,
        "date_of_birth": birth_datetime,
        #         "date_of_adoption": adoption_date,
        "date_of_death": death_datetime,
        "gender": gender,
        "sex": sex,
    }
    return person_dict

In [8]:
def generate_pet(species: str = None, sex: str = None, birth_year: int = None):
    if species is None:
        species = random.sample(
            ["dog", "cat", "fish", "lizard", "bird"], k=1, counts=[15, 8, 3, 1, 1]
        )[0]
    elif species not in ["dog", "cat", "fish", "lizard", "bird"]:
        raise ValueError("species must be one of dog, cat, fish, lizard, or bird")

    if sex is None:
        sex = random.sample(["male", "female", "unknown"], k=1, counts=[10, 10, 1])[0]
    elif gender not in ["male", "female", "unknown"]:
        raise ValueError("gender must be one of male, female, or unknown")

    if birth_year is None:
        birth_year = random.sample(
            population=range(2025, 1880, -1),
            k=1,
            counts=[x for x in range(1, 2025 - 1880 + 1)],
        )[0]
    birth_datetime = (
        datetime.datetime.strptime(str(birth_year), "%Y")
        + datetime.timedelta(days=random.randint(0, 365))
    ).date()

    was_adopted = random.sample(population=[True, False], k=1, counts=[10, 1])[0]
    if was_adopted:
        adopted_datetime = birth_datetime + datetime.timedelta(
            days=random.randint(0, 365 * 4)
        )
        adoption_date = adopted_datetime.isoformat()
    else:
        adoption_date = None

    life_expectancy_dict = {"dog": 12, "cat": 16, "fish": 5, "lizard": 15, "bird": 20}
    life_expectancy = life_expectancy_dict[species]
    lifespan = (
        life_expectancy + int(random.normalvariate(0, life_expectancy / 5))
    ) * 365 + random.randint(1, 365)
    death_datetime = birth_datetime + datetime.timedelta(days=lifespan)
    if death_datetime <= datetime.date.today():
        death_date = death_datetime.isoformat()
    else:
        death_date = None

    if sex == "male":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "m")
        ]
    elif sex == "female":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "f")
        ]
    elif sex == "unknown":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
        ]
    else:
        df_first_names_small = df_first_names
    names = random.sample(
        df_first_names_small["name"].tolist(),
        k=5,
        counts=df_first_names_small["count"].tolist(),
    )
    first_name = names[0]
    middle_names_count = random.sample(population=range(3), k=1, counts=[5, 2, 1])[0]
    middle_name = " ".join(names[1 : 1 + middle_names_count])

    pet_dict = {
        "species": species,
        "first_name": first_name,
        "middle_name": middle_name,
        "date_of_birth": birth_datetime.isoformat(),
        "date_of_adoption": adoption_date,
        "date_of_death": death_date,
        "sex": sex,
    }
    return pet_dict

In [9]:
def generate_relationship(
    relationship_type: str = None, date_start: str = None, date_end: str = None
):
    if relationship_type is None:
        relationship_type = random.sample(
            ["parent_child", "partnership", "owner_pet"],
            k=1,
            counts=[20, 20, 0],
        )[0]
    elif relationship_type not in ["parent_child", "partnership", "owner_pet"]:
        raise ValueError(
            "relationship_type must be one of parent_child, partnership, or owner_pet"
        )
    relationship_subtype = None
    start_datetime = None
    end_datetime = None
    if relationship_type == "partnership":
        relationship_subtype = random.sample(
            ["marriage", "partnered"], k=1, counts=[10, 1]
        )[0]
        if date_start is None:
            start_year = random.sample(
                population=range(2025, 1900, -1),
                k=1,
                counts=[x for x in range(1, 2025 - 1900 + 1)],
            )[0]
        start_datetime = (
            datetime.datetime.strptime(str(start_year), "%Y")
            + datetime.timedelta(days=random.randint(0, 365))
        ).date()

        duration = random.sample(population=range(5 * 365, 70 * 365), k=1)[0]

        end_datetime = start_datetime + datetime.timedelta(days=duration)

    relationship_dict = {
        "type": relationship_type,
        "subtype":relationship_subtype,
        "start_date": start_datetime,
        "end_date": end_datetime,
    }
    return relationship_dict

In [10]:
def find_partnerships(
    df_people: pd.DataFrame, df_relationships: pd.DataFrame, relationships_people: list
):
    for idx, val in df_relationships.loc[
        df_relationships["type"] == "partnership"
    ].iterrows():
        # Let's put bounds that they get married sometime between the age of 19 and 70
        starter_birth_date_upper = val.start_date - datetime.timedelta(days=19 * 365)
        starter_birth_date_lower = val.start_date - datetime.timedelta(days=70 * 365)
        eligible_starter = df_people.loc[
            (df_people["date_of_birth"] <= starter_birth_date_upper)
            & (df_people["date_of_birth"] >= starter_birth_date_lower)
        ]
        eligible_starter_ids = eligible_starter["id"].tolist()
        # remove people who are already in relationships
        ineligible_ids = []
        if len(relationships_people) > 0:
            df_ineligible = pd.DataFrame(relationships_people)
            df_ineligible = df_ineligible.merge(
                df_relationships, how="left", left_on="relationship_id", right_on="id"
            )
            ineligible_ids = df_ineligible.loc[
                (
                    (df_ineligible["start_date"] <= val.start_date)
                    & (df_ineligible["end_date"] >= val.start_date)
                )
                | (
                    (df_ineligible["start_date"] >= val.start_date)
                    & (df_ineligible["start_date"] <= val.end_date)
                ),
                "people_id",
            ].tolist()
#             if len(ineligible_ids) > 0:
#                 print(f"ineligible ids for relationship {val.id} are {ineligible_ids}")
        eligible_starter_ids = [
            x for x in eligible_starter_ids if x not in ineligible_ids
        ]
        random.shuffle(eligible_starter_ids)
        # Keep trying to find a set of working partners

        for partner_1_id in eligible_starter_ids:
            partner_1 = df_people.loc[df_people["id"] == partner_1_id].to_dict(
                orient="records"
            )[0]
            # Let's put bounds on the partner being born within 10 years
            birth_range_lower = partner_1["date_of_birth"] - datetime.timedelta(
                days=10 * 365
            )
            birth_range_upper = partner_1["date_of_birth"] + datetime.timedelta(
                days=10 * 365
            )
            eligible_partners = df_people.loc[
                (df_people["date_of_birth"] <= birth_range_upper)
                & (df_people["date_of_birth"] >= birth_range_lower)
                #                 & (df_people["date_of_death"] > val.end_date)
                & (df_people["id"] != partner_1_id)
                & (~df_people["id"].isin(ineligible_ids))
            ]

            if eligible_partners.shape[0] > 0:
                partner_2_id = random.sample(eligible_partners["id"].tolist(), k=1)[0]
                partner_2 = df_people.loc[df_people["id"] == partner_2_id].to_dict(
                    orient="records"
                )[0]
                partner_ids = [partner_1_id, partner_2_id]

                for partner_id in partner_ids:
                    relationships_people += [
                        {
                            "relationship_id": val.id,
                            "people_id": partner_id,
                            "title": "partner",
                        }
                    ]
                break
    return relationships_people

In [11]:
def find_children(
    df_people: pd.DataFrame, df_relationships: pd.DataFrame, relationships_people: list
):
    df_relationships_people = pd.DataFrame(relationships_people)
    # Associate each parent_child relationship to a partnership
    partnership_ids = df_relationships.loc[
        df_relationships["type"] == "partnership", "id"
    ].tolist()
    parent_child_ids = df_relationships.loc[
        df_relationships["type"] == "parent_child", "id"
    ].tolist()
    partner_parent_zip = list(zip(parent_child_ids, partnership_ids))

    for ppz in partner_parent_zip:
        parent_child_id = ppz[0]
        partnership_id = ppz[1]

        # Associate parents to parent_child relationship
        parent_ids = df_relationships_people.loc[
            df_relationships_people["relationship_id"] == partnership_id, "people_id"
        ].tolist()
        for parent in parent_ids:
            relationships_people += [
                {
                    "relationship_id": parent_child_id,
                    "people_id": parent,
                    "title": "parent",
                }
            ]
        partnership_start = df_relationships.loc[
            df_relationships["id"] == partnership_id, "start_date"
        ].iloc[0]
        partnership_end = df_relationships.loc[
            df_relationships["id"] == partnership_id, "end_date"
        ].iloc[0]
        possible_children_ids = df_people.loc[
            (~df_people["id"].isin(parent_ids))
            & (df_people["date_of_birth"] > partnership_start)
            & (df_people["date_of_birth"] < partnership_end),
            "id",
        ].tolist()
        # Prevent sibling relationships
        for partnership in partnership_ids:
            id_list = df_relationships_people.loc[
                df_relationships_people["relationship_id"] == partnership, "people_id"
            ].tolist()
            if len(possible_children_ids) > 0 and id_list[0] in possible_children_ids:
                possible_children_ids = [
                    x
                    for x in possible_children_ids
                    if x not in random.sample(id_list, k=len(id_list) - 1)
                ]

        children_count = min(random.randint(1, 8), len(possible_children_ids))
        children_ids = random.sample(population=possible_children_ids, k=children_count)
        # Use the first male/nonbinary gendered last name in the relationship as all of the children's last name
        last_name = (
            df_people.loc[df_people["id"].isin(parent_ids)]
            .sort_values(["gender"], ascending=False)["last_name"]
            .iloc[0]
        )
        for child_id in children_ids:
            relationships_people += [
                {
                    "relationship_id": parent_child_id,
                    "people_id": child_id,
                    "title": "child",
                }
            ]
            df_people.loc[df_people["id"] == child_id, "last_name"] = last_name
            df_people.loc[df_people["id"] == child_id, "maiden_name"] = last_name
    return relationships_people, df_people

In [12]:
def generate_tree(
    people: pd.DataFrame = None,
    relationships: pd.DataFrame = None,
    people_count: int = 0,
    generations: int = 0,
):
    if people is None:
        if people_count == 0:
            people_count = 100
        df_people = pd.DataFrame([generate_person() for _ in range(people_count)])
        df_people = df_people.reset_index(names="id")
    if relationships is None:
        relationship_count = int(people_count / 10)
        df_relationships = pd.DataFrame(
            [generate_relationship() for _ in range(relationship_count)]
        )
        df_relationships = df_relationships.reset_index(names="id")

    # Create partnerships
    relationships_people = []
    relationships_people = find_partnerships(
        df_people=df_people,
        df_relationships=df_relationships,
        relationships_people=relationships_people,
    )

    df_relationships_people = pd.DataFrame(relationships_people)

    # Drop partnerships with zero matches
    parternship_ids = df_relationships.loc[
        df_relationships["type"] == "partnership", "id"
    ].tolist()
    match_ids = df_relationships_people["relationship_id"].unique().tolist()
    zero_matches = list(set(parternship_ids).difference(set(match_ids)))
    df_relationships = df_relationships.loc[~df_relationships["id"].isin(zero_matches)]

    # Create children
    if len(relationships_people) > 0:
        relationships_people, df_people = find_children(
            df_people=df_people,
            df_relationships=df_relationships,
            relationships_people=relationships_people,
        )
    df_relationships_people = pd.DataFrame(relationships_people)

    parent_children_ids = df_relationships_people.loc[
        df_relationships_people["title"].isin(["parent", "child"]), "people_id"
    ].tolist()

    # Change last name to match that of partner
    # sort by start date so that you change the parents name before you change the childs name
    partnership_ids = df_relationships.loc[
        df_relationships["type"] == "partnership", "id"
    ].tolist()
#     df_relationships.loc[
#         df_relationships["type"] == "partnership"
#     ].sort_values("start_date")["id"].tolist()
    for partnership in partnership_ids:
        partner_ids = df_relationships_people.loc[
            df_relationships_people["relationship_id"] == partnership, "people_id"
        ].tolist()
        last_name = (
            df_people.loc[df_people["id"].isin(partner_ids)]
            .sort_values(["gender"], ascending=False)["last_name"]
            .iloc[0]
        )
        for partner_id in partner_ids:
            df_people.loc[df_people["id"] == partner_id, "last_name"] = last_name
        # check if connected to family tree
        ineligible_list = [x for x in partner_ids if x not in parent_children_ids]
        if len(ineligible_list) == len(partner_ids):
            df_people =  df_people.loc[~df_people["id"].isin(partner_ids)]
            df_relationships_people =  df_relationships_people.loc[~df_relationships_people["people_id"].isin(partner_ids)]
            df_relationships =  df_relationships.loc[~df_relationships["id"].isin([partnership])]

    # Drop everyone not part of the family tree

#     partner_ids = df_relationships_people.loc[
#         df_relationships_people["title"].isin(["partner"]), "people_id"
#     ].tolist()
#     # check if partner is parent or child
#     eligibile_partners = [x for x in partner_ids if x in parent_children_ids]
#     # for partners which are not parents or children check if partners or parents or children (due to second partnership)
#     possible_ineligible = [x for x in partner_ids if x not in parent_children_ids]
#     if len(possible_ineligible) > 0:
#         print("possible_ineligible")
#         print(possible_ineligible)

    #     partner_of_eligible = [x for x in possible_ineligible if x in ]

    family_ids = df_relationships_people["people_id"].tolist()
    relationship_ids = df_relationships_people["relationship_id"].tolist()
    df_people = df_people.loc[df_people["id"].isin(family_ids)]
    df_relationships = df_relationships.loc[
        df_relationships["id"].isin(relationship_ids)
    ]

    return df_people, df_relationships, df_relationships_people

In [169]:
df_relationships.loc[
        df_relationships["type"] == "partnership"
    ].sort_values("start_date")

Unnamed: 0,id,type,subtype,start_date,end_date
8,8,partnership,marriage,1917-04-02,1937-02-18
1,1,partnership,marriage,1929-01-22,1972-07-29
6,6,partnership,marriage,1934-06-28,1983-11-06
3,3,partnership,partnered,1941-05-19,1958-10-24
4,4,partnership,marriage,1968-09-09,2009-07-23


In [161]:
def convert_format(df_people, df_relationships, df_relationships_people):
    # Convert the data into the json format used by the family chart visualization

    # give all people UUIDs
#     df_people["uuid"] = df_people["id"].apply(lambda x: uuid.uuid4())

    parent_ids = df_relationships_people.loc[
        df_relationships_people["title"] == "parent", "people_id"
    ].tolist()
    children_ids = df_relationships_people.loc[
        df_relationships_people["title"] == "child", "people_id"
    ].tolist()
    partner_ids = df_relationships_people.loc[
        df_relationships_people["title"] == "partner", "people_id"
    ].tolist()
    data_output = []
    for _, people in df_people.iterrows():
        print(people["id"])
        if people["gender"] not in ["male", "female"]:
            gender = random.sample(["M", "F"], k=1)[0]
        else:
            gender = people["gender"][0].upper()
        df_tmp = {
            "id": str(people["id"]),
            #             "id": str(people["uuid"]),
            "rels": {},
            #             "rels": {
            #                 "spouses": partners_uuids,
            #                 "children": children_uuids,
            #                 "mother": mother_uuid,
            #                 "father": father_uuid,
            #             },
            "data": {
                "gender": gender,
                "first name": people["first_name"],
                "last name": people["last_name"],
                "birthday": people["date_of_birth"].isoformat(),
                "avatar": "",
            },
        }

        if people["id"] in parent_ids:
            relationship_id = df_relationships_people.loc[
                (df_relationships_people["people_id"] == people["id"])
                & (df_relationships_people["title"] == "parent"),
                "relationship_id",
            ].tolist()[0]
            children = df_relationships_people.loc[
                (df_relationships_people["relationship_id"] == relationship_id)
                & (df_relationships_people["title"] == "child"),
                "people_id",
            ].tolist()
            #             children_uuids = df_people.loc[
            #                 df_people["id"].isin(children), "uuid"
            #             ].tolist()
            #             children_uuids = [str(x) for x in children_uuids]
            df_tmp["rels"]["children"] = [str(x) for x in children]

        if people["id"] in children_ids:
            relationship_id = df_relationships_people.loc[
                (df_relationships_people["people_id"] == people["id"])
                & (df_relationships_people["title"] == "child"),
                "relationship_id",
            ].tolist()[0]
            parents = df_relationships_people.loc[
                (df_relationships_people["relationship_id"] == relationship_id)
                & (df_relationships_people["title"] == "parent"),
                "people_id",
            ].tolist()
            # Current versin of visualization requires one parent to be the father and one to be the mother, so using the one who changed their last name to be the mother
            #             mother_uuid = df_people.loc[
            #                 (df_people["id"].isin(parents))
            #                 & (df_people["last_name"] != df_people["maiden_name"]),
            #                 "uuid",
            #             ].tolist()[0]
            #             mother_uuid = str(mother_uuid)
            #             father_uuid = df_people.loc[
            #                 (df_people["id"].isin(parent_ids))
            #                 & (df_people["last_name"] == df_people["maiden_name"]),
            #                 "uuid",
            #             ].tolist()[0]
            #             father_uuid = str(father_uuid)
            mother_id = df_people.loc[
                (df_people["id"].isin(parents))
                & (df_people["last_name"] != df_people["maiden_name"]),
                "id",
            ].tolist()[0]
            father_id = df_people.loc[
                (df_people["id"].isin(parents))
                & (df_people["last_name"] == df_people["maiden_name"]),
                "id",
            ].tolist()[0]
            df_tmp["rels"]["mother"] = str(mother_id)
            df_tmp["rels"]["father"] = str(father_id)

        if people["id"] in partner_ids:
            relationship_id = df_relationships_people.loc[
                (df_relationships_people["people_id"] == people["id"])
                & (df_relationships_people["title"] == "partner"),
                "relationship_id",
            ].tolist()[0]
            partners = df_relationships_people.loc[
                (df_relationships_people["relationship_id"] == relationship_id)
                & (df_relationships_people["title"] == "partner")
                & (df_relationships_people["people_id"] != people["id"]),
                "people_id",
            ].tolist()
#             partners_uuids = df_people.loc[
#                 df_people["id"].isin(partners), "uuid"
#             ].tolist()
#             partners_ids = [str(x) for x in partners]
            df_tmp["rels"]["spouses"] = [str(x) for x in partners]

        data_output.append(df_tmp)
    return data_output

In [162]:
parent_ids = df_relationships_people.loc[
    df_relationships_people["title"] == "parent", "people_id"
].tolist()
children_ids = df_relationships_people.loc[
    df_relationships_people["title"] == "child", "people_id"
].tolist()
partner_ids = df_relationships_people.loc[
    df_relationships_people["title"] == "partner", "people_id"
].tolist()

In [166]:
df_relationships_people.loc[
                (df_relationships_people["relationship_id"] == 2)
                & (df_relationships_people["title"] == "parent"),
                "people_id",
            ].tolist()

[88, 10]

In [172]:
df_people.loc[df_people["id"].isin([88,10])]

Unnamed: 0,id,first_name,middle_name,last_name,maiden_name,preferred_name,nickname,non_english_name,date_of_birth,date_of_death,gender,sex,uuid
10,10,evelyn,antoinette,schorr,kantorov,evelyn,evelyn,גאיה ניצן,1914-06-04,1996-05-14,female,female,8623b31e-3b0f-42d9-badd-27ffda73fbad
88,88,olive,,kantorov,schechter,olive,olive,גלית,1920-05-22,1964-11-27,female,female,de9ce9e8-35df-4eef-8409-d6c88982349d


In [173]:
df_relationships_people

Unnamed: 0,relationship_id,people_id,title
0,1,81,partner
1,1,25,partner
2,3,88,partner
3,3,10,partner
4,4,10,partner
5,4,61,partner
6,6,94,partner
7,6,19,partner
8,8,91,partner
9,8,13,partner


In [160]:
data_output = convert_format(df_people, df_relationships, df_relationships_people)

0
6
7
10
13
17
19
22
25
27
40
41
42
45


IndexError: list index out of range

In [152]:
data_output

[{'id': '6',
  'rels': {'children': ['38'], 'spouses': ['91']},
  'data': {'gender': 'F',
   'first name': 'lillie',
   'last name': 'thalmann',
   'birthday': '1884-10-29',
   'avatar': ''}},
 {'id': '13',
  'rels': {'children': ['18'], 'spouses': ['27']},
  'data': {'gender': 'F',
   'first name': 'mary',
   'last name': 'mayzel',
   'birthday': '1883-02-26',
   'avatar': ''}},
 {'id': '18',
  'rels': {'mother': '13', 'father': '27'},
  'data': {'gender': 'M',
   'first name': 'david',
   'last name': 'mayzel',
   'birthday': '1945-08-01',
   'avatar': ''}},
 {'id': '21',
  'rels': {'children': ['79', '75'], 'spouses': ['69']},
  'data': {'gender': 'M',
   'first name': 'john',
   'last name': 'lotman',
   'birthday': '1893-12-07',
   'avatar': ''}},
 {'id': '27',
  'rels': {'children': ['18'], 'spouses': ['13']},
  'data': {'gender': 'M',
   'first name': 'lester',
   'last name': 'mayzel',
   'birthday': '1891-01-03',
   'avatar': ''}},
 {'id': '28',
  'rels': {'mother': '75', 'fat

In [153]:
with open(f"../data/outputs/data_output_{str(int(datetime.datetime.now().timestamp()))}.json","w") as fp:
    json.dump(data_output, fp, indent=4)

In [87]:
df_relationships_people.loc[
        df_relationships_people["title"] == "parent", "people_id"
    ].tolist()

[21, 69, 13, 27, 90, 75, 91, 6]

In [156]:
df_relationships_people


Unnamed: 0,relationship_id,people_id,title
0,1,21,partner
1,1,69,partner
2,3,13,partner
3,3,27,partner
4,6,90,partner
5,6,75,partner
6,7,91,partner
7,7,6,partner
8,0,21,parent
9,0,69,parent


In [38]:
df_people.head()

Unnamed: 0,id,first_name,middle_name,last_name,maiden_name,preferred_name,nickname,non_english_name,date_of_birth,date_of_death,gender,sex
6,6,lillie,frances laura,thalmann,zinkin,lillie,lillie,חיה אריאל מאיה,1884-10-29,1958-11-29,female,female
13,13,mary,grace anna,mayzel,hirsch,mary,mary,רחל אסתר אופיר,1883-02-26,1922-10-24,female,female
18,18,david,michael,mayzel,mayzel,david,david,גיא יוסף,1945-08-01,2014-03-10,male,male
21,21,john,herman edward,lotman,lotman,john,john,חנוך אבי ינאי,1893-12-07,1975-10-22,male,male
27,27,lester,robert frank,mayzel,mayzel,lester,lester,משה אלכסנדר דניאל,1891-01-03,1941-07-23,male,male


In [72]:
df_relationships_people.loc[(df_relationships_people["people_id"]==27) & (df_relationships_people["title"]=="partner"), "relationship_id"].tolist()[0]

3

In [157]:
df_people, df_relationships, df_relationships_people = generate_tree()
# display(df_people.head())
display(df_relationships.head())
display(df_relationships_people.head())

Unnamed: 0,id,type,subtype,start_date,end_date
0,0,parent_child,,,
1,1,partnership,marriage,1929-01-22,1972-07-29
2,2,parent_child,,,
3,3,partnership,partnered,1941-05-19,1958-10-24
4,4,partnership,marriage,1968-09-09,2009-07-23


Unnamed: 0,relationship_id,people_id,title
0,1,81,partner
1,1,25,partner
2,3,88,partner
3,3,10,partner
4,4,10,partner


# Generate Family Tree Data

## People

|name|type|  
|---|---|  
|id|INTEGER|
|type|TEXT|
|pet_type|TEXT|
|first_name|TEXT|
|middle_name|TEXT|
|last_name|TEXT|
|preferred_name|TEXT|
|nickname|TEXT|
|hebrew_name|TEXT|
|date_of_birth|TEXT|
|date_of_adoption|TEXT|
|date_of_death|TEXT|
|gender|TEXT|

In [194]:
fake.name

False

In [107]:
people_list = []
birth_days_ago = random.sample(
    range(365 * 120, 180, -1),
    k=1000,
    counts=[2 * x for x in range(1, 365 * 120 - 180 + 1)],
)
for i in range(100):
    person_type = random.sample(["human", "pet"], k=1, counts=[20, 1])[0]
    #     birth_days_ago = random.randint(180, 125 * 365)
    birth_datetime = datetime.date.today() - datetime.timedelta(days=birth_days_ago[i])

    if person_type == "pet":
        pet_type = random.sample(
            ["dog", "cat", "fish", "lizard", "bird"], k=1, counts=[15, 8, 3, 1, 1]
        )[0]
        lifespan = random.randint(4 * 365, 20 * 365)
        adoption_datetime = birth_datetime + datetime.timedelta(
            days=random.randint(0, 3 * 365)
        )
        adoption_date = adoption_datetime.isoformat()
        gender = random.sample(["male", "female", "unknown"], k=1, counts=[20, 20, 1])[0]
    else:
        pet_type = None
        lifespan = int(random.normalvariate(80, 15)) * 365 + random.randint(1, 365)
        adoption_date = None
        gender = random.sample(
            ["male", "female", "nonbinary", "other"], k=1, counts=[10, 10, 1, 1]
        )[0]

    birth_date = birth_datetime.isoformat()
    death_datetime = birth_datetime + datetime.timedelta(days=lifespan)
    if death_datetime <= datetime.date.today():
        death_date = death_datetime.isoformat()
    else:
        death_date = None
    
    if gender == "male":
        df_first_names_small = df_first_names.loc[(df_first_names["year"]==birth_datetime.year) & (df_first_names["sex"]=="m")]
        df_hebrew_names_small = df_hebrew_names.loc[df_hebrew_names["sex"]=="m"]
    elif gender == "female":
        df_first_names_small = df_first_names.loc[(df_first_names["year"]==birth_datetime.year) & (df_first_names["sex"]=="f")]
        df_hebrew_names_small = df_hebrew_names.loc[df_hebrew_names["sex"]=="f"]
    elif gender == "nonbinary":
        df_first_names_small = df_first_names.loc[(df_first_names["year"]==birth_datetime.year)]
        df_hebrew_names_small = df_hebrew_names
    else:
        df_first_names_small = df_first_names
        df_hebrew_names_small = df_hebrew_names
    names = random.sample(df_first_names_small["name"].tolist(), k=5, counts=df_first_names_small["count"].tolist())
    first_name = names[0]
    middle_names_count = random.sample(range(3),1)[0]
    middle_name = " ".join(names[1:1+middle_names_count])
    hebrew_name_list = random.sample(df_hebrew_names_small["name"].tolist(), k=3, counts=df_hebrew_names_small["first_name_count"].tolist())
    hebrew_names = " ".join(hebrew_name_list[0:1+middle_names_count])
    
    
    

    tmp_dict = {
        "id": i,
        "type": person_type,
        "pet_type": pet_type,
        "first_name": first_name,
        "middle_name": middle_name,
        "last_name": "".join(random.sample(letters, random.randint(4, 10))),
        "preferred_name": first_name,
        "nickname": first_name,
        "hebrew_name": hebrew_names,
        "date_of_birth": birth_date,
        "date_of_adoption": adoption_date,
        "date_of_death": death_date,
        "gender": gender,
    }
    people_list.append(tmp_dict)
df_people = pd.DataFrame(people_list)
df_people.head(20)

Unnamed: 0,id,type,pet_type,first_name,middle_name,last_name,preferred_name,nickname,hebrew_name,date_of_birth,date_of_adoption,date_of_death,gender
0,0,human,,hudson,,nwerdumzoy,hudson,hudson,שמעון,2012-12-05,,,male
1,1,human,,betty,,zsdxlrby,betty,betty,נעמי,1943-03-12,,2022-08-31,female
2,2,human,,isaac,isaac,sfmgzbt,isaac,isaac,משה אורי,2008-01-17,,,male
3,3,human,,carson,carlos sean,jqcvwlab,carson,carson,יוסף אייל טל,2003-06-09,,,male
4,4,pet,cat,sam,lorenzo leonidas,masupfyoh,sam,sam,משה יבגני נתנאל,2023-06-06,2026-03-23,,male
5,5,human,,amber,amanda sharla,cozvutrwsb,amber,amber,רינה דבורה רוזה,1980-01-08,,,female
6,6,human,,breanna,heather sarah,lcsufavn,breanna,breanna,מעיין כרמית טל,1987-10-06,,,female
7,7,human,,courtney,,hwfqysegkn,courtney,courtney,לידיה,2012-01-19,,,female
8,8,human,,muriel,elizabeth,byvjexml,muriel,muriel,סופיה הודיה,1936-06-24,,2017-07-04,female
9,9,human,,patrick,adela susan,mkilqpocb,patrick,patrick,ספיר מאור יגאל,2018-02-07,,,other


In [102]:
 random.sample(range(3),1)[0]

2

'joel elizabeth'

|name|type|  
|---|---|  
|id|INTEGER|
|type|TEXT|
|pet_type|TEXT|
|first_name|TEXT|
|middle_names|TEXT|
|last_name|TEXT|
|preferred_name|TEXT|
|hebrew_name|TEXT|
|date_of_birth|TEXT|
|date_of_adoption|TEXT|
|date_of_death|TEXT|
|gender|TEXT|

## Relationships

|name|type|  
|---|---|  
|id|INTEGER|
|type|TEXT|
|person_one_id|INTEGER|
|person_two_id|INTEGER|
|person_one_relationship|TEXT|
|person_two_relationship|TEXT|
|start_date|TEXT|
|end_date|TEXT|
|is_active|INTEGER|

In [310]:
pd.to_datetime(df_people["date_of_birth"].iloc[0]) - pd.Timedelta(4, unit="days")

Timestamp('1944-11-25 00:00:00')

In [355]:
x = (pd.to_datetime(df_people["date_of_birth"]) - pd.to_datetime(df_people["date_of_birth"].iloc[0])).apply(lambda x: abs(x.days))<10*365
y = x.loc[x==True].index.tolist()
# random.sample(x.loc[x==True].index.tolist(),1)

In [361]:
df_people.loc[df_people["id"] == i, "type"].iloc[0]

'human'

In [359]:
person_type

2    human
Name: type, dtype: object

In [368]:
person_one_id

19

In [412]:
x.date().isoformat()

'2012-07-15'

In [413]:
(
    (
        pd.to_datetime(
            df_people.loc[
                df_people["id"].isin([person_one_id, person_two_id]), "date_of_birth"
            ].max()
        )
        + pd.Timedelta(365 * 20, unit="day")
    )
).date().isoformat()

'2012-07-15'

In [452]:
relationships_list = []
for i in range(20):
    relationship_type = random.sample(
        ["marriage", "child_parent", "pet_parent", "partnership"],
        k=1,
        #         counts=[20, 20, 1, 1],
        counts=[2000, 20, 1, 1],
    )[0]
    person_type = df_people.loc[df_people["id"] == i, "type"].iloc[0]
    if relationship_type == "marriage" and person_type == "human":

        birth_range = (
            pd.to_datetime(df_people["date_of_birth"])
            - pd.to_datetime(df_people["date_of_birth"].iloc[i])
        ).apply(lambda x: abs(x.days)) < 10 * 365
        partner_index_list = birth_range.loc[birth_range == True].index.tolist()
        partner_index_list = [x for x in partner_index_list if x != i]
        if len(partner_index_list) > 0:
            partner_index = random.sample(partner_index_list, 1)[0]
            person_one_id = i
            person_two_id = partner_index
            person_one_relationship = "spouse"
            person_two_relationship = "spouse"
            younger_age = df_people.loc[
                df_people["id"].isin([person_one_id, person_two_id]),
                "date_of_birth",
            ].max()
            min_start_date = (
                ((pd.to_datetime(younger_age) + pd.Timedelta(365 * 19, unit="day")))
                .date()
                .isoformat()
            )
            first_death = (
                df_people.loc[
                    df_people["id"].isin([person_one_id, person_two_id]),
                    "date_of_death",
                ]
                .fillna("2099-01-01")
                .min()
            )
            if min_start_date < first_death:
                max_days = min(
                    (pd.to_datetime(first_death) - pd.to_datetime(min_start_date)).days,
                    40 * 365,
                )
                start_date = (
                    (
                        pd.to_datetime(min_start_date)
                        + pd.Timedelta(random.randint(0, max_days), unit="day")
                    )
                    .date()
                    .isoformat()
                )
                if start_date > datetime.date.today().isoformat():
                    start_date = None
            else:
                start_date = None

    #         pet_type = random.sample(
    #             ["dog", "cat", "fish", "lizard", "bird"], k=1, counts=[15, 8, 3, 1, 1]
    #         )
    #         lifespan = random.randint(4 * 365, 20 * 365)
    #         adoption_datetime = birth_datetime + datetime.timedelta(
    #             days=random.randint(0, 3 * 365)
    #         )
    #         adoption_date = adoption_datetime.isoformat()
    #         gender = random.sample(["male", "female", "unknown"], k=1, counts=[20, 20, 1])
    else:
        person_one_id = -1
        person_two_id = -1
        person_one_relationship = None
        person_two_relationship = None
        start_date = None
    #         pet_type = None
    #         lifespan = int(random.normalvariate(80, 15)) * 365 + random.randint(1, 365)
    #         adoption_date = None
    #         gender = random.sample(
    #             ["male", "female", "nonbinary", "other"], k=1, counts=[1, 1, 1, 1]
    #         )

    #     birth_date = birth_datetime.isoformat()
    #     death_datetime = birth_datetime + datetime.timedelta(days=lifespan)
    #     if death_datetime <= datetime.date.today():
    #         death_date = death_datetime.isoformat()
    #     else:
    #         death_date = None

    tmp_dict = {
        "id": i,
        "type": relationship_type,
        "person_one_id": person_one_id,
        "person_two_id": person_two_id,
        "person_one_relationship": person_one_relationship,
        "person_two_relationship": person_two_relationship,
        "start_date": start_date,
    }
    relationships_list.append(tmp_dict)
df_relationships = pd.DataFrame(relationships_list)
df_bad_marriage = df_relationships.loc[
    (df_relationships["type"] == "marriage")
    & (
        (df_relationships["person_one_relationship"].isna())
        | (df_relationships["start_date"].isna())
    ),
    "id",
].tolist()
df_relationships = df_relationships.loc[~df_relationships["id"].isin(df_bad_marriage)]
df_relationships.head(20)

Unnamed: 0,id,type,person_one_id,person_two_id,person_one_relationship,person_two_relationship,start_date
0,0,marriage,0,859,spouse,spouse,1993-03-19
2,2,marriage,2,176,spouse,spouse,2017-09-30
3,3,marriage,3,92,spouse,spouse,2017-02-25
4,4,marriage,4,284,spouse,spouse,2005-04-29
8,8,marriage,8,401,spouse,spouse,1995-07-07
9,9,marriage,9,923,spouse,spouse,2017-05-28
10,10,marriage,10,582,spouse,spouse,2012-03-23
12,12,marriage,12,94,spouse,spouse,2008-09-07
14,14,marriage,14,985,spouse,spouse,2011-07-17
16,16,marriage,16,189,spouse,spouse,2001-12-22


[1, 5, 6, 7, 8, 10, 11, 15, 17, 19]

# Old functions - dates in string format

In [30]:
def generate_person(birth_year: int = None, gender: str = None, sex: str = None):
    if gender is None:
        gender = random.sample(
            ["male", "female", "nonbinary", "other"], k=1, counts=[10, 10, 1, 1]
        )[0]
    elif gender not in ["male", "female", "nonbinary", "other"]:
        raise ValueError("gender must be one of male, female, nonbinary, or other")

    if sex is None:
        if gender == "male":
            sex = random.sample(["male", "female"], k=1, counts=[20, 1])[0]
        elif gender == "female":
            sex = random.sample(["female", "male"], k=1, counts=[20, 1])[0]
        else:
            sex = random.sample(["female", "male"], k=1, counts=[1, 1])[0]
    elif sex not in ["male", "female", "intersex"]:
        raise ValueError("sex must be one of male, female, or intersex")

    if birth_year is None:
        birth_year = random.sample(
            population=range(2025, 1880, -1),
            k=1,
            counts=[x for x in range(1, 2025 - 1880 + 1)],
        )[0]
    birth_datetime = (
        datetime.datetime.strptime(str(birth_year), "%Y")
        + datetime.timedelta(days=random.randint(0, 365))
    ).date()
    # formula based off of life expectancies from CDC (https://healthdata.gov/dataset/NCHS-Death-rates-and-life-expectancy-at-birth/4r8i-dqgb/about_data)
    life_expectancy = int(521.64 * np.log(birth_year) - 3887)
    lifespan = (
        life_expectancy + int(random.normalvariate(0, 15))
    ) * 365 + random.randint(1, 365)
    death_datetime = birth_datetime + datetime.timedelta(days=lifespan)
    if death_datetime <= datetime.date.today():
        death_date = death_datetime.isoformat()
    else:
        death_date = None

    if gender == "male":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "m")
        ]
        df_hebrew_names_small = df_hebrew_names.loc[df_hebrew_names["sex"] == "m"]
    elif gender == "female":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "f")
        ]
        df_hebrew_names_small = df_hebrew_names.loc[df_hebrew_names["sex"] == "f"]
    elif gender == "nonbinary":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
        ]
        df_hebrew_names_small = df_hebrew_names
    else:
        df_first_names_small = df_first_names
        df_hebrew_names_small = df_hebrew_names
    names = random.sample(
        df_first_names_small["name"].tolist(),
        k=5,
        counts=df_first_names_small["count"].tolist(),
    )
    first_name = names[0]
    middle_names_count = random.sample(range(3), 1)[0]
    middle_name = " ".join(names[1 : 1 + middle_names_count])
    hebrew_name_list = random.sample(
        df_hebrew_names_small["name"].tolist(),
        k=3,
        counts=df_hebrew_names_small["first_name_count"].tolist(),
    )
    non_english_name = " ".join(hebrew_name_list[0 : 1 + middle_names_count])
    last_name = df_jewish_surnames["surname"].sample().iloc[0]

    person_dict = {
        "first_name": first_name,
        "middle_name": middle_name,
        "last_name": last_name,
        "maiden_name": last_name,
        "preferred_name": first_name,
        "nickname": first_name,
        "non_english_name": non_english_name,
        "date_of_birth": birth_datetime.isoformat(),
        #         "date_of_adoption": adoption_date,
        "date_of_death": death_date,
        "gender": gender,
        "sex": sex,
    }
    return person_dict

In [8]:
def generate_pet(species: str = None, sex: str = None, birth_year: int = None):
    if species is None:
        species = random.sample(
            ["dog", "cat", "fish", "lizard", "bird"], k=1, counts=[15, 8, 3, 1, 1]
        )[0]
    elif species not in ["dog", "cat", "fish", "lizard", "bird"]:
        raise ValueError("species must be one of dog, cat, fish, lizard, or bird")

    if sex is None:
        sex = random.sample(["male", "female", "unknown"], k=1, counts=[10, 10, 1])[0]
    elif gender not in ["male", "female", "unknown"]:
        raise ValueError("gender must be one of male, female, or unknown")

    if birth_year is None:
        birth_year = random.sample(
            population=range(2025, 1880, -1),
            k=1,
            counts=[x for x in range(1, 2025 - 1880 + 1)],
        )[0]
    birth_datetime = (
        datetime.datetime.strptime(str(birth_year), "%Y")
        + datetime.timedelta(days=random.randint(0, 365))
    ).date()

    was_adopted = random.sample(population=[True, False], k=1, counts=[10, 1])[0]
    if was_adopted:
        adopted_datetime = birth_datetime + datetime.timedelta(
            days=random.randint(0, 365 * 4)
        )
        adoption_date = adopted_datetime.isoformat()
    else:
        adoption_date = None

    life_expectancy_dict = {"dog": 12, "cat": 16, "fish": 5, "lizard": 15, "bird": 20}
    life_expectancy = life_expectancy_dict[species]
    lifespan = (
        life_expectancy + int(random.normalvariate(0, life_expectancy / 5))
    ) * 365 + random.randint(1, 365)
    death_datetime = birth_datetime + datetime.timedelta(days=lifespan)
    if death_datetime <= datetime.date.today():
        death_date = death_datetime.isoformat()
    else:
        death_date = None

    if sex == "male":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "m")
        ]
    elif sex == "female":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
            & (df_first_names["sex"] == "f")
        ]
    elif sex == "unknown":
        df_first_names_small = df_first_names.loc[
            (df_first_names["year"] == birth_datetime.year)
        ]
    else:
        df_first_names_small = df_first_names
    names = random.sample(
        df_first_names_small["name"].tolist(),
        k=5,
        counts=df_first_names_small["count"].tolist(),
    )
    first_name = names[0]
    middle_names_count = random.sample(population=range(3), k=1, counts=[5, 2, 1])[0]
    middle_name = " ".join(names[1 : 1 + middle_names_count])

    pet_dict = {
        "species": species,
        "first_name": first_name,
        "middle_name": middle_name,
        "date_of_birth": birth_datetime.isoformat(),
        "date_of_adoption": adoption_date,
        "date_of_death": death_date,
        "sex": sex,
    }
    return pet_dict

In [96]:
def generate_relationship(
    relationship_type: str = None, date_start: str = None, date_end: str = None
):
    if relationship_type is None:
        relationship_type = random.sample(
            ["parent_child", "partnership", "owner_pet"],
            k=1,
            counts=[10, 10, 1],
        )[0]
    elif relationship_type not in ["parent_child", "partnership", "owner_pet"]:
        raise ValueError(
            "relationship_type must be one of parent_child, partnership, or owner_pet"
        )
    relationship_subtype = None
    start_date = None
    end_date = None
    if relationship_type == "partnership":
        relationship_subtype = random.sample(
            ["marriage", "partnered"], k=1, counts=[10, 1]
        )[0]
        if date_start is None:
            start_year = random.sample(
                population=range(2025, 1900, -1),
                k=1,
                counts=[x for x in range(1, 2025 - 1900 + 1)],
            )[0]
        start_datetime = (
            datetime.datetime.strptime(str(start_year), "%Y")
            + datetime.timedelta(days=random.randint(0, 365))
        ).date()
        start_date = start_datetime.isoformat()

        duration = random.sample(population=range(5 * 365, 80 * 365), k=1)[0]

        end_datetime = start_datetime + datetime.timedelta(days=duration)
        if end_datetime <= datetime.date.today():
            end_date = end_datetime.isoformat()

    relationship_dict = {
        "type": relationship_type,
        "subtype":relationship_subtype,
        "start_date": start_date,
        "end_date": end_date,
    }
    return relationship_dict

In [97]:
def generate_tree(
    people: pd.DataFrame = None,
    relationships: pd.DataFrame = None,
    people_count: int = 0,
    generations: int = 0,
):
    if people is None:
        if people_count == 0:
            people_count = 50
        df_people = pd.DataFrame([generate_person() for _ in range(people_count)])
        df_people = df_people.reset_index(names="id")
    if relationships is None:
        relationship_count = int(people_count / 10)
        df_relationships = pd.DataFrame(
            [generate_relationship() for _ in range(relationship_count)]
        )
        df_relationships = df_relationships.reset_index(names="id")
    # Create partnerships    
    for idx, val in df_relationships.loc[df_relationships["type"]=="partnership"].iterrows():
        eligible_birth_date = (datetime.datetime.strptime(str(val.start_date), "%Y-%m-%d") - datetime.timedelta(days=19*365)).date().isoformat()
        if val.end_date is not None:
            eligibile_starter = df_people.loc[(df_people["date_of_birth"] < eligible_birth_date) & 
                                               ((df_people["date_of_death"] > val.end_date) | df_people["date_of_death"].isna())]
        else:
            eligibile_starter = df_people.loc[(df_people["date_of_birth"] < eligible_birth_date) & (df_people["date_of_death"].isna())]
        if eligibile_starter.shape[0]>0:
            partner_1 = random.sample(eligibile_starter["id"], k=1)[0]
            birth_range = (
            pd.to_datetime(df_people["date_of_birth"])
            - pd.to_datetime(df_people["date_of_birth"].iloc[i])
        ).apply(lambda x: abs(x.days)) < 10 * 365
            eligible_partners = 
            

    return df_people, df_relationships