In [4]:
from datetime import date
import numpy as np
import pandas as pd
import dataclasses
from typing import Dict, Optional
import more_itertools
import os


In [5]:
@dataclasses.dataclass(frozen=True)
class DataSource:
    """
    A dataset whose columns are renamed so it can be combined with other DataSources.

    Fields:
        data: the source frame; ``remap`` reads from it and returns a new frame.
        column_mapping: maps New Column Name -> Old Column Name. A value of
            ``None`` means the new column is not available in this dataset.
        name: optional label written to a ``data_source`` column by ``remap``.
    """

    data: pd.DataFrame
    column_mapping: Dict[str, Optional[str]]  # maps New Column Name -> Old Column Name
    name: Optional[str] = None

    def remap(self) -> pd.DataFrame:
        """
        Rename and select a subset of data columns.

        Returns:
            A new frame holding only the mapped columns under their new
            names, plus a ``data_source`` column when ``name`` is set.

        Raises:
            KeyError: if a mapped old column is not present in ``data``.
            ValueError: if ``name`` is set and the renamed frame already has
                a ``data_source`` column.
        """
        # Values mapped to None mean that value isn't available in the given dataset.
        # The mapping is inverted here, so if two new names point at the same
        # old column only the last one listed survives.
        rename_dict = {
            old: new for new, old in self.column_mapping.items() if old is not None
        }

        # Fail with the source label so a bad mapping is easy to trace when
        # many sources are combined in one call.
        missing = [old for old in rename_dict if old not in self.data.columns]
        if missing:
            label = repr(self.name) if self.name is not None else "<unnamed>"
            raise KeyError(
                f"DataSource {label} is missing source column(s): {missing}"
            )

        # Index with a real list (not a dict_keys view); rename() returns a new
        # frame, so adding a column below never touches self.data.
        df = self.data[list(rename_dict)].rename(columns=rename_dict)

        if self.name is not None:
            if "data_source" in df.columns:
                raise ValueError("df already contains data source column.")

            df["data_source"] = self.name

        return df
    
def combine(*data_sources, require_matched_columns: bool = False) -> pd.DataFrame:
    """
    Concatenate DataSources into a single df.

    Args:
        *data_sources: objects exposing ``column_mapping``, ``name`` and
            ``remap()`` (i.e. DataSource instances).
        require_matched_columns: when True, every source must list the same
            set of new column names.

    Returns:
        The remapped frames stacked row-wise with a fresh 0..n-1 index. When
        at least one source has a name, ``data_source`` is a categorical column.

    Raises:
        ValueError: if ``require_matched_columns`` is True and the sources do
            not share the same new column names.
    """

    # dict key views compare like sets, so key order does not matter here.
    column_names = [s.column_mapping.keys() for s in data_sources]
    if require_matched_columns and not more_itertools.all_equal(column_names):
        raise ValueError("All DataSources must list the same set of new column names.")

    processed = [data_source.remap() for data_source in data_sources]
    result = pd.concat(processed, ignore_index=True)

    # Use the same "is not None" test that remap() uses when it writes the
    # column; a truthiness test would skip sources named "" even though they
    # do carry a data_source column.
    if any(s.name is not None for s in data_sources):
        result["data_source"] = result["data_source"].astype("category")

    return result

In [70]:
# Every raw source CSV is read from this folder of the IndigentBurials GitHub repository.
RAW_DATA_BASE_URL = "https://raw.githubusercontent.com/IndigentBurials/indigent-burials/main/python/web-scraping/"


def _read_source_csv(relative_path: str) -> pd.DataFrame:
    """
    Read one raw source CSV and strip surrounding whitespace from its column names.

    Args:
        relative_path: path of the CSV below ``RAW_DATA_BASE_URL``, already
            URL-encoded (e.g. ``%20`` for spaces).

    Returns:
        The loaded frame with whitespace-trimmed column names.
    """
    frame = pd.read_csv(RAW_DATA_BASE_URL + relative_path)
    return frame.rename(columns=lambda x: x.strip())


dig_memorial = _read_source_csv("oregon-and-dignity-memorial/data/dignity-memorial.csv")
chicago_burials = _read_source_csv("chicago-cookcounty/data/Medical_Examiner_-_Burial_Locations.csv")
chicago_cremations = _read_source_csv("chicago-cookcounty/data/Medical_Examiner_Indigent_Cremations.csv")
king_county = _read_source_csv("KingCounty/data/KingCounty%20Seattle%20Burial%20Names%20%20-%20Sheet1.csv")
bernalillo = _read_source_csv("bernalillo-county/data/bernalillo%20county_indigent_unfilters.csv")
dona_ana = _read_source_csv("dona-ana-scraping/data/dona-ana-cleaned-v2.csv")
fresno = _read_source_csv("fresno-scraping/data/fresno-cleaned3-v2.csv")
hart_island = _read_source_csv("hart-island-web-scraper/data/output/hart-island.csv")
la_county = _read_source_csv("la-county/data/LA%20Cremation%20Data%20Enhanced.csv")
namus = _read_source_csv("namus/data/unclaimed_states_combined_cleaned.csv")
oregon = _read_source_csv("oregon-and-dignity-memorial/data/oregon.csv")
yakima_county = _read_source_csv("yakima-county/data/Unclaimed%20Remains_Yakima%20County.csv")
yellowstone = _read_source_csv("yellowstone_mt/Riverside%20Indigent%20Burials%202021.csv")
oakland_mi = _read_source_csv("oakland_mi/oakland%2Cmi%20data.csv")
orange_county = _read_source_csv("orange_county_ca/Orangecounty1999-2022.csv")
pima_county = _read_source_csv("pima_county_az/pimacountyAZ.csv")

In [71]:
# Preview the raw Pima County frame; its column headers are the old column
# names that _map_pima_county refers to.
pima_county

Unnamed: 0,Identification - Undocumented border crosser,Identification - Last name,Identification - First name,Identification - Middle name,Identification - Suffix,Identification - Homeless,Identification - Date of birth,Identification - Date of death,Identification - Age,Identification - Age months,Identification - Age days,Identification - Veteran status,Indigent Interment - Disposition approved for,Indigent Interment - Interment location
0,False,Coomes,Larry,L,,No,5/15/32 0:00,1/10/21 0:30,88.0,7.0,26.0,Yes,Cremation,"East Lawn Cemetery Sheldon, IA"
1,False,Morton,Winsome,Rita,,No,10/12/48 0:00,2/22/21 18:00,72.0,4.0,10.0,No,Cremation,East Lawn Palms Cemetery
2,False,Wood,Cheryl,L,,No,,8/2/18 0:00,,,,No,Burial,East Lawn Palms Cemetery
3,False,Richardson,Kinneret,Sarah,,No,10/10/45 0:00,7/29/19 9:15,73.0,9.0,18.0,No,Burial,Evergreen Cemetery
4,False,O'Mara,Roger,Emil,,No,8/21/36 0:00,3/28/20 1:00,83.0,7.0,7.0,No,Cremation,"Evergreen Cemetery, decedent owns a plot."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089,False,Young,John,Maurice,,Yes,6/3/54 0:00,4/2/22 9:13,67.0,9.0,30.0,Yes,Cremation,Southern AZ VA Memorial Cemetery in Marana
1090,False,Zimmerman,Harold,Paul,,No,2/18/40 0:00,4/9/20 18:55,80.0,1.0,22.0,Yes,Cremation,Southern AZ VA Memorial Cemetery in Marana
1091,False,Lynch,Rex,,Jr.,No,2/7/43 0:00,4/16/20 7:49,77.0,2.0,9.0,Yes,Cremation,Southern AZ VA Memorial Cemetery in Marana
1092,False,LAWSON,DENNIS,WARREN,,No,10/28/50 0:00,8/31/15 17:53,64.0,10.0,3.0,Yes,Cremation,Southern AZ VA Memorial Cemetery in Sierra Vista


In [72]:
def _map_pima_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Arizona"
    df["Jurisdiction"] = "Pima County"
    df["isVeteran"] = df["Identification - Veteran status"].replace({"Yes": 1, "No": 0})
    df_dict = {
        "LName": "Identification - Last name",
        "FName": "Identification - First name",
        "MName": "Identification - Middle name",
        "DOD": "Identification - Date of death",
        "DOB": "Identification - Date of birth",
        "Age": "Identification - Age",
        "isVeteran": "isVeteran",
        "Sate": "State",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict


def _map_orange_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "California"
    df["Jurisdiction"] = "Orange County"
    df_dict = {
        "LName": "NameLast",
        "FName": "NameFirst",
        "MName": "NameMiddle",
        "DOD": "DeathDate",
        "Sate": "State",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_oakland_mi(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Missouri"
    df["Jurisdiction"] = "Oakland"
    df[['FName', 'LName']] = df['Name'].str.split(n=1, expand=True)
    
    df_dict = {
        "LName": "LName",
        "FName": "FName",
        "DOD": "Date of Death",
        "Sex": "Gender",
        "Age": "Age",
        "Race": "Race",
        "Sate": "State",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_yellowstone(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Montana"
    df["Jurisdiction"] = "Riverside County"
    
    df_dict = {
        "LName": "LNAME",
        "FName": "FNAME",
        "MName": "MNAME",
        "DOB": "DOB",
        "DOD": "DOD",
        "DB": "DOI",
        "Sex": "SEX",
        "Age": "AGEYR",
        "LastResidence": "LASTRESID",
        "PlaceOfBirth": "PLBIRTH",
        "MaritalStatus": "MSTATUS",
        "Cremated": "CREMCODE",
        "State": "State",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_yakima_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Washington"
    df["Jurisdiction"] = "Yakima County"
    df["SourceURL"] = "https://dhs.lacounty.gov/home-public-resources-locate-deceased-persons/"
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB:",
        "DOD": "DOD:",
        "State": "State",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_oregon(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Oregon"
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "BD": "Buried",
        "State": "State",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdicition",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_namus(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "DOD": "DBF",
        "RaceEthnicity": "Race/Ethnicity",
        "Sex": "Sex",
        "City": "City",
        "County": "County",
        "State": "State",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdicition",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_la_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "California"
    
    df_dict = {
        "LName": "Last.Ne",
        "FName": "First.Ne",
        "MName": "Middle.Ne",
        "BD": "Date.of.Cremation..mm.dd.yyyy.",
        "SourceURL": "Source URL",
        "State": "State",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_hart_island(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LName",
        "FName": "FName",
        "DOD": "DOD",
        "Age": "Age",
        "Sex": "Sex",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_fresno(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Age": "Age",
        "Sex": "Sex",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_dona_ana(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Age": "Age",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_bernalillo(df: pd.DataFrame):
    df["Jurisdiction"] = "Bernalill0"
    df["LastModified"] = date.today()
    df["isVeteran"] = df["Veteran"].replace({"Yes": 1, "No": 0})
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "OName": "Maiden Name",
        "DOB": "Date of Birth",
        "DOD": "Date of Death",
        "BD": "Cremation Date",
        "isVeteran": "isVeteran",
        "MilitaryAffiliation": "Military Branch",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_king_county(df: pd.DataFrame):
    df["Jurisdiction"] = "King County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "BD": "Year of Burial",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    
    return df_dict

def _map_chicago_cremations(df: pd.DataFrame):
    df["Jurisdiction"] = "Chicago, Cook County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LastName",
        "FName": "Name",
        "MName": "MiddleName",
        "Age": "Age",
        "Sex": "Sex",
        "RaceEthnicity": "Race",
        "BD": "Cremation Date",
        "DOD": "Date of Death",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
        
    return df_dict

def _map_dignity_memorial(df: pd.DataFrame):
    df["isVeteran"] = 1
    df["LastModified"] = date.today()
    df["DOB"] = pd.to_datetime(df["DOB"], utc=True, errors='coerce').dt.date
        
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Jurisdiction": "Jurisdicition",
        "SourceURL": "Source URL",
        "DateScraped": "Date Scraped",
        "LastModified": "LastModified",
        "isVeteran": "isVeteran",
        "MilitaryAffiliation": "Department Of Defense"
    }
    
    return df_dict

def _map_chicago_burials(df: pd.DataFrame):
    df["Jurisdiction"] = "Chicago, Cook County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LastName",
        "FName": "FirstName",
        "MName": "MiddleName",
        "Age": "Age",
        "Sex": "Sex",
        "RaceEthnicity": "Race",
        "BD": "Burial Date",
        "DOD": "Date of Death",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
        
    return df_dict
            
# _map_yakima_county is defined once, earlier in this cell (after
# _map_yellowstone). A second, identical definition at this position only
# shadowed that one, so it is not repeated here.


In [73]:
# Build the New Column Name -> Old Column Name mapping for every source.
# Each _map_* call also writes its constant/derived columns (e.g. LastModified)
# into the frame it is given, in place, so these calls have to run before
# combine() selects those columns from the frames.
dig_memorial_dict = _map_dignity_memorial(dig_memorial)
chicago_burials_dict = _map_chicago_burials(chicago_burials)
chicago_cremations_dict =  _map_chicago_cremations(chicago_cremations)
king_county_dict = _map_king_county(king_county)
bernalillo_dict = _map_bernalillo(bernalillo)
dona_ana_dict = _map_dona_ana(dona_ana)
fresno_dict = _map_fresno(fresno)
hart_island_dict = _map_hart_island(hart_island)
la_county_dict = _map_la_county(la_county)
namus_dict = _map_namus(namus)
oregon_dict = _map_oregon(oregon)
yakima_county_dict = _map_yakima_county(yakima_county)
yellowstone_dict = _map_yellowstone(yellowstone)
oakland_mi_dict = _map_oakland_mi(oakland_mi)
orange_county_dict = _map_orange_county(orange_county)
pima_county_dict = _map_pima_county(pima_county)

In [74]:
# Stack every source into one frame, labelling each row with its data source.
#
# The sources do not share one column set (e.g. "OName" is only mapped for
# Bernalillo and "DateScraped" only for Dignity Memorial), so after the concat
# every row holds NaN in at least one column. A bare dropna() (how="any")
# would therefore discard every row; only rows that are empty in every column
# are dropped here.
combined_df = combine(
    DataSource(dig_memorial, dig_memorial_dict, "Dignity Memorial"),
    DataSource(chicago_burials, chicago_burials_dict, "Chicago Burials"),
    DataSource(chicago_cremations, chicago_cremations_dict, "Chicago Cremations"),
    DataSource(king_county, king_county_dict, "King County"),
    DataSource(bernalillo, bernalillo_dict, "Bernalillo"),
    DataSource(dona_ana, dona_ana_dict, "Dona Ana"),
    DataSource(fresno, fresno_dict, "Fresno"),
    DataSource(hart_island, hart_island_dict, "Hart Island"),
    DataSource(la_county, la_county_dict, "LA County"),
    DataSource(namus, namus_dict, "National"),
    DataSource(oregon, oregon_dict, "Oregon"),
    DataSource(yakima_county, yakima_county_dict, "Yakima County"),
    DataSource(yellowstone, yellowstone_dict, "Yellowstone"),
    DataSource(oakland_mi, oakland_mi_dict, "Oakland MI"),
    DataSource(orange_county, orange_county_dict, "Orange County"),
    DataSource(pima_county, pima_county_dict, "Pima County"),
    ).dropna(how="all")

In [75]:
# Write the combined data set. The folder defaults to the original author's
# checkout; set INDIGENT_BURIALS_DATA_DIR to run the notebook elsewhere.
main_output_dir = os.environ.get(
    "INDIGENT_BURIALS_DATA_DIR",
    "/Users/celiahealy/Documents/GitHub/indigent-burials/python/data",
)
os.makedirs(main_output_dir, exist_ok=True)
combined_df.to_csv(os.path.join(main_output_dir, "indigent_burials_main.csv"))


In [76]:
# Keep a dated copy of the combined data set next to the main file.
today = date.today()
date_path = today.strftime("%Y_%m_%d")

# The data folder defaults to the original author's checkout; set
# INDIGENT_BURIALS_DATA_DIR to run the notebook elsewhere.
archive_dir = os.path.join(
    os.environ.get(
        "INDIGENT_BURIALS_DATA_DIR",
        "/Users/celiahealy/Documents/GitHub/indigent-burials/python/data",
    ),
    "archive",
)
os.makedirs(archive_dir, exist_ok=True)

# archive_path is the file-name prefix; the date and extension are appended.
archive_path = os.path.join(archive_dir, "indigent_burials_main_")
save_path = archive_path + date_path + ".csv"
combined_df.to_csv(save_path)
