In [4]:
from datetime import date
import numpy as np
import pandas as pd
import dataclasses
from typing import Dict, Optional
import more_itertools
import os


In [5]:
@dataclasses.dataclass(frozen=True)
class DataSource:
    """
    A dataset whose columns are renamed so it can be combined with other DataSources.

    Attributes:
        data: The source frame. ``remap`` works on a copy and leaves it untouched.
        column_mapping: Maps new column name -> old column name in ``data``.
            A value of ``None`` means the dataset has no equivalent column.
        name: Optional label written to a ``data_source`` column by ``remap``.
    """

    data: pd.DataFrame
    column_mapping: Dict[str, Optional[str]]  # maps New Column Name -> Old Column Name (None = not available)
    name: Optional[str] = None

    def remap(self) -> pd.DataFrame:
        """
        Rename and select a subset of data columns.

        Returns:
            A new frame holding only the mapped columns, in mapping order and
            under their new names, plus a ``data_source`` column when ``name``
            is set.

        Raises:
            KeyError: A mapped old column name is not present in ``data``.
            ValueError: ``name`` is set but the renamed frame already has a
                ``data_source`` column.
        """

        # Values mapped to None mean that value isn't available in the given dataset
        rename_dict = {old: new for new, old in self.column_mapping.items() if old is not None}

        # Fail with the offending names instead of pandas' generic indexing error.
        missing = [old for old in rename_dict if old not in self.data.columns]
        if missing:
            raise KeyError(f"DataSource {self.name!r} is missing source columns: {missing}")

        # Index with a real list: a dict_keys view is not a documented indexer type.
        df = self.data[list(rename_dict)].rename(columns=rename_dict)

        if self.name is not None:
            if "data_source" in df.columns:
                raise ValueError("df already contains data source column.")

            df = df.assign(data_source=self.name)

        return df
    
def combine(*data_sources, require_matched_columns: bool = False) -> pd.DataFrame:
    """
    Concatenate DataSources into a single df.

    Args:
        *data_sources: Objects exposing ``column_mapping``, ``name`` and
            ``remap()`` (i.e. ``DataSource`` instances).
        require_matched_columns: When True, every source must list the same
            set of new column names in its ``column_mapping``.

    Returns:
        The remapped frames stacked row-wise with a fresh integer index. If any
        source has a name, the ``data_source`` column is stored as a category.

    Raises:
        ValueError: No sources were given, or ``require_matched_columns`` is
            True and the sources' new column names differ.
    """
    if not data_sources:
        raise ValueError("At least one DataSource is required.")

    column_names = [s.column_mapping.keys() for s in data_sources]
    if require_matched_columns and not more_itertools.all_equal(column_names):
        raise ValueError("All DataSources must list the same set of new column names.")

    processed = [data_source.remap() for data_source in data_sources]
    result = pd.concat(processed, ignore_index=True)

    # remap() adds "data_source" whenever name is not None, so use the same
    # test here; a truthiness check would skip sources named "".
    if any(s.name is not None for s in data_sources):
        result["data_source"] = result["data_source"].astype("category")

    return result

In [30]:
# All source CSVs live under this folder of the indigent-burials repository.
WEB_SCRAPING_BASE_URL = "https://raw.githubusercontent.com/IndigentBurials/indigent-burials/main/python/web-scraping/"


def _read_source_csv(relative_path: str) -> pd.DataFrame:
    """
    Read one source CSV from the repository and strip whitespace from its column names.

    Args:
        relative_path: Path of the CSV below ``WEB_SCRAPING_BASE_URL``
            (already URL-encoded, e.g. spaces as ``%20``).

    Returns:
        The parsed frame with leading/trailing whitespace removed from every
        column label.
    """
    return pd.read_csv(WEB_SCRAPING_BASE_URL + relative_path).rename(columns=lambda x: x.strip())


dig_memorial = _read_source_csv("oregon-and-dignity-memorial/data/dignity-memorial.csv")
chicago_burials = _read_source_csv("chicago-cookcounty/data/Medical_Examiner_-_Burial_Locations.csv")
chicago_cremations = _read_source_csv("chicago-cookcounty/data/Medical_Examiner_Indigent_Cremations.csv")
king_county = _read_source_csv("KingCounty/data/KingCounty%20Seattle%20Burial%20Names%20%20-%20Sheet1.csv")
bernalillo = _read_source_csv("bernalillo-county/data/bernalillo%20county_indigent_unfilters.csv")
dona_ana = _read_source_csv("dona-ana-scraping/data/dona-ana-cleaned-v2.csv")
fresno = _read_source_csv("fresno-scraping/data/fresno-cleaned3-v2.csv")
hart_island = _read_source_csv("hart-island-web-scraper/data/output/hart-island.csv")
la_county = _read_source_csv("la-county/data/LA%20Cremation%20Data%20Enhanced.csv")
namus = _read_source_csv("namus/data/unclaimed_states_combined_cleaned.csv")
oregon = _read_source_csv("oregon-and-dignity-memorial/data/oregon.csv")
yakima_county = _read_source_csv("yakima-county/data/Unclaimed%20Remains_Yakima%20County.csv")
yellowstone = _read_source_csv("yellowstone_mt/Riverside%20Indigent%20Burials%202021.csv")

In [31]:
# Display the loaded Yellowstone frame (rich repr) to inspect its column names.
yellowstone

Unnamed: 0,LNAME,FNAME,MNAME,DOB,DOD,DOI,CEMETERY,CSITEID,CAUSEDEATH,MEMO,...,DOIYR,DOIMON,DOIDAY,PLBIRTH,MSTATUS,TIMEINT,CREMCODE,DISINTERCODE,DEATH_CERT_NUM,LastModifiedDate
0,Buirge,Barbara,W,5/2/04,5/3/94,5/23/18,RIV,RIV_7_L_17,,,...,2018.0,5.0,23.0,"Princeton, MO",,10:00:AM,x,,,
1,Howell,Charles,Olan,8/13/46,11/16/11,5/23/18,RIV,RIV_7_M_17,,,...,2018.0,5.0,23.0,Baltimore ME,,10:00 AM,x,,,
2,Hughes,Mary,Jane,1/4/27,2/13/82,5/23/18,RIV,RIV_7_A_16,,,...,2018.0,5.0,23.0,"Crawford, NE",,10:00 AM,,,,
3,Linville,Miriam,E,8/30/07,7/23/81,5/23/18,RIV,RIV_7_B_16,,,...,2018.0,5.0,23.0,"Boone, IA",,10:00 AM,x,,,
4,Milliken,Gail,,2/5/19,12/23/87,5/23/18,RIV,RIV_7_C_16,,,...,2018.0,5.0,23.0,,,10:00 AM,x,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,Muniyan,Mark,Anthony,2/9/78,11/27/19,5/19/21,RIV,RIV_7_K_12,Suicide,,...,2021.0,5.0,19.0,Alaska,Single,,,,1114879.0,11:39.0
121,Mirko,Joan,E,2/3/43,1/31/21,5/19/21,RIV,RIV_7_J_12,Natural causes,,...,2021.0,5.0,19.0,"Long Beach, CA",Divorced,,,,1230903.0,12:13.0
122,Arcena,Thomas,,8/18/48,8/3/20,5/19/21,RIV,RIV_7_H_12,Natural,,...,2021.0,5.0,19.0,"San Diego, CA",Divorced,,,,1176513.0,17:36.0
123,Price,Darcy,Lea,4/22/20,7/31/60,5/19/21,RIV,RIV_7_G_12,Natural,,...,2021.0,5.0,19.0,"Virginia, MN",Married,,,,1186240.0,18:38.0


In [8]:
def _map_yellowstone(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Montana"
    df["Jurisdiction"] = "Riverside County"
    
    df_dict = {
        "Lname": "LNAME",
        "Fname": "FNAME",
        "MName": "MNAME",
        "DOB": "DOB",
        "DOD": "DOD",
        "DB": "DOI",
        "Cemet"
    }

def _map_yakima_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Washington"
    df["Jurisdiction"] = "Yakima County"
    df["SourceURL"] = "https://dhs.lacounty.gov/home-public-resources-locate-deceased-persons/"
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB:",
        "DOD": "DOD:",
        "State": "State",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_oregon(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Oregon"
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "BD": "Buried",
        "State": "State",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdicition",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_namus(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "DOD": "DBF",
        "RaceEthnicity": "Race/Ethnicity",
        "Sex": "Sex",
        "City": "City",
        "County": "County",
        "State": "State",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdicition",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_la_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last.Ne",
        "FName": "First.Ne",
        "MName": "Middle.Ne",
        "BD": "Date.of.Cremation..mm.dd.yyyy.",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_hart_island(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LName",
        "FName": "FName",
        "DOD": "DOD",
        "Age": "Age",
        "Sex": "Sex",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_fresno(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Age": "Age",
        "Sex": "Sex",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_dona_ana(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Age": "Age",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_bernalillo(df: pd.DataFrame):
    df["Jurisdiction"] = "Bernalill0"
    df["LastModified"] = date.today()
    df["isVeteran"] = df["Veteran"].replace({"Yes": 1, "No": 0})
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "OName": "Maiden Name",
        "DOB": "Date of Birth",
        "DOD": "Date of Death",
        "BD": "Cremation Date",
        "isVeteran": "isVeteran",
        "MilitaryAffiliation": "Military Branch",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

def _map_king_county(df: pd.DataFrame):
    df["Jurisdiction"] = "King County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "BD": "Year of Burial",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    
    return df_dict

def _map_chicago_cremations(df: pd.DataFrame):
    df["Jurisdiction"] = "Chicago, Cook County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LastName",
        "FName": "Name",
        "MName": "MiddleName",
        "Age": "Age",
        "Sex": "Sex",
        "RaceEthnicity": "Race",
        "BD": "Cremation Date",
        "DOD": "Date of Death",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
        
    return df_dict

def _map_dignity_memorial(df: pd.DataFrame):
    df["isVeteran"] = 1
    df["LastModified"] = date.today()
    df["DOB"] = pd.to_datetime(df["DOB"], utc=True, errors='coerce').dt.date
        
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Jurisdiction": "Jurisdicition",
        "SourceURL": "Source URL",
        "DateScraped": "Date Scraped",
        "LastModified": "LastModified",
        "isVeteran": "isVeteran",
        "MilitaryAffiliation": "Department Of Defense"
    }
    
    return df_dict

def _map_chicago_burials(df: pd.DataFrame):
    df["Jurisdiction"] = "Chicago, Cook County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LastName",
        "FName": "FirstName",
        "MName": "MiddleName",
        "Age": "Age",
        "Sex": "Sex",
        "RaceEthnicity": "Race",
        "BD": "Burial Date",
        "DOD": "Date of Death",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
        
    return df_dict
            
def _map_yakima_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Washington"
    df["Jurisdiction"] = "Yakima County"
    df["SourceURL"] = "https://dhs.lacounty.gov/home-public-resources-locate-deceased-persons/"
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB:",
        "DOD": "DOD:",
        "State": "State",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict


In [10]:
# Build the new-name -> old-name column mapping for each source.
# Each _map_* call also writes its constant/derived columns (e.g. LastModified)
# onto the frame passed in, so the frames themselves are modified by this cell.
# NOTE(review): `yellowstone` is loaded above but has no mapping call here.
dig_memorial_dict = _map_dignity_memorial(dig_memorial)
chicago_burials_dict = _map_chicago_burials(chicago_burials)
chicago_cremations_dict =  _map_chicago_cremations(chicago_cremations)
king_county_dict = _map_king_county(king_county)
bernalillo_dict = _map_bernalillo(bernalillo)
dona_ana_dict = _map_dona_ana(dona_ana)
fresno_dict = _map_fresno(fresno)
hart_island_dict = _map_hart_island(hart_island)
la_county_dict = _map_la_county(la_county)
namus_dict = _map_namus(namus)
oregon_dict = _map_oregon(oregon)
yakima_county_dict = _map_yakima_county(yakima_county)

In [11]:
# Wrap every (frame, mapping, label) triple in a DataSource and stack them all
# into one frame; the label ends up in the "data_source" column.
combined_df = combine(*(
    DataSource(frame, mapping, label)
    for frame, mapping, label in (
        (dig_memorial, dig_memorial_dict, "Dignity Memorial"),
        (chicago_burials, chicago_burials_dict, "Chicago Burials"),
        (chicago_cremations, chicago_cremations_dict, "Chicago Cremations"),
        (king_county, king_county_dict, "King County"),
        (bernalillo, bernalillo_dict, "Bernalillo"),
        (dona_ana, dona_ana_dict, "Dona Ana"),
        (fresno, fresno_dict, "Fresno"),
        (hart_island, hart_island_dict, "Hart Island"),
        (la_county, la_county_dict, "LA County"),
        (namus, namus_dict, "National"),
        (oregon, oregon_dict, "Oregon"),
        (yakima_county, yakima_county_dict, "Yakima County"),
    )
))

In [23]:
# Write the combined dataset. The target directory can be redirected by setting
# INDIGENT_BURIALS_DATA_DIR; when unset, the original local path is used.
data_dir = os.environ.get(
    "INDIGENT_BURIALS_DATA_DIR",
    "/Users/celiahealy/Documents/GitHub/indigent-burials/python/data",
)
combined_df.to_csv(os.path.join(data_dir, "indigent_burials_main.csv"))


TypeError: can only concatenate str (not "datetime.date") to str

In [29]:
# Write a dated archive copy of the combined dataset, e.g.
# indigent_burials_main_2021_06_30.csv, into the "archive" folder.
today = date.today()
date_path = today.strftime("%Y_%m_%d")
# The data directory can be redirected by setting INDIGENT_BURIALS_DATA_DIR;
# when unset, the original local path is used.
archive_dir = os.path.join(
    os.environ.get(
        "INDIGENT_BURIALS_DATA_DIR",
        "/Users/celiahealy/Documents/GitHub/indigent-burials/python/data",
    ),
    "archive",
)
archive_path = os.path.join(archive_dir, "indigent_burials_main_")
save_path = archive_path + date_path + ".csv"
combined_df.to_csv(save_path)
