In [18]:
from datetime import date
import pandas as pd
import numpy as np
import dataclasses
from typing import Dict, Optional
import more_itertools
import os

In [25]:
@dataclasses.dataclass(frozen=True)
class DataSource:
    """
    A dataset whose columns are renamed so it can be combined with other DataSources.

    Attributes:
        data: The frame holding one source's records.
        column_mapping: Maps New Column Name -> Old Column Name. A value of
            None marks a new column that this source cannot provide.
        name: Optional label; when set, ``remap`` writes it into a
            ``data_source`` column.
    """

    data: pd.DataFrame
    column_mapping: Dict[str, Optional[str]]  # maps New Column Name -> Old Column Name
    name: Optional[str] = None

    def remap(self) -> pd.DataFrame:
        """
        Rename and select a subset of data columns.

        Returns:
            A new frame holding one column per non-None mapping entry, in
            mapping order and labelled with the new names, plus a
            ``data_source`` column when ``name`` is set. ``self.data`` is
            left unmodified.

        Raises:
            KeyError: If a mapped old column is missing from ``self.data``.
            ValueError: If ``name`` is set and the remapped frame already has
                a ``data_source`` column.
        """

        # Values mapped to None mean that value isn't available in the given dataset
        available = {
            new: old for new, old in self.column_mapping.items() if old is not None
        }

        # Select by old name and relabel by position. Inverting the mapping into
        # an old->new rename dict would silently drop a target column whenever
        # two new names point at the same old column.
        df = self.data[list(available.values())].copy()
        df.columns = list(available.keys())

        if self.name is not None:
            if "data_source" in df.columns:
                raise ValueError("df already contains data source column.")

            df["data_source"] = self.name

        return df
    
def combine(*data_sources, require_matched_columns: bool = False) -> pd.DataFrame:
    """
    Concatenate DataSources into a single df.

    Args:
        *data_sources: Objects exposing ``column_mapping``, ``name`` and
            ``remap()`` (i.e. DataSource instances).
        require_matched_columns: When True, every source must declare the same
            set of new column names in its ``column_mapping``.

    Returns:
        The remapped frames stacked row-wise with a fresh integer index. If at
        least one source has a name, ``data_source`` is cast to ``category``.

    Raises:
        ValueError: If ``require_matched_columns`` is True and the sources'
            new-column sets differ. ``pd.concat`` also raises ValueError when
            no sources are given.
    """

    if require_matched_columns:
        # Compare as sets: key order does not matter, only which names are listed.
        column_sets = [set(s.column_mapping) for s in data_sources]
        if any(names != column_sets[0] for names in column_sets[1:]):
            raise ValueError("All DataSources must list the same set of new column names.")

    processed = [data_source.remap() for data_source in data_sources]
    result = pd.concat(processed, ignore_index=True)

    # remap() adds data_source whenever name is not None, so use the same
    # condition here rather than truthiness.
    if any(s.name is not None for s in data_sources):
        result["data_source"] = result["data_source"].astype("category")

    return result

In [3]:
# Sibling "web-scraping" directory of the current working directory. The
# trailing "" keeps a path separator at the end, because later cells build
# sub-paths by plain string concatenation.
path = os.path.join(os.path.dirname(os.getcwd()), "web-scraping", "")
print(path)

# Skip the macOS ".DS_Store" entry by filtering: list.remove() would raise
# ValueError on machines where that file does not exist.
jurisdictions = [entry for entry in os.listdir(path) if entry != ".DS_Store"]
print(jurisdictions)

/Users/celiahealy/Documents/GitHub/indigent-burials/python/web-scraping/
['oregon-and-dignity-memorial', 'fresno-scraping', 'dona-ana-scraping', 'bernalillo-county', 'chicago-cookcounty', 'yakima-county', 'KingCounty', 'hart-island-web-scraper', 'la-county', 'namus']


In [4]:
# For each jurisdiction folder, print the entries of its data/ directory whose
# name contains ".csv".
for jurisdiction in jurisdictions:
    sub_path = f"{path}{jurisdiction}/data/"
    files = os.listdir(sub_path)
    check = list(filter(lambda filename: ".csv" in filename, files))
    print(check)

    

['dignity-memorial.csv', 'oregon.csv']
['fresno-cleaned3.csv', 'fresno-O-P.csv', 'fresno-G-H.csv', 'fresno-S-T.csv', 'fresno-Y-Z.csv', 'fresno-Q-R.csv', 'fresno-cleaned3-v2.csv', 'fresno-U-V.csv', 'fresno-W-X.csv', 'fresno-I-J.csv', 'fresno-M-N.csv', 'fresno-A-B.csv', 'fresno-E-F.csv', 'fresno.csv', 'fresno-C-D.csv', 'fresno-K-L.csv']
['dona-ana-cleaned-v2.csv', 'dona-ana-cleaned.csv', 'dona-ana.csv']
['bernalillo county_unclaimed.csv', 'bernalillo county_indigent_unfilters.csv']
['Medical_Examiner_-_Burial_Locations.csv', 'Medical_Examiner_Indigent_Cremations.csv']
['Unclaimed Remains_Yakima County.csv']
['KingCounty Seattle Burial Names  - Sheet1.csv']
[]
['LA Cremation_Merged Files.csv', 'LA Cremation Data Enhanced.csv']
['unclaimed_NY_states.csv', 'unclaimed_states_combined.csv', 'unclaimed_states.csv', 'unclaimed_states_combined_cleaned.csv']


In [26]:
# Load the Dignity Memorial extract here so the preview does not depend on a
# `df` left over from an earlier kernel session.
df = pd.read_csv("https://raw.githubusercontent.com/IndigentBurials/indigent-burials/main/python/web-scraping/oregon-and-dignity-memorial/data/dignity-memorial.csv")

df.head()

Unnamed: 0,Last Name,First Name,Middle Name,Department Of Defense,DOB,DOD,Jurisdicition,Source URL,Date Scraped
0,Shegog,Will,Melvin,U.S. Air Force,3/24/1960,2/15/2019,Dignity Memorial,https://www.dignitymemorial.com/plan-funeral-c...,6/28/2022
1,Lundy,Ronnie,Joe,U.S. Army,2/4/1955,1/18/2019,Dignity Memorial,https://www.dignitymemorial.com/plan-funeral-c...,6/28/2022
2,Williams,Joseph,Lorenzo,U.S. Army,4/7/1960,4/1/2019,Dignity Memorial,https://www.dignitymemorial.com/plan-funeral-c...,6/28/2022
3,Ballantyne,Danny,Rollin,U.S. Marine Corps,11/15/1944,4/30/2019,Dignity Memorial,https://www.dignitymemorial.com/plan-funeral-c...,6/28/2022
4,Babcock,George,Charles,U.S. Army,2/17/1940,6/4/2019,Dignity Memorial,https://www.dignitymemorial.com/plan-funeral-c...,6/28/2022


In [204]:
### Read in each dataframe and normalize column names ###
RAW_DATA_URL = "https://raw.githubusercontent.com/IndigentBurials/indigent-burials/main/python/web-scraping/"


def _read_source(relative_path: str) -> pd.DataFrame:
    """
    Read one CSV located under RAW_DATA_URL and strip whitespace from its column names.

    Args:
        relative_path: Path of the CSV below RAW_DATA_URL, already URL-encoded
            (spaces written as %20).

    Returns:
        The parsed frame with leading/trailing whitespace removed from every
        column label.
    """
    return pd.read_csv(RAW_DATA_URL + relative_path).rename(columns=lambda x: x.strip())


dig_memorial = _read_source("oregon-and-dignity-memorial/data/dignity-memorial.csv")
chicago_burials = _read_source("chicago-cookcounty/data/Medical_Examiner_-_Burial_Locations.csv")
chicago_cremations = _read_source("chicago-cookcounty/data/Medical_Examiner_Indigent_Cremations.csv")
king_county = _read_source("KingCounty/data/KingCounty%20Seattle%20Burial%20Names%20%20-%20Sheet1.csv")
bernalillo = _read_source("bernalillo-county/data/bernalillo%20county_indigent_unfilters.csv")
dona_ana = _read_source("dona-ana-scraping/data/dona-ana-cleaned-v2.csv")
fresno = _read_source("fresno-scraping/data/fresno-cleaned3-v2.csv")
hart_island = _read_source("hart-island-web-scraper/data/output/hart-island.csv")
la_county = _read_source("la-county/data/LA%20Cremation%20Data%20Enhanced.csv")
namus = _read_source("namus/data/unclaimed_states_combined_cleaned.csv")
oregon = _read_source("oregon-and-dignity-memorial/data/oregon.csv")
yakima_county = _read_source("yakima-county/data/Unclaimed%20Remains_Yakima%20County.csv")

In [197]:
# Preview the first five rows of the Yakima County frame.
yakima_county.head(n=5)

Unnamed: 0,Last Name,First Name,Middle Name,Name (other),ID*,DOB:,DOD:,Location:,Unnamed: 8,Source:
0,,,,,,,,,,https://dhs.lacounty.gov/home-public-resources...
1,NICHOLS,Kathy,,,,11/2/47,1/28/22,Coroner's Office,,
2,ESTRADA,Arturo,,,,1/4/57,1/2/22,Coroner's Office,,Date Accessed: 6/23/22
3,ROMERO,HERNANDEZ,Victor,,,10/30/87,1/3/22,Coroner's Office,,
4,WOOD,Darlene,,,,6/24/49,1/21/21,Coroner's Office,,


In [207]:
def _map_yakima_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Washington"
    df["Jurisdiction"] = "Yakima County"
    df["SourceURL"] = "https://dhs.lacounty.gov/home-public-resources-locate-deceased-persons/"
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB:",
        "DOD": "DOD:",
        "State": "State",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

In [191]:
def _map_oregon(df: pd.DataFrame):
    df["LastModified"] = date.today()
    df["State"] = "Oregon"
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "BD": "Buried",
        "State": "State",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdicition",
        "LastModified": "LastModified",
    }
    return df_dict

In [185]:
def _map_namus(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "DOD": "DBF",
        "RaceEthnicity": "Race/Ethnicity",
        "Sex": "Sex",
        "City": "City",
        "County": "County",
        "State": "State",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdicition",
        "LastModified": "LastModified",
    }
    return df_dict

In [174]:
def _map_la_county(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last.Ne",
        "FName": "First.Ne",
        "MName": "Middle.Ne",
        "BD": "Date.of.Cremation..mm.dd.yyyy.",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

In [165]:
def _map_hart_island(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LName",
        "FName": "FName",
        "DOD": "DOD",
        "Age": "Age",
        "Sex": "Sex",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

In [156]:
def _map_fresno(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Age": "Age",
        "Sex": "Sex",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

In [150]:
def _map_dona_ana(df: pd.DataFrame):
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Age": "Age",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

In [139]:
def _map_bernalillo(df: pd.DataFrame):
    df["Jurisdiction"] = "Bernalill0"
    df["LastModified"] = date.today()
    df["isVeteran"] = df["Veteran"].replace({"Yes": 1, "No": 0})
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "OName": "Maiden Name",
        "DOB": "Date of Birth",
        "DOD": "Date of Death",
        "BD": "Cremation Date",
        "isVeteran": "isVeteran",
        "MilitaryAffiliation": "Military Branch",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    return df_dict

In [140]:
def _map_king_county(df: pd.DataFrame):
    df["Jurisdiction"] = "King County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "BD": "Year of Burial",
        "SourceURL": "Source URL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
    
    return df_dict

In [141]:
def _map_chicago_cremations(df: pd.DataFrame):
    df["Jurisdiction"] = "Chicago, Cook County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LastName",
        "FName": "Name",
        "MName": "MiddleName",
        "Age": "Age",
        "Sex": "Sex",
        "RaceEthnicity": "Race",
        "BD": "Cremation Date",
        "DOD": "Date of Death",
        "SourceURL": "SourceURL",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
        
    return df_dict
        
    

In [142]:
def _map_chicago_burials(df: pd.DataFrame):
    df["Jurisdiction"] = "Chicago, Cook County"
    df["LastModified"] = date.today()
    
    df_dict = {
        "LName": "LastName",
        "FName": "FirstName",
        "MName": "MiddleName",
        "Age": "Age",
        "Sex": "Sex",
        "RaceEthnicity": "Race",
        "BD": "Burial Date",
        "DOD": "Date of Death",
        "Jurisdiction": "Jurisdiction",
        "LastModified": "LastModified",
    }
        
    return df_dict
        
    

In [143]:
def _map_dignity_memorial(df: pd.DataFrame):
    df["isVeteran"] = 1
    df["LastModified"] = date.today()
    df["DOB"] = pd.to_datetime(df["DOB"], utc=True, errors='coerce').dt.date
        
    df_dict = {
        "LName": "Last Name",
        "FName": "First Name",
        "MName": "Middle Name",
        "DOB": "DOB",
        "DOD": "DOD",
        "Jurisdiction": "Jurisdicition",
        "SourceURL": "Source URL",
        "DateScraped": "Date Scraped",
        "LastModified": "LastModified",
        "isVeteran": "isVeteran",
        "MilitaryAffiliation": "Department Of Defense"
    }
    
    return df_dict

In [208]:
# Run every per-source mapper once. Each call stamps its frame in place and
# returns that frame's New Column Name -> Old Column Name dict; the calls are
# evaluated top to bottom.
(
    dig_memorial_dict,
    chicago_burials_dict,
    chicago_cremations_dict,
    king_county_dict,
    bernalillo_dict,
    dona_ana_dict,
    fresno_dict,
    hart_island_dict,
    la_county_dict,
    namus_dict,
    oregon_dict,
    yakima_county_dict,
) = (
    _map_dignity_memorial(dig_memorial),
    _map_chicago_burials(chicago_burials),
    _map_chicago_cremations(chicago_cremations),
    _map_king_county(king_county),
    _map_bernalillo(bernalillo),
    _map_dona_ana(dona_ana),
    _map_fresno(fresno),
    _map_hart_island(hart_island),
    _map_la_county(la_county),
    _map_namus(namus),
    _map_oregon(oregon),
    _map_yakima_county(yakima_county),
)

In [209]:
# One (frame, mapping, label) triple per source, in the order their rows are
# stacked in the combined frame.
source_specs = [
    (dig_memorial, dig_memorial_dict, "Dignity Memorial"),
    (chicago_burials, chicago_burials_dict, "Chicago Burials"),
    (chicago_cremations, chicago_cremations_dict, "Chicago Cremations"),
    (king_county, king_county_dict, "King County"),
    (bernalillo, bernalillo_dict, "Bernalillo"),
    (dona_ana, dona_ana_dict, "Dona Ana"),
    (fresno, fresno_dict, "Fresno"),
    (hart_island, hart_island_dict, "Hart Island"),
    (la_county, la_county_dict, "LA County"),
    (namus, namus_dict, "National"),
    (oregon, oregon_dict, "Oregon"),
    (yakima_county, yakima_county_dict, "Yakima County"),
]

combined_df = combine(
    *(DataSource(frame, mapping, label) for frame, mapping, label in source_specs)
)

In [210]:
# Show the last five rows of the combined frame.
combined_df.tail(n=5)

Unnamed: 0,LName,FName,MName,DOB,DOD,Jurisdiction,SourceURL,DateScraped,LastModified,isVeteran,MilitaryAffiliation,data_source,Age,Sex,RaceEthnicity,BD,OName,City,County,State
89895,HARLAN,Alice,,12/25/1864,7/10/35,Yakima County,https://dhs.lacounty.gov/home-public-resources...,,2022-10-28,,,Yakima County,,,,,,,,Washington
89896,THEIS,Anna,,1867,12/5/34,Yakima County,https://dhs.lacounty.gov/home-public-resources...,,2022-10-28,,,Yakima County,,,,,,,,Washington
89897,CAMPBELL,James,,8/27/11,1/25/25,Yakima County,https://dhs.lacounty.gov/home-public-resources...,,2022-10-28,,,Yakima County,,,,,,,,Washington
89898,NAUDEWATER,AC,,2/21/21,,Yakima County,https://dhs.lacounty.gov/home-public-resources...,,2022-10-28,,,Yakima County,,,,,,,,Washington
89899,BURIANEK,,,6/8/17,,Yakima County,https://dhs.lacounty.gov/home-public-resources...,,2022-10-28,,,Yakima County,,,,,,,,Washington


In [None]:
# Draft of a full-schema mapping for `df`. The original cell did not parse
# (stray indentation, missing commas, keys without values); it is made
# syntactically valid here without guessing at any source column names.
#
# NOTE(review): entries set to None had no value in the draft. None marks a
# field with no source column (the convention remap() uses to skip unavailable
# values) — replace with the real column name where one exists.
# NOTE(review): "Jurisdiction", "SourceURL", "DateScraped" and
# "Department of Defense" differ from the headers shown in the Dignity
# Memorial preview ("Jurisdicition", "Source URL", "Date Scraped",
# "Department Of Defense") — confirm before using this mapping on that frame.

# Placeholder columns for fields this frame does not provide.
df["isVeteran"] = 1
df["DeathYear"] = ""
df["OName"] = ""
df["Sex"] = ""
df["RaceEthnicity"] = ""
df["DBF"] = ""
df["Mortuary"] = ""
df["FuneralDirector"] = ""
df["CaseNo"] = ""
df["PD"] = ""
df["BD"] = ""
df["BurialYear"] = ""
df["CremationNo"] = ""
df["GraveSection"] = ""
df["GraveLot"] = ""
df["GraveNo"] = ""
df["Age"] = ""

df_mapping = {
    "LName": "Last Name",
    "FName": "First Name",
    "MName": "Middle Name",
    "OName": "OName",
    "Sex": "Sex",
    "RaceEthnicity": "RaceEthnicity",
    "DOB": "DOB",
    "DOD": "DOD",
    "DeathYear": None,
    "DBF": None,
    "Mortuary": None,
    "FuneralDirector": None,
    "CaseNo": None,
    "PD": None,
    "BD": None,
    "BurialYear": None,
    "CremationNo": None,
    "GraveSection": None,
    "GraveLot": None,
    "GraveNo": None,
    "Age": None,
    "Jurisdiction": "Jurisdiction",
    "State": None,
    "County": None,
    "CountyNo": None,
    "City": None,
    "Location": None,
    "CityOfBirth": None,
    "StateOfBirth": None,
    "SourceURL": "SourceURL",
    "DateScraped": "DateScraped",
    "LastModified": None,
    "Misc": None,
    "isVeteran": "isVeteran",
    "MilitaryAffiliation": "Department of Defense",
}