#  This is a shared notebook for Project I - Group 3

In [1]:
#Dependencies
import os
import pandas as pd
import calendar

In [2]:
# Load every CSV found directly inside the Resources folder into one frame,
# keeping only the columns this analysis uses.
csvDir = os.path.join("Resources")
col_list = ["month_of_death","day_of_week_of_death","current_data_year",
                              "manner_of_death","358_cause_recode","sex","detail_age"]

# os.listdir returns names in arbitrary order; sorting makes the row order of
# masterDf reproducible from run to run.
files = sorted(os.listdir(csvDir))

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows every time.
csv_frames = []
for csv_name in files:
    if not csv_name.endswith(".csv"):
        continue
    csv_path = os.path.join(csvDir, csv_name)
    # Distinct name for the handle so it no longer shadows the loop variable.
    with open(csv_path) as csv_handle:
        csv_frames.append(pd.read_csv(csv_handle, usecols=col_list))

# Each file keeps its own row index (no ignore_index), as before.
# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder contains no CSV files.
masterDf = pd.concat(csv_frames, axis=0) if csv_frames else pd.DataFrame()

In [6]:
# Keep only the rows whose 358-cause recode lies in the range 385-398
# (both ends included), i.e. the codes this notebook treats as car-related.
cause_code = masterDf["358_cause_recode"]
is_car_related = cause_code.between(385, 398)
car_death_data = masterDf[is_car_related]
car_death_data.head()

Unnamed: 0,month_of_death,sex,detail_age,day_of_week_of_death,current_data_year,manner_of_death,358_cause_recode
207,1,M,32,7,2005,1.0,396
208,1,M,75,5,2005,1.0,387
220,1,M,68,7,2005,1.0,396
234,2,M,21,1,2005,1.0,396
235,2,M,24,1,2005,1.0,396


In [7]:
# Translate the numeric codes in car_death_data into readable labels.
# Each lookup table maps a code to the text that replaces it.

# Month number (1-12) -> month name.
month_dict = dict(enumerate(
    ["January", "February", "March", "April", "May", "June",
     "July", "August", "September", "October", "November", "December"],
    start=1))

# Day-of-week code -> day name; code 1 is Sunday and code 9 marks an unknown day.
day_of_week_dict = dict(enumerate(
    ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"],
    start=1))
day_of_week_dict[9] = "Unknown"

# Manner-of-death code (1-7) -> description.
# Missing values are not mapped here and stay NaN after the replace.
manner_of_death_dict = dict(enumerate(
    ["Accident", "Suicide", "Homicide", "Pending investigation",
     "Could not determine", "Self-Inflicted", "Natural"],
    start=1))

# 358-cause recode -> "<code>- <description>" label (labels start with a space).
cause_recode_dict = {
    385: " 385- Motor vehicle accidents",
    386: " 386- Pedestrian involved in collision with motor vehicle",
    387: " 387- Pedalcyclist involved in collision with motor vehicle",
    388: " 388- Motorcyclist involved in any accident except collision with railway train",
    389: " 389- Motor vehicle accident involving collision with railway train",
    390: " 390- Motorcyclist involved in collision with railway train",
    391: " 391- Other motor vehicle accident involving collision with railway train",
    392: " 392- Occupant of motor vehicle involved in collision with other (non- motorized) road vehicle, streetcar, animal or pedestrian",
    393: " 393- Occupant of car, pickup truck or van involved in collision with other motor vehicle",
    394: " 394- Occupant of heavy transport vehicle or bus involved in collision with other motor vehicle",
    395: " 395- Occupant of motor vehicle involved in non-collision accident",
    396: " 396- Occupant of special-use motor vehicle involved in any accident",
    397: " 397- Other and unspecified motor vehicle accidents",
    398: " 398- Streetcar accidents",
}

# Column name -> lookup table applied to that column.
labels_by_column = {
    "month_of_death": month_dict,
    "day_of_week_of_death": day_of_week_dict,
    "manner_of_death": manner_of_death_dict,
    "358_cause_recode": cause_recode_dict,
}

# replace() returns a new frame, so car_death_data itself is left untouched.
clean_df = car_death_data.replace(labels_by_column)

clean_df

Unnamed: 0,month_of_death,sex,detail_age,day_of_week_of_death,current_data_year,manner_of_death,358_cause_recode
207,January,M,32,Saturday,2005,Accident,396- Occupant of special-use motor vehicle in...
208,January,M,75,Thursday,2005,Accident,387- Pedalcyclist involved in collision with ...
220,January,M,68,Saturday,2005,Accident,396- Occupant of special-use motor vehicle in...
234,February,M,21,Sunday,2005,Accident,396- Occupant of special-use motor vehicle in...
235,February,M,24,Sunday,2005,Accident,396- Occupant of special-use motor vehicle in...
...,...,...,...,...,...,...,...
2717184,December,M,67,Friday,2015,Accident,397- Other and unspecified motor vehicle acci...
2717278,December,F,77,Thursday,2015,Accident,397- Other and unspecified motor vehicle acci...
2717674,December,M,73,Wednesday,2015,Accident,397- Other and unspecified motor vehicle acci...
2717998,December,M,70,Thursday,2015,Accident,395- Occupant of motor vehicle involved in no...


In [8]:
# Data-quality check: print the distinct values found in every column of
# clean_df so that unexpected or placeholder values stand out.

colNames = clean_df.columns.tolist()
for column_name in colNames:
    distinct_values = clean_df[column_name].unique()
    print(column_name, str(distinct_values), "----------", sep="\n")

month_of_death
['January' 'February' 'March' 'April' 'June' 'May' 'July' 'August'
 'September' 'October' 'November' 'December']
----------
sex
['M' 'F']
----------
detail_age
[ 32  75  68  21  24  25  44  49  40  11  14  64  57  18   9  54  31  42
  27  43  58  46  33  22  13  61  56  16  53  30  23  26  20  83  34  38
  85  15   1  47  65  72  52  17  28  12  19  48  45  50  66  81  41  62
  59  69  39  37  63  78  80  35  73  36  90 999  82  55  51   2  60  70
  29  91  79   7  84  76   8   5  77  89  74   4  10  88  86   6   3  71
  67  92 101  94  87  93  97  96 102 100  99  95  98 106 104 103 107]
----------
day_of_week_of_death
['Saturday' 'Thursday' 'Sunday' 'Tuesday' 'Wednesday' 'Monday' 'Friday'
 'Unknown']
----------
current_data_year
[2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015]
----------
manner_of_death
['Accident' 'Natural' 'Could not determine' 'Homicide' nan
 'Pending investigation' 'Suicide']
----------
358_cause_recode
[' 396- Occupant of special-use motor 

In [20]:
# Final clean-up: drop rows that cannot be used in the analysis.
#   - detail_age of 999 (placeholder, not a real age)
#   - manner_of_death other than "Accident" (this also drops missing values)
#   - day_of_week_of_death of "Unknown"
has_real_age = clean_df["detail_age"].ne(999)
is_accident = clean_df["manner_of_death"].eq("Accident")
has_known_day = clean_df["day_of_week_of_death"].ne("Unknown")
finalDf = clean_df[has_real_age & is_accident & has_known_day]

# Re-list the distinct values per column to confirm the filters took effect.
colNames = finalDf.columns.tolist()
for column_name in colNames:
    remaining_values = finalDf[column_name].unique()
    print(column_name, str(remaining_values), "----------", sep="\n")

month_of_death
['January' 'February' 'March' 'April' 'June' 'May' 'July' 'August'
 'September' 'October' 'November' 'December']
----------
sex
['M' 'F']
----------
detail_age
[ 32  75  68  21  24  25  44  49  40  11  14  64  57  18   9  54  31  42
  27  43  58  46  33  22  13  56  16  53  30  23  26  20  83  34  38  85
  15   1  47  65  72  52  17  28  12  19  48  45  50  66  81  41  62  59
  69  39  37  63  78  80  35  73  36  90  61  82  55  51   2  60  70  29
  79   7  84  76   8   5  77  89  74   4  10  88  86   6   3  71  67  92
  94  91  87  93  97  96 102 100  99  95  98 101 106 104 103]
----------
day_of_week_of_death
['Saturday' 'Thursday' 'Sunday' 'Tuesday' 'Wednesday' 'Monday' 'Friday']
----------
current_data_year
[2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015]
----------
manner_of_death
['Accident']
----------
358_cause_recode
[' 396- Occupant of special-use motor vehicle involved in any accident'
 ' 387- Pedalcyclist involved in collision with motor vehicle'
 ' 3

In [None]:
# Relabel the numeric months (1-12) in masterDf with their abbreviated names.
# The column is overwritten in place, so the conversion must tolerate being
# run twice: indexing calendar.month_abbr with an already-converted string
# raises TypeError. A lookup-table replace converts 1-12 and leaves every
# other value (including already-converted names) unchanged.
month_abbr_by_number = {number: calendar.month_abbr[number] for number in range(1, 13)}
masterDf['month_of_death'] = masterDf['month_of_death'].replace(month_abbr_by_number)
masterDf.head()

In [None]:
# The calendar module's weekday numbering does not match the data set's:
# calendar.day_name starts on Monday, whereas day_of_week_dict uses 1 = Sunday.
dayName = [calendar.day_name[weekday] for weekday in range(7)]
dayName

In [None]:
# Read every .json file found under csvDir (sub-folders included) and stack
# them into a single frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
json_frames = []
for root,dirs,files in os.walk(csvDir):
    for jsonFile in files:
        if not jsonFile.endswith(".json"):
            continue
        curJson = os.path.join(root,jsonFile)
        print(curJson)  # show which file is being loaded
        json_frames.append(pd.read_json(curJson))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# JSON file was found.
jsonDf = pd.concat(json_frames, axis=0) if json_frames else pd.DataFrame()
jsonDf.head(20)