In [1]:
import json
import string
import requests
from requests.exceptions import Timeout
from pprint import pprint
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from scipy.stats import linregress
import csv 
import io


In [2]:
######get datasets to explore 
#beginning year
dataset = ['ACCIDENT','DISTRACT','PERSON','VIOLATION']
baseurl = 'https://crashviewer.nhtsa.dot.gov/CrashAPI/FARSData/GetFARSData'

for x in dataset:
    yearloop = 2010
    params = {'states':'1,51',
            'FromYear':yearloop,
            'ToYear':yearloop,
            'dataset':x,
            'format':'csv'}
    for i in range (12):
        #wait up to 70 seconds for responding data
        try:
            response = requests.get(baseurl,params=params,timeout=(10,70))
        except Timeout:
            print('The request timed out')

        if response.status_code !=200:
            print('There is error')
        else:
            #get response as CSV
            response.encoding = response.apparent_encoding
            #get data in string
            content = response.text
            print(yearloop)
            #convert string to pandas dataframe
            temp_data = pd.read_csv(io.StringIO(content))
            if i ==0:
                #response_json = response.iter_lines()
                #save data to fatality_data if 1st data
                fatality_data = pd.read_csv(io.StringIO(content))           
            else:
                #response_json = response.iter_lines()
                #append 2nd data point on ward to fatality_data
                temp_data = pd.read_csv(io.StringIO(content))
                fatality_data = pd.concat([fatality_data,temp_data],ignore_index=True) 
        #update parameter for next year
        yearloop += 1
        params = {'states':'1,51',
            'FromYear':yearloop,
            'ToYear':yearloop,
            'dataset':x,
            'format':'csv'}

    #export final data to csv
    fatality_data.to_csv(f'data/{x}.csv',index=False)

2010
2011
2012
2013


  exec(code_obj, self.user_global_ns, self.user_ns)


2014
2015


  exec(code_obj, self.user_global_ns, self.user_ns)


2016
2017
2018
2019
2020
2021
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [3]:
###### trim off data to reduce the file as the raw data is huge (>1GB)
df = pd.read_csv('data/ACCIDENT.csv')
df =df[['caseyear','st_case','statename','latitude','longitud','fatals','lgt_condname','drunk_dr','fatals','day_weekname','day_week']]
df.to_csv('data/ACCIDENT.csv',index=False)
df.head()


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,caseyear,st_case,statename,latitude,longitud,fatals,lgt_condname,drunk_dr,fatals.1,day_weekname,day_week
0,2010,10001,Alabama,32.641064,-85.354692,1,Dark - Lighted,1,1,Friday,6
1,2010,10002,Alabama,31.430447,-86.956694,1,Dark - Lighted,0,1,Monday,2
2,2010,10003,Alabama,30.691631,-88.085778,1,Daylight,0,1,Thursday,5
3,2010,10004,Alabama,33.8687,-86.291164,1,Dark - Lighted,0,1,Thursday,5
4,2010,10005,Alabama,33.309742,-86.787222,1,Dawn,0,1,Monday,2


In [4]:
#trim off data
df=pd.read_csv('data/DISTRACT.csv')
df=df[['caseyear','st_case','statename','drdistract','drdistractname']]
df.loc[df['drdistract']==99,'distract_code']=0.2    #code for unknow
df.loc[df['drdistract']==96,'distract_code']=0.5    #code for not reported
df.loc[df['drdistract']==16,'distract_code']=0.7    #code for no driver
df['distract_code'].fillna(df['drdistract'],inplace=True)
#remove duplicate values
df=df.sort_values('distract_code',ascending=False)
df=df.drop_duplicates(subset=['caseyear','st_case'])
df.to_csv('data/DISTRACT.csv',index=False)
df.head()


Unnamed: 0,caseyear,st_case,statename,drdistract,drdistractname,distract_code
294960,2016,130898,Georgia,98,Other Distraction,98.0
303258,2016,240453,Maryland,98,Other Distraction,98.0
58716,2011,130597,Georgia,98,Other Distraction,98.0
308208,2016,300035,Montana,98,Other Distraction,98.0
39262,2010,480374,Texas,98,Other Distraction,98.0


In [5]:
#trim off data
df=pd.read_csv('data/VIOLATION.csv')
#fill NA value
df['violation'].fillna(df['mviolatn'],inplace=True)
df['violationname'].fillna(df['mviolatnname'],inplace=True)
df=df[['caseyear','st_case','statename','violation','violationname']]
df.loc[df['violation']==99,'violation_code']=0.2    #code for unknow
df.loc[df['violation']==97,'violation_code']=0.5    #code for not reported
df.loc[df['violation']==95,'violation_code']=0.7    #code for no driver
df['violation_code'].fillna(df['violation'],inplace=True)
#remove duplicate values
df=df.sort_values('violation_code',ascending=False)
df=df.drop_duplicates(subset=['caseyear','st_case'])
df.to_csv('data/VIOLATION.csv',index=False)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,caseyear,st_case,statename,violation,violationname,violation_code
261832,2015,170914,Illinois,98.0,"Other moving violation (coasting, backing, ope...",98.0
162003,2013,170428,Illinois,98.0,"Other moving violation (coasting, backing, ope...",98.0
429972,2018,170630,Illinois,98.0,"Other moving violation (coasting, backing, ope...",98.0
315308,2016,170055,Illinois,98.0,"Other moving violation (coasting, backing, ope...",98.0
337294,2016,400588,Oklahoma,98.0,"Other moving violation (coasting, backing, ope...",98.0


In [7]:
#trim off data
df=pd.read_csv('data/PERSON.csv')
df=df[['caseyear','st_case','age','sexname','per_typ','per_typname','inj_sev','inj_sevname']]
#only take people inside in-transport vehicle in crash
df=df.loc[(df['per_typname']=='Unknown Occupant Type in a Motor Vehicle In- Transport') |
       (df['per_typname']=='Driver of a Motor Vehicle In-Transport') |
       (df['per_typname']=='Passenger of a Motor Vehicle In-Transport'),:]
#replace the term for readability
df.replace({'Driver of a Motor Vehicle In-Transport':'Driver',
            'Passenger of a Motor Vehicle In-Transport':'Passenger',
            'Unknown Occupant Type in a Motor Vehicle In- Transport':'Unknown'},inplace=True)
df.to_csv('data/PERSON.csv',index=False)

  exec(code_obj, self.user_global_ns, self.user_ns)
