In [21]:
import pandas as pd
import numpy as np
import sklearn
import panel as pn
import re
from bokeh.plotting import figure, show
from bokeh.models import BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter
# Initialise the Panel extension with its `comms` option set to 'vscode'.
# NOTE(review): presumably needed so Panel output renders inside VS Code
# notebooks — confirm before running this notebook in another front end.
pn.extension(comms='vscode')

In [22]:
# Import the data dictionary using a custom (project-local) module.
import crash_dictionary
# Build the dictionary helper from the data-dictionary CSV.
crash_info = crash_dictionary.CrashDict('./data/crash_dictionary.csv')
# Preview: label lookup for a list of variable names.
display(crash_info.get_labels(['gcs','source']))
# Preview: detail lookup for a single variable name.
display(crash_info.get_details('cause'))
# Last expression of the cell, so the notebook renders its result as the cell output.
crash_info.get_df()

Unnamed: 0_level_0,Labels
Name,Unnamed: 1_level_1
gcs,Glasgow Coma Score Total
source,Method of Transmission of Entry Form to CC


Unnamed: 0_level_0,Labels,Units,Levels,Class,Storage,NAs
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cause,Main Cause of Death,,7.0,,integer,17118


Unnamed: 0_level_0,Labels,Units,Levels,Class,Storage,NAs
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
entryid,Unique Numbers for Entry Forms,,,integer,integer,0
source,Method of Transmission of Entry Form to CC,,5.0,,integer,0
trandomised,Date of Randomization,,,Date,double,0
outcomeid,Unique Number From Outcome Database,,,integer,integer,80
sex,,,2.0,,integer,1
age,,,,,integer,4
injurytime,Hours Since Injury,,,numeric,double,11
injurytype,,,3.0,,integer,0
sbp,Systolic Blood Pressure,mmHg,,integer,integer,320
rr,Respiratory Rate,/min,,integer,integer,191


## Cleaning happens elsewhere!
Check to see if the loaded dataframe has any missing values. If so, go back to the cleanup notebook, deal with the missing values, and export a clean csv to load.

In [34]:
def check_na(my_df):
    """Verify that a DataFrame contains no missing values.

    Each column's non-NA count is compared with the number of rows in the
    frame; a column that falls short holds at least one missing value.

    Parameters
    ----------
    my_df : pandas.DataFrame
        The frame to validate.

    Returns
    -------
    None
        A confirmation message is printed when every column is complete.

    Raises
    ------
    ValueError
        If any column has fewer non-NA entries than the frame has rows.
    """
    total_rows = len(my_df)
    # One boolean per column: True where the column is missing entries.
    has_gaps = (my_df.count() != total_rows).any()
    if has_gaps:
        raise ValueError("No NA values are allowed! Clean the data first.")
    print("No NA values found. This df is ready for analysis!")


In [35]:
## test check_na()
# Smoke test: run check_na on the raw Stata export; the recorded run below
# shows it being rejected for missing values. The error is printed rather
# than raised so a reader can confirm it is the "clean the data" message.
# Bind the name before the try block so the cleanup in `finally` cannot
# raise NameError (on top of the real failure) when reading the file fails.
test_df = None
try:
    test_df = pd.read_stata('./data/crash2.dta')
    check_na(test_df)
    print("If this line is visible, the test fails!")
except Exception as e:
    print("If the errow below tells you to go clean the data, this test passes!")
    print(f">>> Error: {e}")
finally:
    # Remove the name so the raw frame is not left in the notebook namespace.
    del test_df

If the errow below tells you to go clean the data, this test passes!
>>> Error: No NA values are allowed! Clean the data first.


## Load the data

In [39]:
## Load data
# Read the cleaned CSV and confirm it has no missing values before previewing it.
cleaned_csv_path = "./cleaned_data/cleaned_crash_data.csv"
crash_df = pd.read_csv(cleaned_csv_path)
try:
    check_na(crash_df)
except ValueError as e:
    # check_na reports missing values with ValueError; show its message
    # instead of a traceback. Only this error is caught, so unrelated
    # failures are no longer reduced to a one-line print.
    print(e)
else:
    # Preview only when validation passed.
    display(crash_df.head(3))
    display(crash_df.describe())

No NA values found. This df is ready for analysis!


Unnamed: 0.1,Unnamed: 0,entryid,source,trandomised,outcomeid,sex,age,injurytime,injurytype,sbp,...,bstroke,bbleed,bmi,bgi,bloading,bmaint,btransf,bvii,boxid,packnum
0,1,2,electronic CRF by email,16585.0,190.0,female,27.0,1.0,blunt,100.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2001,28
1,2,3,electronic CRF by email,16581.0,4.0,male,30.0,1.0,blunt,70.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2011,21
2,4,5,electronic CRF by email,16585.0,1154.0,female,23.0,4.0,penetrating,80.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2011,23


Unnamed: 0.1,Unnamed: 0,entryid,trandomised,outcomeid,age,injurytime,sbp,rr,cc,hr,...,bstroke,bbleed,bmi,bgi,bloading,bmaint,btransf,bvii,boxid,packnum
count,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,...,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0,14013.0
mean,10282.924499,10312.232855,17640.933348,10236.617427,34.196104,2.91411,100.782559,22.807964,3.106758,103.821666,...,0.004282,0.074003,0.00207,0.009919,0.997574,0.986013,0.48034,0.001784,5169.218511,51.606437
std,5843.879062,5864.734981,413.293738,5818.172525,13.970844,2.281599,24.220707,6.184919,1.554011,20.040477,...,0.065297,0.261785,0.045446,0.099104,0.0492,0.117441,0.499631,0.042202,2503.796064,25.078755
min,1.0,2.0,16575.0,4.0,14.0,0.1,10.0,1.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2001.0,21.0
25%,5159.0,5166.0,17343.0,5125.0,23.0,1.0,86.0,19.0,2.0,90.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2760.0,27.0
50%,10515.0,10543.0,17715.0,10342.0,30.0,2.0,100.0,22.0,3.0,104.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4507.0,45.0
75%,15354.0,15403.0,17975.0,15266.0,42.0,4.0,112.0,26.0,4.0,120.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,8241.0,83.0
max,20206.0,20270.0,18285.0,20199.0,99.0,72.0,250.0,96.0,30.0,220.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9065.0,98.0


## TODO: PCA_ANALYSIS