# Carcrash Severity Prediction

Downloading the datasets:

Please run `wget https://data.wprdc.org/dataset/3130f583-9499-472b-bb5a-f63a6ff6059a/resource/ec578660-2d3f-489d-9ba1-af0ebfc3b140/download/all-crashes-2004-2018.csv.zip`
or visit this link to get the Allegheny County crash data.

## Load Data

In [19]:
import ckanapi
from pprint import pprint
import pandas as pd
from scipy import stats
from sklearn.metrics import f1_score
site = "https://data.wprdc.org"

In [20]:
def get_resource_data(site,resource_id,count=50):
    # Use the datastore_search API endpoint to get <count> records from
    # a CKAN resource.
    ckan = ckanapi.RemoteCKAN(site)
    response = ckan.action.datastore_search(id=resource_id, limit=count)

    # A typical response is a dictionary like this
    #{u'_links': {u'next': u'/api/action/datastore_search?offset=3',
    #             u'start': u'/api/action/datastore_search'},
    # u'fields': [{u'id': u'_id', u'type': u'int4'},
    #             {u'id': u'pin', u'type': u'text'},
    #             {u'id': u'number', u'type': u'int4'},
    #             {u'id': u'total_amount', u'type': u'float8'}],
    # u'limit': 3,
    # u'records': [{u'_id': 1,
    #               u'number': 11,
    #               u'pin': u'0001B00010000000',
    #               u'total_amount': 13585.47},
    #              {u'_id': 2,
    #               u'number': 2,
    #               u'pin': u'0001C00058000000',
    #               u'total_amount': 7827.64},
    #              {u'_id': 3,
    #               u'number': 1,
    #               u'pin': u'0001C01661006700',
    #               u'total_amount': 3233.59}],
    # u'resource_id': u'd1e80180-5b2e-4dab-8ec3-be621628649e',
    # u'total': 88232}
    data = response['records']
    return data

In [21]:
crash_data_2017 = get_resource_data(site,resource_id="bf8b3c7e-8d60-40df-9134-21606a451c1a",count=999999999) 

In [22]:
len(crash_data_2017)
df = pd.DataFrame(crash_data_2017)
print(df.head())

   UNBELTED  WORK_ZONE URBAN_RURAL  DRIVER_20YR  MC_DRINKING_DRIVER  \
0         0          0           3            0                   0   
1         0          0           3            0                   0   
2         0          0           3            0                   0   
3         0          0           3            0                   0   
4         0          0           3            0                   0   

   CELL_PHONE  PROPERTY_DAMAGE_ONLY ACCESS_CTRL  HIT_GDRAIL LOCATION_TYPE  \
0           0                     0        None           0            00   
1           0                     1        None           0            00   
2           0                     0        None           0            00   
3           0                     1        None           0            00   
4           0                     0        None           0            00   

   ...  CROSS_MEDIAN  WZ_LN_CLOSURE  LOCAL_ROAD  DRUG_RELATED  PEDESTRIAN  \
0  ...             0           No

## Data Visualization

TODO: Add a graph/animation to show the car crash data
maybe the geospatial data?

## Data Cleanup

In [23]:
import pandas as pd
import wget
from pathlib import Path
import numpy as np

#clean sch_bus_ind y/n to integer 0,1

def type_boolean(c):
    if c == "Y": return 1
    elif c == "N": return 0
    # elif c == "nan": return np.nan
    else:
        return np.nan
    # raise ValueError(c)

def ROAD_CONDITION(c): # 8 is other 9 is unknown, 1,7->2, 3->4, 4->3, 5,6->5, 2,8,9->nan
    if c == 1 or c == 7:
        return 2
    elif c == 3:
        return 4
    elif c == 4:
        return 3
    elif c == 5 or c == 6:
        return 5
    else:
        return np.nan

def INTERSECT_TYPE(c): # 10 is other 99 is unkonw
    if c <= 9:
        return c
    else:
        return np.nan

def ILLUMINATION(c):
    if c <= 6:
        return c
    else:
        return np.nan
    
def WEATHER(c):
    if c <= 7:
        return c
    else:
        return np.nan
    
def TIME(c): # extract only the hour
    if c <= 2500:
        return c // 100
    else:
        return np.nan

if not Path('all-crashes-2004-2018.csv.zip').exists():
    wget.download("https://data.wprdc.org/dataset/3130f583-9499-472b-bb5a-f63a6ff6059a/resource/ec578660-2d3f-489d-9ba1-af0ebfc3b140/download/all-crashes-2004-2018.csv.zip")
# zf = zipfile.ZipFile('all-crashes-2004-2018.csv.zip') 
df_io = pd.read_csv('all-crashes-2004-2018.csv.zip')
# print(df.head())
# print(list(df))
# static_columns = "ROAD_CONDITION,INTERSECT_TYPE,URBAN_RURAL,DISTRICT,STATE_ROAD,LOCAL_ROAD,SNOW_SLUSH_ROAD,LANE_CLOSED,TIME_OF_DAY,SPEED_LIMIT"
# dynamic_columns = "ILLUMINATION,MOTORCYCLE_COUNT,HEAVY_TRUCK_COUNT,WEATHER,HAZARDOUS_TRUCK,SCH_BUS_IND,AUTOMOBILE_COUNT"
# output_columns = "PERSON_COUNT,FATAL_COUNT,INJURY_COUNT,MAX_SEVERITY_LEVEL,MAJOR_INJURY"
# df_io = df[(static_columns+","+dynamic_columns+","+output_columns).split(',')]
# print(df_io.head())
# print(df_io.dtypes)
# print(df_io.info())

# df_io['SCH_BUS_IND'] = df_io['SCH_BUS_IND'].apply(type_boolean)



# df_io['ROAD_CONDITION'] = df_io['ROAD_CONDITION'].apply(ROAD_CONDITION)
# df_io['INTERSECT_TYPE'] = df_io['INTERSECT_TYPE'].apply(INTERSECT_TYPE)
# df_io['ILLUMINATION'] = df_io['ILLUMINATION'].apply(ILLUMINATION)
# df_io['WEATHER'] = df_io['WEATHER'].apply(WEATHER)
# df_io['TIME_OF_DAY'] = df_io['TIME_OF_DAY'].apply(TIME)

# df_io = df_io.astype("Int64")

# print(df_io.head())
# print(df_io.dtypes)
# print(df_io.info())

# drop col that will not be used
static = ['ROAD_CONDITION', 'INTERSECT_TYPE', 'LANE_CLOSED', 'TIME_OF_DAY', 'SPEED_LIMIT', 'ILLUMINATION']
dynamic = ['MOTORCYCLE_COUNT', 'HEAVY_TRUCK_COUNT', 'HAZARDOUS_TRUCK', 'AUTOMOBILE_COUNT', 'SCH_BUS_IND', 'WEATHER']
label = ['PERSON_COUNT', 'FATAL_COUNT', 'INJURY_COUNT', 'MAX_SEVERITY_LEVEL', 'MAJOR_INJURY']
categorical = ['ROAD_CONDITION', 'INTERSECT_TYPE', 'LANE_CLOSED', 'ILLUMINATION', 'HAZARDOUS_TRUCK', 'SCH_BUS_IND', 'WEATHER']
gussian = ['TIME_OF_DAY', 'SPEED_LIMIT', 'MOTORCYCLE_COUNT', 'HEAVY_TRUCK_COUNT', 'AUTOMOBILE_COUNT']
data = static + dynamic
for col in df_io.columns:
    if col not in static and col not in dynamic and col not in label:
        df_io.drop(col, axis = 1, inplace = True)  
# print(df_io[15:25])
# df_io['TIME_OF_DAY'] = df_io['TIME_OF_DAY'].astype("Int64")


# clean data
df_io['SCH_BUS_IND'] = df_io['SCH_BUS_IND'].apply(type_boolean)
df_io['ROAD_CONDITION'] = df_io['ROAD_CONDITION'].apply(ROAD_CONDITION)
df_io['INTERSECT_TYPE'] = df_io['INTERSECT_TYPE'].apply(INTERSECT_TYPE)
df_io['ILLUMINATION'] = df_io['ILLUMINATION'].apply(ILLUMINATION)
df_io['WEATHER'] = df_io['WEATHER'].apply(WEATHER)
df_io['TIME_OF_DAY'] = df_io['TIME_OF_DAY'].apply(TIME)

# drop rows contain nan
df_io = df_io.dropna()
df_io = df_io.astype("int64")
df_io[categorical] = df_io[categorical].astype("object")
print(df_io[15:25])
# group data into dataset/label
df_data = df_io[data].copy()
df_label = df_io[label].copy()
print(df_data.info())
print(df_label.info())

  interactivity=interactivity, compiler=compiler, result=result)


    TIME_OF_DAY ILLUMINATION WEATHER ROAD_CONDITION INTERSECT_TYPE  \
29           11            1       1              5              0   
48           13            1       4              2              0   
52           10            1       2              2              0   
53           15            1       1              2              0   
54           16            1       1              3              0   
59           13            1       2              2              0   
61           11            1       4              5              0   
67           21            2       4              4              2   
69            7            1       4              5              0   
70           10            1       4              3              0   

   SCH_BUS_IND  PERSON_COUNT  AUTOMOBILE_COUNT  MOTORCYCLE_COUNT  \
29           0             2                 2                 0   
48           0             3                 2                 0   
52           0           

## Train Test Split

In [24]:
# split the dataset with 33% as train data
from sklearn.model_selection import train_test_split
comb_df_data = df_io[data].copy()
static_df_data = df_io[static].copy()
dynamic_df_data = df_io[dynamic].copy()

# static
smj_X_train, smj_X_test, smj_y_train, smj_y_test = train_test_split(static_df_data, df_io['MAJOR_INJURY'], test_size=0.33, random_state=42)
sseverity_X_train, sseverity_X_test, sseverity_y_train, sseverity_y_test = train_test_split(static_df_data, df_io['MAX_SEVERITY_LEVEL'], test_size=0.33, random_state=42)

# dynamic
dmj_X_train, dmj_X_test, dmj_y_train, dmj_y_test = train_test_split(dynamic_df_data, df_io['MAJOR_INJURY'], test_size=0.33, random_state=42)
dseverity_X_train, dseverity_X_test, dseverity_y_train, dseverity_y_test = train_test_split(dynamic_df_data, df_io['MAX_SEVERITY_LEVEL'], test_size=0.33, random_state=42)

# combined
cmj_X_train, cmj_X_test, cmj_y_train, cmj_y_test = train_test_split(df_data, df_io['MAJOR_INJURY'], test_size=0.33, random_state=42)
cseverity_X_train, cseverity_X_test, cseverity_y_train, cseverity_y_test = train_test_split(df_data, df_io['MAX_SEVERITY_LEVEL'], test_size=0.33, random_state=42)



In [25]:
class GaussianPredictor:
    
    """ Feature predictor for a normally distributed real-valued, continuous feature.

        attr:
            k : int -- number of classes
            mu : np.ndarray[k] -- vector containing per class mean of the feature
            sigma : np.ndarray[k] -- vector containing per class std. deviation of the feature
    """
    
    def __init__(self, k):
        """ constructor

        args : k -- number of classes
        """
        self.k = k
        self.mu = np.zeros(k)
        self.sigma = np.zeros(k)
        pass

    def fit(self, x, y):
        """update predictor statistics (mu, sigma) for Gaussian distribution

        args:
            x : pd.Series -- feature values
            y : np.Series -- class labels
            
        return : GaussianPredictor -- return self for convenience
        
        """
        # df = pd.DataFrame({"values":x,"labels":y})
        # groups = df.groupby("labels")
        # self.mu = np.array((groups.mean()))
        # self.sigma = np.sqrt(np.array(groups.var(ddof=0)))
        y=np.array(y)
        x=np.array(x)
        
        
        for i in range(self.k):
            ybools = (y==i)
            # given_i = np.array([x[j] for j in range(len(y)) if y[j] == i])
            given_i = np.extract(ybools,x)
            self.mu[i]=given_i.mean()
            self.sigma[i]=np.sqrt(given_i.var())
        return self
            
    def partial_log_likelihood(self, x):
        """ log likelihood of feature values x according to each class

        args:
            x : pd.Series -- feature values

        return: np.ndarray[self.k, len(x)] : log likelihood for this feature for each class
        """
        logpdfs = np.zeros((self.k,len(x)))
        for i in range(self.k):
            logpdfs[i]=stats.norm(loc=self.mu[i],scale=self.sigma[i]).logpdf(x)
        return logpdfs

In [26]:
class CategoricalPredictor:
    """ Feature predictor for a categorical feature.

        attr: 
            k : int -- number of classes
            p : Dict[feature_value, np.ndarray[k]] -- dictionary of vectors containing per-class probability of a feature value;
    """
    
    def __init__(self, k):
        """ constructor

        args : k -- number of classes
        """
        self.k=k

    def fit(self, x, y, alpha=1.):
        """ initializes the predictor statistics (p) for Categorical distribution
        
        args:
            x : pd.Series -- feature values
            y : pd.Series -- class labels
        
        kwargs:
            alpha : float -- smoothing factor

        return : CategoricalPredictor -- returns self for convenience:
        """
        y=np.array(y)
        x=np.array(x)
        self.p = {}
        for char in set(x):
#             print(self.k)
#             print(char)
#             print(self.p)
            self.p[char]=np.zeros(self.k)
            
        
        for i in range(self.k):
            # n=sum([1 for j in range(len(y)) if y[j] == i])
            ybools= (y==i)
            
            n=np.sum( ybools )
            for char in set(x):
                xbools= (x == char)
                # nj=sum([1 for j in range(len(y)) if x[j] == char and y[j] == i ])
                nj = np.sum(np.logical_and(xbools,ybools))
                (self.p[char])[i] = (nj + alpha)/ (n+len(set(x))*alpha)
        return self

    def partial_log_likelihood(self, x):
        """ log likelihood of feature values x according to each class

        args:
            x : pd.Series -- vector of feature values

        return : np.ndarray[self.k, len(x)] -- matrix of log likelihood for this feature
        """
        like = np.zeros((self.k,len(x)))
        
        for i in range(self.k):
            for j,char in enumerate(x):
                like[i][j]=np.log(self.p[char][i])
        return like

In [27]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
# not wokring properly
class NaiveBayesClassifier:
    def __init__(self, df_feature, df_label, alpha=1.):
        self.predictor = {}
        self.l = df_label.value_counts()
        self.unique_label = len(df_label.unique())
        self.log_prior = np.zeros(self.unique_label)
        n = len(df_label)
        print(n, self.unique_label)
        # Initialize log_prior
        for i in range(self.unique_label):
            ybools= (label==i)
            nt=np.sum( ybools )
            self.log_prior[i] = np.log((nt+alpha)/(n+(self.unique_label*alpha)))
            
        # Initialize predictor
        for col in df_feature.columns:
            if df_feature[col].dtype == 'object':
#                 print(df_feature[col].shape)
#                 print(df_feature[col].values.reshape(-1,1))
#                 self.predictor[col] = CategoricalPredictor(self.unique_label).fit(df_feature[col], df_label)
                self.predictor[col] = MultinomialNB().fit(df_feature[col].values.reshape(-1,1), df_label)
            elif df_feature[col].dtype == 'int64':
#                 self.predictor[col] = GaussianPredictor(self.unique_label).fit(df_feature[col], df_label)
                self.predictor[col] = GaussianNB().fit(df_feature[col].values.reshape(-1,1), df_label)
        
    def log_likelihood(self, df_feature):
        likelihood = np.array(([self.log_prior,]*len(df_feature)))
        for col in self.predictor:
            model = self.predictor[col]
#             print(df_feature[col].values.reshape(-1,1))
#             print(model.predict_log_proba(df_feature[col].values.reshape(-1,1)))
#             print(collections.Counter(df_feature[col].values.reshape(-1,1)))
#             unique, counts = np.unique(df_feature[col].values.reshape(-1,1), return_counts=True)
#             p = dict(zip(unique, counts))
#             print (p)

            z = model.predict_log_proba(df_feature[col].values.reshape(-1,1))
            likelihood += z
#             likelihood += model.partial_log_likelihood(df_feature[col])
#             print(col)
#             for i in range(z.shape[0]):
#                 print(collections.Counter(z[i]))
#         print(likelihood.shape)
        return likelihood 
        
    def predict(self, df_feature):
        pred = np.argmax(self.log_likelihood(df_feature),axis = 1)
        return pred
        
    

In [28]:
# working 
class NaiveBayesClassifier3:
    """ Naive Bayes classifier for a mixture of continuous and categorical attributes.
        We use GaussianPredictor for continuous attributes and CategoricalPredictor for categorical ones.
        
        attr:
            predictor : Dict[column_name,model] -- model for each column
            log_prior : np.ndarray -- the (log) prior probability of each class
    """

    def __init__(self, df, label, alpha=1.):
        """initialize predictors for each feature and compute class prior
        
        args:
            df : pd.DataFrame -- processed dataframe, without any missing values.
        
        kwargs:
            alpha : float -- smoothing factor for prior probability
        """
        # label = df["label"]
        self.predictor = {}
        self.l = label.value_counts()
        self.unique_label = len(label.unique())
        print(self.unique_label)
        self.log_prior = np.zeros(self.unique_label)
        n = len(label.unique())
        print(n)
        # Initialize log_prior
        for i in range(self.unique_label):
            ybools= (label==i)
            nt=np.sum( ybools )
            self.log_prior[i] = np.log( (nt+alpha)/(n+(self.unique_label*alpha)))
        types = dict(df.dtypes)
        for key in types:
            if key != "label" and key!= "index":
                if str(types[key])=="int64":
                    self.predictor[key]=GaussianPredictor(n).fit(df[key],label)
                elif str(types[key])=="object":
                    self.predictor[key]=CategoricalPredictor(n).fit(df[key],label)
                else:
                    raise TypeError
                                

    def log_likelihood(self, x):
        """log_likelihood for input instances from log_prior and partial_log_likelihood of feature predictors

        args:
            x : pd.DataFrame -- processed dataframe (ignore label if present)

        returns : np.ndarray[num_classes, len(x)] -- array of log-likelihood
        """

        like = np.array(([self.log_prior,]*len(x))).transpose()

        for key in self.predictor:
            model = self.predictor[key]
            z= model.partial_log_likelihood(x[key])
            like +=z
            print(key)
            for i in range(z.shape[0]):
                print(collections.Counter(z[i]))
                             
        return like           

    def predict(self, x):
        """predicts label for input instances, breaks ties in favor of the class with lower id.

        args:
            x : pd.DataFrame -- processed dataframe (ignore label if present)

        returns : np.ndarray[len(x)] -- vector of class labels
        """
        pred = np.argmax(self.log_likelihood(x),axis = 0)
        return pred
# working
class NaiveBayesClassifier2:
    """ Naive Bayes classifier for a mixture of continuous and categorical attributes.
        We use GaussianPredictor for continuous attributes and CategoricalPredictor for categorical ones.
        
        attr:
            predictor : Dict[column_name,model] -- model for each column
            log_prior : np.ndarray -- the (log) prior probability of each class
    """

    def __init__(self, df, label, alpha=1.):
        """initialize predictors for each feature and compute class prior
        
        args:
            df : pd.DataFrame -- processed dataframe, without any missing values.
        
        kwargs:
            alpha : float -- smoothing factor for prior probability
        """
        # label = df["label"]
        k = max(label)+1
        self.log_prior = np.zeros(k)
        n = len(label)
        for i in range(k):
            ybools= (label==i)          
            nt=np.sum( ybools )
            self.log_prior[i] = np.log( (nt+alpha)/(n+(k*alpha)))
        
        self.predictor = dict()
        types = dict(df.dtypes)
        for key in types:
            if key != "label" and key!= "index":
                if str(types[key])=="int64":
                    self.predictor[key]=GaussianPredictor(k).fit(df[key],label)
                elif str(types[key])=="object":
                    self.predictor[key]=CategoricalPredictor(k).fit(df[key],label)
                else:
                    raise TypeError
                    
        self.k=k            
        pass

    def log_likelihood(self, x):
        """log_likelihood for input instances from log_prior and partial_log_likelihood of feature predictors

        args:
            x : pd.DataFrame -- processed dataframe (ignore label if present)

        returns : np.ndarray[num_classes, len(x)] -- array of log-likelihood
        """
        # try:
        #     x = x.drop("label")
        # except:
        #     pass
        
        # like = np.zeros((self.k, len(x)))
        like = np.array(([self.log_prior,]*len(x))).transpose()
        # print (self.k,len(x))
        # print(like.shape)
        # for i in range(self.k):
        #     for j in range(len(x)):
        #         
        #         # like[i][j] = self.log_prior[i]+np.sum( self.predictor[key].partial_log_likelihood(x[j])[i][j] for key in self.predictor )
        #         like[i][j]=self.log_prior[i]
        for key in self.predictor:
            model = self.predictor[key]
            z= model.partial_log_likelihood(x[key])
            like +=z
#             print(key)
#             for i in range(z.shape[0]):
#                 print(collections.Counter(z[i]))
            # like[i][j] += model.partial_log_likelihood(x[key])[i][j]
                             
        return like           

    def predict(self, x):
        """predicts label for input instances, breaks ties in favor of the class with lower id.

        args:
            x : pd.DataFrame -- processed dataframe (ignore label if present)

        returns : np.ndarray[len(x)] -- vector of class labels
        """
        pred = np.argmax(self.log_likelihood(x),axis = 0)
        return pred


## Label normalization

In [29]:
import collections
def label_normalization(label,label_test):
    # print(label)
    labellist = pd.Series(np.append(label,label_test)).unique()
#     print(labellist)
    labellist.sort()
    revdic = dict()
    for i,z in enumerate(labellist):
        revdic[z] = i
#     print (revdic)
    f = (lambda s: revdic[s])
    return np.array(list(map(f, label))),np.array(list(map(f, label_test)))

## Accuracy estimate

In [36]:
def cal_accuracy(y_pred, y_test):
    correct_counter = 0
    total = len(y_test)
    if total != len(y_pred):
        print("param size not match.")
    for idx in range(total):
        if (int)(y_pred[idx]) == (int)(y_test[idx]):
            correct_counter += 1
    print(correct_counter, total)
    return correct_counter / total

def f1(pred, ground):
    """ evaluates a classifier based on a supplied validation data

    args:
        pred: numpy.ndarray(bool) -- predictions
        ground: numpy.ndarray(bool) -- known ground-truth values
    
    return : double -- the F1 score of the predictions
    """
    pred = np.array(pred, dtype=bool)
    ground = np.array(ground, dtype=bool)
    realT = sum(ground)
    predT = sum(pred)
    bothT = float(sum([1 for i in range(ground.shape[0]) if pred[i] and ground[i]]))
    precision = bothT/predT
    recall = bothT/realT
    print("precision and recal in f1:",precision, recall)
    return 2*(precision*recall)/(precision+recall)

## Environment Static Analysis


In [31]:
# Normalize the labels
ss_train,ss_test = label_normalization(sseverity_y_train,sseverity_y_test)
smj_train, smj_test = label_normalization(smj_y_train,smj_y_test)

# Severity level
ss_classifier = NaiveBayesClassifier2(sseverity_X_train, pd.Series(ss_train))
ss_predict = ss_classifier.predict(sseverity_X_test)
c = collections.Counter(ss_predict)
d = collections.Counter(ss_test)
print("Accuracy: ", cal_accuracy(ss_predict, ss_test))

# Major injury
smj_classifier = NaiveBayesClassifier2(smj_X_train, pd.Series(smj_y_train))
smj_predict = smj_classifier.predict(smj_X_test)
c = collections.Counter(smj_predict)
d = collections.Counter(smj_test)
print("Accuracy: ", cal_accuracy(smj_predict, smj_test))



# d = collections.Counter(sseverity_y_test)
print(c)
print(d)
# print(sseverity_y_test)
# print(label_normalization(sseverity_y_test))

8118 14392
Accuracy:  0.5640633685380767
14209 14392
Accuracy:  0.9872846025569761
Counter({0: 14392})
Counter({0: 14209, 1: 183})


## Dynamic Analysis

In [39]:
# Normalize the labels
ds_train, ds_test = label_normalization(dseverity_y_train,dseverity_y_test)
dmj_train, dmj_test = label_normalization(dmj_y_train,dmj_y_test)

# Severity level
ds_classifier = NaiveBayesClassifier2(dseverity_X_train, pd.Series(ds_train))
ds_predict = ds_classifier.predict(dseverity_X_test)
c = collections.Counter(ds_predict)
d = collections.Counter(ds_test)
print(c)
print(d)
print("Accuracy: ", cal_accuracy(ds_predict, ds_test))

# Major injury
dmj_classifier = NaiveBayesClassifier2(dmj_X_train, pd.Series(dmj_y_train))
dmj_predict = dmj_classifier.predict(dmj_X_test)
c = collections.Counter(dmj_predict)
d = collections.Counter(dmj_test)
print(c)
print(d)
print("Accuracy: ", cal_accuracy(dmj_predict, dmj_test))
print("F1: ",f1(dmj_predict,dmj_test))
print("F1': ",f1_score(dmj_predict,dmj_test))

dumj_predict = np.zeros(dmj_test.shape)
print("predict all zeros Accuracy: ", cal_accuracy(dumj_predict, dmj_test))


Counter({0: 13845, 1: 532, 4: 15})
Counter({0: 8121, 4: 2625, 5: 1778, 3: 1155, 6: 478, 2: 172, 1: 63})
7871 14392
Accuracy:  0.5469010561423013
Counter({0: 14315, 1: 77})
Counter({0: 14209, 1: 183})
14136 14392
Accuracy:  0.9822123401889938
precision and recal in f1: 0.025974025974025976 0.01092896174863388
F1:  0.015384615384615385
F1':  0.015384615384615385
14209 14392
Accuracy:  0.9872846025569761
precision and recal in f1: nan 0.0
F1:  nan
F1':  0.0


  'recall', 'true', average, warn_for)


## Combined Analysis

In [40]:
# Normalize the labels
cs_train, cs_test = label_normalization(cseverity_y_train, cseverity_y_test)
cmj_train, cmj_test = label_normalization(cmj_y_train, cmj_y_test)

# Severity level
cs_classifier = NaiveBayesClassifier2(cseverity_X_train, pd.Series(cs_train))
cs_predict = cs_classifier.predict(cseverity_X_test)
c = collections.Counter(cs_predict)
d = collections.Counter(cs_test)
print(c)
print(d)
print("Accuracy: ", cal_accuracy(cs_predict, cs_test))

# Major injury
cmj_classifier = NaiveBayesClassifier2(cmj_X_train, pd.Series(cmj_y_train))
cmj_predict = cmj_classifier.predict(cmj_X_test)
c = collections.Counter(cmj_predict)
d = collections.Counter(cmj_test)
print(c)
print(d)
print("Accuracy: ", cal_accuracy(cmj_predict, cmj_test))
print("F1: ",f1(cmj_predict,cmj_test))

dumj_predict = np.zeros(dmj_test.shape)
print("predict all 0s Accuracy: ", cal_accuracy(dumj_predict, cmj_test))

Counter({0: 13817, 1: 526, 4: 46, 3: 2, 5: 1})
Counter({0: 8121, 4: 2625, 5: 1778, 3: 1155, 6: 478, 2: 172, 1: 63})
7867 14392
Accuracy:  0.5466231239577543
Counter({0: 14257, 1: 135})
Counter({0: 14209, 1: 183})
14082 14392
Accuracy:  0.9784602556976097
precision and recal in f1: 0.02962962962962963 0.02185792349726776
F1:  0.02515723270440252
14209 14392
Accuracy:  0.9872846025569761


## Real-Time Severity Prediction