In [6]:
import pandas as pd
import numpy as np
from numpy.random import randn

In [13]:
# Build a 10-element random series and blank out the middle six entries.
ts = pd.Series(randn(10))
ts.iloc[2:-2] = np.nan  # explicit positional slice: keeps the first two and last two values

# `Series.to_sparse` was removed in pandas 1.0; casting to a SparseDtype is the
# replacement. Zero is the fill value, so only the non-zero entries are stored.
sts = ts.fillna(0).astype(pd.SparseDtype("float64", fill_value=0))

# Fraction of entries that are actually stored (i.e. differ from the fill value).
sts.sparse.density


0.4

0   -1.562886
1   -1.796891
2    0.000000
3    0.000000
4    0.000000
5    0.000000
6    0.000000
7    0.000000
8    1.812606
9   -0.309717
dtype: float64
BlockIndex
Block locations: array([0, 8])
Block lengths: array([2, 2])

Import data and modules
-----

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
%matplotlib inline


# Load the tab-separated dataset, using the 'Obs.' column as the row index.
data = pd.read_csv('LargeDatabase.txt', engine = 'python', sep='\t', index_col='Obs.')
# Drop the 'Unnamed: 94' column. The axis is named by keyword (`columns=`)
# because pandas 2.0 no longer accepts it as a positional argument to `drop`.
data = data.drop(columns='Unnamed: 94')
#data.columns = ['state','caseNum','vehicleNum','personNum',
#              'atmCond1','atmCond2','county',
#              'crashMonth','dayOfWeek','firstHarmfulEvent',
#              'numFatalities','age','alcohol','drug']

In [None]:
# Select interesting stuff
# --------------

# Load Code manual
import pickle
# NOTE: unpickling can execute arbitrary code -- only load a CodeManual.p you trust.
# The context manager closes the file handle as soon as the manual is loaded
# (the bare open() call used before left it open).
with open("CodeManual.p", "rb") as code_manual_file:
    code_manual = pickle.load(code_manual_file)

# Columns kept for the analysis. Each dataset column name is paired with the
# name under which that column is looked up in `code_manual`.
translate_column_names = dict([
    ('driverdrowsy', 'Drowsy driver'),
    ('ptype', 'Person Type'),
    ('druginv', 'Police Reported Drug Involvement'),
    ('heavytruck', 'Large Truck Related'),
    ('schlbus', 'School Bus Related'),
    ('sex', 'Sex'),
    ('race', 'Race'),
    ('reljuncinter', 'Relation To Junction: Within Interchange Area'),
    ('atmcond', 'Atmospheric Condition (1)'),
    ('holiday', 'Holiday Related'),
    ('nhs', 'National Highway System'),
    ('hispanic', 'Hispanic'),
    ('rfun', 'Roadway Function Class'),
    ('lightcond', 'Light Condition'),
    ('speeding', 'Speeding'),
    ('dayofweek', 'Day Of Week'),
])

# Extra columns to keep that have no entry in `translate_column_names`.
also_interesting = ['age', 'alcres']

# Full list of selected columns: the translated ones first, then the extras.
labelsOfInter = [*translate_column_names, *also_interesting]

# Restrict the frame to the selected columns and make every cell an int;
# a '.' cell is replaced by -1.
data = data[labelsOfInter].applymap(lambda cell: int(cell) if cell != '.' else -1)

Reduce dimensionality
-----------

In [None]:
# Collapse some features values
# ------------

# Bucket a fatality count into one of three categories
def NumFatalitiesToCategory(num):
    """Return 1 for counts below 3, 2 for counts from 3 up to (not including) 6, else 3."""
    if num < 3:
        return 1
    return 2 if num < 6 else 3

# Reduce categories in 'Light Condition'
# Add labels for merged codes to the 'Light Condition' entry of the code manual.
# 11 is the code Collaps_lightcond returns for inputs 4 and 5.
# NOTE(review): Collaps_lightcond never returns 12, so the 'Dark' label looks
# unused -- confirm whether some codes were meant to be mapped to 12.
code_manual[translate_column_names['lightcond']][11] = 'Dawn/Dusk'
code_manual[translate_column_names['lightcond']][12] = 'Dark'
def Collaps_lightcond(num):
    """Collapse a 'Light Condition' code.

    Codes 4 and 5 become 11, codes 6, 8 and 9 become -1 (unknown);
    any other code is returned unchanged.
    """
    if num in (4, 5):
        return 11
    return -1 if num in (6, 8, 9) else num

# Reduce categories in "Roadway Function Class"
# Add labels for the merged codes produced by Collaps_rfun to the code manual:
# 21 (returned for inputs 1-9) and 22 (returned for inputs 11-19).
code_manual[translate_column_names['rfun']][21] = 'Rural'
code_manual[translate_column_names['rfun']][22] = 'Urban'
def Collaps_rfun(num):
    """Collapse a 'Roadway Function Class' code.

    Codes 1-9 become 21, codes 11-19 become 22 and 99 becomes -1 (unknown);
    any other code is returned unchanged.
    """
    if num == 99:  # unknown
        return -1
    if num in range(1, 10):
        return 21
    if num in range(11, 20):
        return 22
    return num


# Reduce categories in "Holiday Related"
# Relabel the codes that remain after Collaps_holiday: every positive code is
# merged into 1, while codes that are not positive (such as -1) pass through unchanged.
code_manual[translate_column_names['holiday']][-1] = 'Not Holiday or Unknow date'
code_manual[translate_column_names['holiday']][1] = 'Was Holiday'
def Collaps_holiday(num):
    """Collapse a 'Holiday Related' code: any positive code becomes 1, others are unchanged."""
    return 1 if num > 0 else num


# Reduce categories in "Atmospheric Condition (1)"
# Labels for the codes that Collaps_atmcond keeps or merges other codes into.
code_manual[translate_column_names['atmcond']][1] = 'Clear or Cloudy'
code_manual[translate_column_names['atmcond']][2] = 'Precipitation'
code_manual[translate_column_names['atmcond']][4] = 'Snow'
# No trailing comma after the string: a trailing comma stores a 1-tuple
# instead of the label text.
code_manual[translate_column_names['atmcond']][6] = "Severe Crosswinds, Blowing Sand, Soil, Dirt"
def Collaps_atmcond(num):
    """Collapse an 'Atmospheric Condition (1)' code.

    Each row of the table below lists a group of input codes and the single
    code they are merged into; a code found in no group is returned unchanged.
    """
    merge_table = (
        ((8, 0, 98, 99), -1),  # other or not known
        ((1, 10), 1),
        ((2, 12, 3), 2),
        ((6, 7), 6),
    )
    for members, merged_code in merge_table:
        if num in members:
            return merged_code
    return num


# Reduce categories in "Person Type"
# Add labels for the merged codes produced by Collaps_ptype to the code manual:
# 1 (returned for inputs 1, 2, 3 and 9) and 4 (returned for inputs 4, 6, 7 and 8).
code_manual[translate_column_names['ptype']][1] = 'In a Motor vehicle'
code_manual[translate_column_names['ptype']][4] = 'Non-Motor vehicle'
def Collaps_ptype(num):
    """Collapse a 'Person Type' code.

    -1 and 19 become -1 (other / not known); 1, 2, 3 and 9 become 1;
    4, 6, 7 and 8 become 4; any other code is returned unchanged.
    """
    if num in (-1, 19):  # other or not known
        merged_code = -1
    elif num in (1, 2, 3, 9):
        merged_code = 1
    elif num in (4, 6, 7, 8):
        merged_code = 4
    else:
        merged_code = num
    return merged_code

# Reduce categories in "Hispanic"
# Add a label for the merged code 10, which Collaps_hispanic returns for inputs 1-6.
code_manual[translate_column_names['hispanic']][10] = 'Is Hispanic'
def Collaps_hispanic(num):
    """Collapse a 'Hispanic' code.

    -1, 0 and 99 become -1 (other / not known), codes 1-6 become 10;
    any other code is returned unchanged.
    """
    if num in (-1, 0, 99):  # other or not known
        return -1
    return 10 if num in range(1, 7) else num


# Reduce categories in "Race"
# Add labels for the merged codes produced by Collaps_race to the code manual:
# 30 (inputs 3 and 6), 31 (inputs 4, 5, 7, 28, 38, 48, 58, 68, 78)
# and 32 (inputs 18 and 19).
code_manual[translate_column_names['race']][30] = 'American Indian or Hawaiian'
code_manual[translate_column_names['race']][31] = 'Asian (not Indian)'
code_manual[translate_column_names['race']][32] = 'Indian'
def Collaps_race(num):
    """Collapse a 'Race' code.

    Each row of the table below lists a group of input codes and the single
    code they are merged into; a code found in no group is returned unchanged.
    """
    merge_table = (
        ((-1, 0, 98, 99, 97), -1),  # other or not known
        ((3, 6), 30),
        ((4, 5, 7, 28, 38, 48, 58, 68, 78), 31),
        ((18, 19), 32),
    )
    for members, merged_code in merge_table:
        if num in members:
            return merged_code
    return num


In [None]:
# Apply the reduction of category cardinality: each listed column is replaced
# by the result of running its collapse function over every value.
collapse_by_column = (
    ('lightcond', Collaps_lightcond),
    ('rfun', Collaps_rfun),
    ('holiday', Collaps_holiday),
    ('atmcond', Collaps_atmcond),
    ('ptype', Collaps_ptype),
    ('hispanic', Collaps_hispanic),
    ('race', Collaps_race),
)
for column_name, collapse in collapse_by_column:
    data[column_name] = data[column_name].apply(collapse)

In [114]:
# Map different types of 'unknown' to a single number.
# Each feature appears exactly once: a repeated key in a dict literal is
# silently overridden by its last occurrence, so a second 'druginv' entry
# would make the first one dead code.
unknown = {'druginv' : [-1, 8, 9],
           'nhs'    : [9],
           'reljuncinter': [8,9],
           'dayofweek': [-1,9],
           'sex'     : [8,9],
           'age'     : [-1, 998, 999],
           'alcres'  : [95, 96, 97, 98, 99],
          }

# Collapse every code listed as 'unknown' into the single value -1.
# isin/mask does the replacement column-wise instead of calling a Python
# lambda once per cell.
for feature, unknown_codes in unknown.items():
    data[feature] = data[feature].mask(data[feature].isin(unknown_codes), -1)

One-Hot encoder where needed
-------------

In [135]:
def categorize_a_label(dataset, labl_to_categ = 'drug'):
    '''
    Use One-hot encoder on a categorical feature,
    join it back to the original dataframe
    and return the new dataframe

    Parameters
    ----------
    dataset : DataFrame
        Frame containing the column to encode; it is not modified.
    labl_to_categ : str
        Name of the column to encode. Its values are formatted with '%d',
        so they must be numeric.

    Returns
    -------
    DataFrame
        A new frame in which `labl_to_categ` is replaced by one indicator
        column per distinct value, named '<labl_to_categ>:<value>'.
    '''
    indicators = pd.get_dummies(dataset[labl_to_categ])
    indicators.columns = [ '%s:%d'%(labl_to_categ, value) for value in indicators.columns ]
    # `columns=` names the axis explicitly; pandas 2.0 no longer accepts the
    # axis as a second positional argument to `drop`.
    return dataset.join(indicators).drop(columns=labl_to_categ)

# Columns treated as categorical (candidates for one-hot encoding).
categorical_features_list = [
    'lightcond',
    'schlbus',
    'druginv',
    'nhs',
    'speeding',
    'rfun',
    'holiday',
    'atmcond',
    'heavytruck',
    'reljuncinter',
    'ptype',
    'dayofweek',
    'hispanic',
    'race',
    'driverdrowsy',
]
# Columns that are left as single columns.
non_categ_features_list = [
    'sex',
    'age',
    'alcres',
]

# Column the models are trained to predict.
target_name = 'dayofweek'

In [237]:
# One-hot encode every categorical column except the target. The column names
# are snapshotted up front because `data` is rebound inside the loop.
for feature in list(data.keys()):
    if feature == target_name:
        # the target keeps its single-column encoding
        continue
    if feature in categorical_features_list:
        data = categorize_a_label(data, labl_to_categ = feature)
        continue
    if feature not in non_categ_features_list:
        # Column is in neither list: report it and stop processing.
        print("!!! Don't know whether", feature, 'is categorical or not')
        break

Select n features, analyze and return score
-----

In [238]:
def run_analysis(clean_dataset, target_name):
    """Fit a depth-5 decision tree on the first 70% of rows and score it on the rest.

    Parameters
    ----------
    clean_dataset : DataFrame
        Feature columns plus the target column. Rows are split by position,
        without shuffling.
    target_name : str
        Name of the column to predict.

    Returns
    -------
    float
        The value of `clf.score` on the held-out rows (mean accuracy for a
        scikit-learn classifier).
    """
    from sklearn.tree import DecisionTreeClassifier

    # Separate into features / labels, then into train (first 70%) and test parts.
    threshold = int(round(len(clean_dataset)*.70))
    # `columns=` names the axis explicitly; pandas 2.0 no longer accepts the
    # axis as a second positional argument to `drop`.
    all_features = clean_dataset.drop(columns=target_name)
    all_labels = clean_dataset[target_name]

    train_features = all_features.iloc[:threshold]
    test_feature = all_features.iloc[threshold:]
    train_label = all_labels.iloc[:threshold]
    test_label = all_labels.iloc[threshold:]

    # Train decision tree
    clf = DecisionTreeClassifier(max_depth = 5)
    clf.fit(train_features, train_label)

    # `score` predicts internally, so no separate `predict` call is needed.
    return clf.score(test_feature, test_label)

In [251]:
from itertools import combinations as comb
# Scores collected so far, keyed by the string form of each feature combination.
output = dict()
# Column the models are trained to predict.
target_name = 'dayofweek'

In [253]:
ww0 = ''  # second feature most recently printed in the progress output
ww = ''   # first feature most recently printed in the progress output
for sel_feat in comb(labelsOfInter, 3):
    features = list(sel_feat)

    # The target cannot be one of its own predictors: skip such combinations.
    # (list.remove() returns None, so assigning its result to `features`
    # before skipping served no purpose.)
    if target_name in features:
        continue
    # Skip combinations that already have a score in `output`.
    if str(features) in output:
        continue

    # Progress output: a header line whenever the first feature changes,
    # then the second feature whenever it changes.
    if features[0] != ww:
        print('\n--', features[0])
        ww = features[0]
    if features[1] != ww0:
        print(features[1], end=", ")
        ww0 = features[1]

    # There are many keys in the dataset for one feature
    # because we passed the one-hot encoder.
    # We want to select all keys corresponding to the given feature:
    # either the plain column, or a '<feature>:<value>' indicator column.
    # Matching on the ':' separator avoids picking up an unrelated column
    # whose name merely starts with the same letters.
    sel_keys = [key
                for feat in features
                for key in data.keys()
                if key == feat or key.startswith(feat + ':')]
    sel_keys.append(target_name)
    clean = data[sel_keys]

    output[str(features)] = run_analysis(clean, target_name)


-- lightcond
schlbus, druginv, nhs, speeding, rfun, holiday, atmcond, heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- schlbus
druginv, nhs, speeding, rfun, holiday, atmcond, heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- druginv
nhs, speeding, rfun, holiday, atmcond, heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- nhs
speeding, rfun, holiday, atmcond, heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- speeding
rfun, holiday, atmcond, heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- rfun
holiday, atmcond, heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- holiday
atmcond, heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- atmcond
heavytruck, reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- heavytruck
reljuncinter, ptype, hispanic, race, driverdrowsy, sex, age, 
-- reljuncinter
pt

In [254]:
# Turn the collected scores into a Series and show the ten highest.
pp = pd.Series(output)
pp.sort_values(ascending=False).head(10)

['lightcond', 'druginv', 'heavytruck']       0.199625
['lightcond', 'heavytruck', 'alcres']        0.199534
['schlbus', 'speeding', 'heavytruck']        0.199168
['schlbus', 'holiday', 'heavytruck']         0.199168
['schlbus', 'heavytruck', 'reljuncinter']    0.199030
['heavytruck', 'sex', 'age']                 0.198893
['speeding', 'rfun', 'heavytruck']           0.198848
['schlbus', 'heavytruck', 'sex']             0.198802
['rfun', 'heavytruck', 'race']               0.198802
['schlbus', 'heavytruck']                    0.198802
dtype: float64

Export the tree
---------

In [142]:
def rules(clf, features, labels, node_index=0, simple_leaf_string=True):
    """Structure of rules in a fit decision tree classifier

    Parameters
    ----------
    clf : DecisionTreeClassifier
        A tree that has already been fit.

    features, labels : lists of str
        The names of the features and labels, respectively.

    node_index : int
        Index of the tree node to describe (0 is where the default call starts).

    simple_leaf_string : bool
        If true, a leaf is named after the label with the highest count only;
        otherwise its name lists every count, e.g. '4 of Sun, 1 of Mon'.

    Returns
    -------
    dict
        A 'name' for the node and, for non-leaf nodes, a 'children' list
        holding the right subtree followed by the left subtree.
    """
    tree = clf.tree_
    left_index = tree.children_left[node_index]

    if left_index == -1:  # indicates leaf
        count_labels = zip(tree.value[node_index, 0], labels)
        name = ', '.join('{} of {}'.format(int(count), label)
                         for count, label in count_labels)
        if simple_leaf_string:
            # Pass `labels` along so the simplified name comes from the caller's
            # labels rather than from a module-level variable.
            name = simplify_string(name, labels)
        return {'name': name}

    feature = features[tree.feature[node_index]]
    threshold = tree.threshold[node_index]
    right_index = tree.children_right[node_index]
    return {
        'name': '{} > {}'.format(feature, threshold),
        # Right subtree first -- presumably the branch on which the
        # 'feature > threshold' name holds; confirm against the tree library's
        # split convention.
        'children': [rules(clf, features, labels, right_index, simple_leaf_string),
                     rules(clf, features, labels, left_index, simple_leaf_string)],
    }

def simplify_string(st, labels=None):
    """Reduce a leaf description to the label with the highest count.

    Parameters
    ----------
    st : str
        Comma-separated '<count> of <label>' entries, for example
        '772 of Sun, 838 of Mon, 862 of Tue'.

    labels : sequence of str, optional
        Label names in the same order as the entries of `st`. When omitted,
        the module-level `label_dictionary` is used, as before.

    Returns
    -------
    str
        The label at the position of the largest count (the first one on ties).
    """
    if labels is None:
        labels = label_dictionary
    nums = [ int(entry.split()[0]) for entry in st.split(', ') ]
    return labels[nums.index(max(nums))]

In [144]:
# Distinct values present in the target column of the last selected frame.
set(clean['dayofweek'])

{1, 2, 3, 4, 5, 6, 7}

In [145]:
import json
# Day names handed to `rules` as the class labels.
# NOTE(review): presumably ordered to match dayofweek codes 1-7 -- confirm.
label_dictionary = 'Sun Mon Tue Wed Thu Fri Sat'.split()
# NOTE(review): `clf` and `train_features` are not assigned at module level in
# any visible cell (they are locals of run_analysis) -- confirm where they come from.
r = rules(clf, train_features.keys(), label_dictionary, simple_leaf_string=True)
# Serialise the nested rule dict straight into rules.json.
with open('rules.json', 'w') as f:
    json.dump(r, f)

In [146]:
# Preview the first five rows of the training features.
train_features.head(5)

Unnamed: 0_level_0,alcres,age,druginv:-1,druginv:0,druginv:1,atmcond:-1,atmcond:1,atmcond:2,atmcond:4,atmcond:5,atmcond:6,atmcond:11
Obs.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,26,24,1,0,0,0,1,0,0,0,0,0
2,-1,30,1,0,0,0,1,0,0,0,0,0
3,31,52,0,1,0,0,1,0,0,0,0,0
4,-1,17,1,0,0,0,1,0,0,0,0,0
5,-1,18,1,0,0,0,1,0,0,0,0,0
