# Firefighters

## Load the data

In [275]:
import numpy as np
import pandas as pd

data = pd.read_csv('../data/Firefighters.csv')  # load data set
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 5 columns):
Race        118 non-null object
Position    118 non-null object
Oral        118 non-null float64
Written     118 non-null int64
Combine     118 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 4.7+ KB


## Test passers, top scorers

Originally there were 8 Lieutenant and 7 Captain positions and within two years total of 16 Lieutenant and 8 Captain positions.

In [276]:
def is_top_scoring(x,thresh_ltn,thresh_cpt):
    return 1 if ((x[0]=='Captain' and x[1] >= thresh_cpt) or (x[0]=='Lieutenant' and x[1] >= thresh_ltn)) else 0

def get_top_scoring(n_ltn, n_cpt):
    thresh_ltn = min(data.groupby('Position')['Combine'].nlargest(n_ltn+2)['Captain'])
    thresh_cpt = min(data.groupby('Position')['Combine'].nlargest(n_cpt+2)['Lieutenant'])
    return data[['Position','Combine']].apply(is_top_scoring, args=(thresh_ltn,thresh_cpt), axis=1)

passed = data['Combine'].apply(lambda x: int(x >= 70)).rename('Passed')
top_scoring = get_top_scoring(8, 7).rename('Top_k')
top_scoring2 = get_top_scoring(16, 8).rename('Top_k_2y')
data = pd.concat([top_scoring, top_scoring2, passed, data], axis=1)

data.head()

Unnamed: 0,Top_k,Top_k_2y,Passed,Race,Position,Oral,Written,Combine
0,1,1,1,W,Captain,89.52,95,92.808
1,1,1,1,W,Captain,80.0,95,89.0
2,1,1,1,W,Captain,82.38,87,85.152
3,0,0,1,W,Captain,88.57,76,81.028
4,0,0,1,W,Captain,76.19,84,80.876


## Logistic regression


In [277]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

features = data.columns.drop(['Race', 'Position', 'Top_k', 'Top_k_2y', 'Passed']).values
targets = ['Top_k', 'Top_k_2y', 'Passed']

# One-Hot encoding for categorical data
for key in ['Position', 'Race']:
    dummies = pd.get_dummies(data[key]).rename(columns=lambda x: key+'_' + str(x))
    data = pd.concat([data, dummies], axis=1)

# Training loop
predicted_dict = {}
for pos in ['Lieutenant','Captain']:
    data2 = data[data['Position'] == pos]
    X, X_test, y, y_test = train_test_split(data2.iloc[:,1:], data2[targets], test_size=0.4, random_state=12345)
    predicted = pd.concat([X_test['Race'], X_test['Position']], axis=1)
    for key in targets:
        # Train
        lr = LogisticRegression(solver='lbfgs', max_iter=500) 
        lr.fit(X[features], y[key])
        y_pred = lr.predict(X_test[features])
        # Results
        predicted[key] = y_pred
    predicted_dict[pos] = predicted.copy()

In [278]:
predicted_dict['Lieutenant'].head()

Unnamed: 0,Race,Position,Top_k,Top_k_2y,Passed
45,W,Lieutenant,1,1,1
101,W,Lieutenant,0,0,0
61,W,Lieutenant,0,0,1
114,B,Lieutenant,0,0,0
111,W,Lieutenant,0,0,0


In [279]:
predicted_dict['Captain'].head()

Unnamed: 0,Race,Position,Top_k,Top_k_2y,Passed
32,H,Captain,0,0,0
19,W,Captain,0,0,1
2,W,Captain,1,1,1
21,B,Captain,0,0,0
5,H,Captain,0,0,1


## Adverse impact ratio for predicted data


In [280]:
def get_summary(key):
    passed = predicted_dict[key].groupby(['Race','Passed']).size().unstack().fillna(0)
    top_k = predicted_dict[key].groupby(['Race','Top_k']).size().unstack().fillna(0)
    top_k_2y = predicted_dict[key].groupby(['Race','Top_k_2y']).size().unstack().fillna(0)
    summary =  pd.DataFrame({
        'Race': ['B','H','W'], 
        'Pass': [passed[1]['B'],passed[1]['H'],passed[1]['W']],
        'Pass_pr': [0,0,0],
        'Fail': [passed[0]['B'],passed[0]['H'],passed[0]['W']],
        'Total': [0,0,0],
        'Top_k': [top_k[1]['B'],top_k[1]['H'],top_k[1]['W']],
        'Top_k_2y': [top_k_2y[1]['B'],top_k_2y[1]['H'],top_k_2y[1]['W']],
    })
    summary['Total'] = summary['Pass'] + summary['Fail']
    summary['Pass_pr'] = 100*summary['Pass']/summary['Total']
    summary.set_index(['Race'], inplace=True)
    return summary

#### Lieutenant

In [281]:
summary = get_summary('Lieutenant')
summary

      Pass    Pass_pr  Fail  Total  Top_k  Top_k_2y
Race                                               
B        2  40.000000     3      5    0.0       1.0
H        2  28.571429     5      7    0.0       0.0
W       10  52.631579     9     19    4.0       4.0


In [282]:
# For black
print('\nBlack:', summary['Pass_pr']['B']/summary['Pass_pr']['W'])

# For hispanic
print('\nHispanic:', summary['Pass_pr']['H']/summary['Pass_pr']['W'])


Black: 0.76

Hispanic: 0.5428571428571429


#### Captain

In [283]:
summary = get_summary('Captain')
summary

      Pass  Pass_pr  Fail  Total  Top_k  Top_k_2y
Race                                             
B        1     50.0     1      2    0.0       0.0
H        2     40.0     3      5    0.0       0.0
W        7     70.0     3     10    2.0       2.0


In [284]:
# For black
print('\nBlack:', summary['Pass_pr']['B']/summary['Pass_pr']['W'])

# For hispanic
print('\nHispanic:', summary['Pass_pr']['H']/summary['Pass_pr']['W'])


Black: 0.7142857142857143

Hispanic: 0.5714285714285714
