# Chronic Absenteeism Rate Prediction (CARP) Decision Tree Ensemble (DTE) Modeling

### Prepare the IBM Watson Studio environment: import and configure project utilities, and define a function to download project assets

In [1]:
# The code was removed by Watson Studio for sharing.

In [2]:
# function to retrieve project assets 
def download(project_file_name,project=None):    
    # get the file
    print("Attempting to get file {}".format(project_file_name))
    _bytes = project.get_file(project_file_name).read()
    
    # download the file
    print("Downloading...")
    
    with open(project_file_name, 'wb') as f: 
        f.write(bytearray(_bytes))
        print("Completed writing out file")
        
    return 0

### Import required modules

In [3]:
import numpy as np
import pandas as pd

import re

import pickle

from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold

### Download datasets and recreate data created with CARP-ETL notebook

In [4]:
abs_file_name = 'la_county_2018_chronic_absence_rates_with_predictor_variables.csv'
download(abs_file_name,project)
abs_18 = pd.read_csv(abs_file_name, index_col='Tract')
np.set_printoptions(linewidth=250)
ranked_corr_file_name = 'ranked_correlates.csv'
download(ranked_corr_file_name,project)
ranked_corr = pd.read_csv(ranked_corr_file_name,header=None,names=['Correlate'])
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 220)
print('\n***Prepared Data***')
print(abs_18.head())
print('\n***Top Correlates***')
print(ranked_corr.head())

Attempting to get file la_county_2018_chronic_absence_rates_with_predictor_variables.csv
Downloading...
Completed writing out file
Attempting to get file ranked_correlates.csv
Downloading...
Completed writing out file

***Prepared Data***
        Year    Percent  Count  Total  Tract_Nbr   Income  Pct_HS  Pct_Bach  Pct_Eng  Pct_White  Pct_Black  Pct_Native  Pct_Asian  Pct_Pac_Isl  Pct_Other  Pct_Mixed  Pct_LF_Part  Pct_EP_Ratio  Pct_Unemp  Pct_Dis_0-18  \
Tract                                                                                                                                                                                                                      
101110  2018  11.076923     36    325     101110  51209.0    78.6      21.5     78.3  77.489177   2.040816    0.000000   4.761905          0.0  12.059369   3.648732         62.7          56.6        9.6      3.364486   
101122  2018   9.251102     21    227     101122  85460.0    91.8      25.7     88.7  86.359901   0.0

### Define and run decision tree ensemble model

In [5]:
abs_18[abs_18.isnull().any(axis=1)].shape

(0, 44)

In [9]:
# Select variables most highly correlated with target for model fitting -- this eliminates some noise
input_dim = -1 #use all predictor variables, since decision 
variables = list(ranked_corr[:input_dim]['Correlate'].values)

# Define predictor and target while capturing mean and standard deviation to reinflate scaled data

X = abs_18[variables]
X_mean,X_std = X.mean(),X.std()
y = abs_18['Percent']
y_mean,y_std = y.mean(),y.std()



# Set parameters for iterative model fitting and prediction

target_iterations = 5

# Define lists to accumulate the results of each iteration

train_scores=[]
test_scores=[]
X_tests=[]
y_tests=[]
y_test_preds=[]

# Define ensemble decision tree regression model
estimator=AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),n_estimators=600, learning_rate=1.0, loss='square')

# Iterate model fitting and prediction specified number of times while capturing results for model evaluation
total_test_score = 0
for iteration in range (target_iterations):
    
    iteration += 1
    print(f'\nIteration {iteration}:\n')
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=iteration)

    estimator.fit(X_train, y_train)
    y_train_pred=estimator.predict(X_train)
    y_test_pred=estimator.predict(X_test)

    train_score = metrics.r2_score(y_train,y_train_pred)
    test_score = metrics.r2_score(y_test,y_test_pred)
    print('Train r2 Score:', train_score,'Test r2 Score:', test_score)
        
    X_tests.append(X_test)
    y_tests.append(y_test)
    y_test_preds.append(y_test_pred)
    train_scores.append(train_score)
    test_scores.append(test_score)
    
    total_test_score += test_score

print ('\nAverage test score:',total_test_score / target_iterations)


Iteration 1:

Train r2 Score: 0.9960272074600577 Test r2 Score: 0.5983189235396512

Iteration 2:

Train r2 Score: 0.9960074735770638 Test r2 Score: 0.6516384462238736

Iteration 3:

Train r2 Score: 0.9961653638237584 Test r2 Score: 0.6601956131029549

Iteration 4:

Train r2 Score: 0.9958769941537481 Test r2 Score: 0.66127481116916

Iteration 5:

Train r2 Score: 0.9955830601824529 Test r2 Score: 0.5966527360082517

Average test score: 0.6336161060087784


### Save model results as project assets for evaluation by CARP-EVAL notebook

In [7]:
dte_results_file_name = 'dte_results.p'
dte_results=dict(X_mean=X_mean, X_std=X_std, X_tests=X_tests, y_tests=y_tests,y_test_preds=y_test_preds, train_scores=train_scores, test_scores=test_scores)
pickled_dte_results = pickle.dumps(dte_results)
project.save_data(dte_results_file_name, pickled_dte_results, set_project_asset=True, overwrite=True)

{'file_name': 'dte_results.p',
 'message': 'File saved to project storage.',
 'bucket_name': 'iverpyspark-donotdelete-pr-ysp8udweullapt',
 'asset_id': '9f673bff-b75b-455a-833f-06391a602df2'}