# Chronic Absenteeism Rate Prediction (CARP) Deep Neural Network (DNN) Modeling

### Make preparations specific to IBM Watson Studio:  import and configure project utilities, define function to download project assets

In [1]:
# The code was removed by Watson Studio for sharing.

In [2]:
# function to retrieve project assets 
def download(project_file_name,project=None):
    """Copy a project asset to a local file with the same name.

    Parameters
    ----------
    project_file_name : str
        Name of the asset to request from ``project``; it is also used as the
        local path that the bytes are written to.
    project : object
        Object exposing ``get_file(name)`` whose result has a ``read()`` method.
        The ``None`` default is kept for signature compatibility only; a real
        project object is required.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If ``project`` is None.
    """
    # Fail early with a clear message instead of an AttributeError on None.get_file
    if project is None:
        raise ValueError(
            "download() needs a project object to retrieve '{}'".format(project_file_name))

    # get the file
    print("Attempting to get file {}".format(project_file_name))
    _bytes = project.get_file(project_file_name).read()

    # download the file
    print("Downloading...")

    with open(project_file_name, 'wb') as f:
        f.write(bytearray(_bytes))

    # Report success only after the with-block has closed the file
    print("Completed writing out file")

    return 0

### Import required modules

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import pickle

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, ShuffleSplit

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


### Download datasets and recreate data created with CARP-ETL notebook

In [4]:
# Widen console output first so the previews printed below are not wrapped or truncated
np.set_printoptions(linewidth=250)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 220)

# Prepared absenteeism data, indexed by its 'Tract' column
abs_file_name = 'la_county_2018_chronic_absence_rates_with_predictor_variables.csv'
download(abs_file_name, project)
abs_18 = pd.read_csv(abs_file_name, index_col='Tract')

# Ranked correlates: a header-less file read into a single 'Correlate' column
ranked_corr_file_name = 'ranked_correlates.csv'
download(ranked_corr_file_name, project)
ranked_corr = pd.read_csv(ranked_corr_file_name, header=None, names=['Correlate'])

# Preview the first rows of both frames
print('\n***Prepared Data***', abs_18.head(), sep='\n')
print('\n***Top Correlates***', ranked_corr.head(), sep='\n')

Attempting to get file la_county_2018_chronic_absence_rates_with_predictor_variables.csv
Downloading...
Completed writing out file
Attempting to get file ranked_correlates.csv
Downloading...
Completed writing out file

***Prepared Data***
        Year    Percent  Count  Total  Tract_Nbr   Income  Pct_HS  Pct_Bach  Pct_Eng  Pct_White  Pct_Black  Pct_Native  Pct_Asian  Pct_Pac_Isl  Pct_Other  Pct_Mixed  Pct_LF_Part  Pct_EP_Ratio  Pct_Unemp  Pct_Dis_0-18  \
Tract                                                                                                                                                                                                                      
101110  2018  11.076923     36    325     101110  51209.0    78.6      21.5     78.3  77.489177   2.040816    0.000000   4.761905          0.0  12.059369   3.648732         62.7          56.6        9.6      3.364486   
101122  2018   9.251102     21    227     101122  85460.0    91.8      25.7     88.7  86.359901   0.0

### Define and run deep learning model

In [5]:
# Define deep neural network
# input_dim is both the number of predictor columns selected from the ranked correlates
# and the default width of the network's first layer (see nn_model)
input_dim = 25 # define how many of the ranked correlates to use as predictor variables
# kernel initializer name passed to every Dense layer built by nn_model
ki='normal'

def nn_model(layer_1_dim=input_dim):
    """Build and compile the fully connected regression network.

    The first Dense layer has ``layer_1_dim`` units and expects ``layer_1_dim``
    inputs. Two further relu layers follow, each sized ``(previous // 3) * 2 + 1``,
    and a single-unit Dense layer with no explicit activation produces the output.
    All layers use the module-level ``ki`` kernel initializer. The model is
    compiled with mean squared error loss and Adam(lr=0.01, decay=0.001).

    Parameters
    ----------
    layer_1_dim : int
        Number of inputs and width of the first layer (defaults to ``input_dim``).

    Returns
    -------
    The compiled Sequential model.
    """
    def narrowed(width):
        # roughly two thirds of the previous layer's width, never less than 1
        return (width // 3) * 2 + 1

    layer_2_dim = narrowed(layer_1_dim)
    layer_3_dim = narrowed(layer_2_dim)

    network = Sequential()
    network.add(Dense(layer_1_dim, input_dim=layer_1_dim, kernel_initializer=ki, activation='relu'))
    for hidden_width in (layer_2_dim, layer_3_dim):
        network.add(Dense(hidden_width, kernel_initializer=ki, activation='relu'))
    network.add(Dense(1, kernel_initializer=ki))

    optimizer = Adam(lr=0.01, decay=0.001)
    network.compile(loss='mean_squared_error', optimizer=optimizer)
    return network

# Select variables most highly correlated with target for model fitting -- this eliminates some noise
variables = list(ranked_corr[:input_dim]['Correlate'].values)


# Define predictor and target while capturing mean and standard deviation to reinflate scaled data

X = abs_18[variables]
X_mean,X_std = X.mean(),X.std()
y = abs_18['Percent']
y_mean,y_std = y.mean(),y.std()

# Fit one scaler per array. fit() returns the scaler instance itself, so fitting a single
# shared instance on X and then on y would leave both names bound to the same object,
# fitted on y only, and X would then be transformed with y's statistics.
X_scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
y_scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(y.values.reshape(-1, 1))

X_scaled = X_scaler.transform(X)

y_scaled = y_scaler.transform(y.values.reshape(-1, 1))

# Set parameters for iterative model fitting and prediction

target_iterations = 5
max_epochs=40
min_epochs=25
max_loss = .62
max_fit_attempts = 50  # upper bound on re-fits per iteration so the retry loop always terminates

# Define lists to accumulate the results of each iteration

train_scores=[]
test_scores=[]
X_tests=[]
y_tests=[]
y_test_preds=[]


# Define estimator

# Stop a fit early once the training loss has not improved by min_delta for `patience` epochs
early_stop = EarlyStopping(monitor='loss', min_delta=0.0005, patience=5, verbose=1, mode='min')

estimator = KerasRegressor(build_fn=nn_model, epochs=max_epochs, batch_size=12, verbose=1, callbacks=[early_stop])

# Iterate model fitting and prediction specified number of times while capturing results for model evaluation

# Iterations are numbered from 1; the number doubles as the random_state of the split
for iteration in range(1, target_iterations + 1):

    print(f'\nIteration {iteration}:\n')

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, shuffle=True, random_state=iteration)

    # Re-fit until a fit is accepted: it either ran all max_epochs, or it ran at least
    # min_epochs and finished with a training loss no greater than max_loss.
    # NOTE(review): the Keras scikit-learn wrapper appears to rebuild the model through
    # build_fn on every fit(), so each attempt would start from a fresh initialization -- confirm.
    for attempt in range(1, max_fit_attempts + 1):
        nn_history = estimator.fit(X_train,y_train)
        completed_epochs = len(nn_history.history['loss'])
        loss = nn_history.history['loss'][-1]
        if completed_epochs >= max_epochs or (completed_epochs >= min_epochs and loss <= max_loss):
            break
    else:
        # No attempt met the criteria; continue with the last fit rather than looping forever
        print(f'Warning: no fit met the acceptance criteria after {max_fit_attempts} attempts; keeping the last fit.')

    y_train_pred=estimator.predict(X_train)
    y_test_pred=estimator.predict(X_test)

    train_score = metrics.r2_score(y_train,y_train_pred)
    test_score = metrics.r2_score(y_test,y_test_pred)
    print('Train r2 Score:', train_score,'Test r2 Score:', test_score)

    # Keep the held-out data, predictions and scores of this iteration for later evaluation
    X_tests.append(X_test)
    y_tests.append(y_test)
    y_test_preds.append(y_test_pred)
    train_scores.append(train_score)
    test_scores.append(test_score)


Iteration 1:

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train r2 Score: 0.49725528115137596 Test r2 Score: 0.4502929174522652

Iteration 2:

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 00011: early stopping
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 00007: early stopping
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/4

### Save modeling results as project assets for evaluation by CARP-EVAL notebook

In [7]:

nn_results_file_name = 'nn_results.p'


def _reinflate(fitted_scaler, scaled_splits):
    """Undo scaling one split at a time and stack the results into a single array.

    Each split is passed to ``fitted_scaler.inverse_transform`` as a 2-D array
    (1-D splits are reshaped to a single column) and then restored to its own
    shape, so the stacked result has the same shape as inverse-transforming the
    whole list at once would produce on a scaler that accepts such input.
    """
    restored = []
    for split in scaled_splits:
        split = np.asarray(split)
        as_2d = split.reshape(-1, 1) if split.ndim == 1 else split
        restored.append(fitted_scaler.inverse_transform(as_2d).reshape(split.shape))
    return np.array(restored)


# The accumulated lists hold one array per iteration. Handing a whole list to
# inverse_transform yields 3-D (or, for predictions, non-column) input, which
# only works when the scaler does not validate input dimensionality.
nn_results=dict(X_mean=X_mean, X_std=X_std, X_tests=_reinflate(X_scaler, X_tests), y_tests=_reinflate(y_scaler, y_tests),
                y_test_preds=_reinflate(y_scaler, y_test_preds), train_scores=train_scores, test_scores=test_scores)

# Serialize the results and store them as a project asset, replacing any earlier copy
pickled_nn_results = pickle.dumps(nn_results)
project.save_data(nn_results_file_name, pickled_nn_results, set_project_asset=True, overwrite=True)

{'file_name': 'nn_results.p',
 'message': 'File saved to project storage.',
 'bucket_name': 'iverpyspark-donotdelete-pr-ysp8udweullapt',
 'asset_id': '0d1bd654-82cf-4396-b581-753af5458440'}