In [37]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Dropout, Activation
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
import seaborn as sns
import time

### Load data

The data lives in a MySQL database in my local environment, but it is loaded from CSV exports here for portability.

In [2]:
# Every exported table lives in the same folder and shares the same two key columns.
TABLE_PATH_TEMPLATE = '../table_csvs/{}_table.csv'
TABLE_INDEX = ['District_Id', 'Year']


def _read_table(table_name):
    """Read one exported table from CSV and index it by (District_Id, Year)."""
    return pd.read_csv(TABLE_PATH_TEMPLATE.format(table_name)).set_index(TABLE_INDEX)


# Budget data
functions_df = _read_table('functions')
programs_df = _read_table('programs')
revenue_df = _read_table('revenue')

# School data
teachers_df = _read_table('teachers')
enrollment_df = _read_table('enrollment')
classes_df = _read_table('classes')

# Evaluation data
test_scores_df = _read_table('test_scores')
dropout_rates_df = _read_table('dropout_rates')

# To inspect the columns and summary statistics of a table, evaluate one of
# these at a time as the last expression of a cell:
#   functions_df.describe().T       programs_df.describe().T
#   revenue_df.describe().T         teachers_df.describe().T
#   enrollment_df.describe().T      classes_df.describe().T
#   test_scores_df.describe().T     dropout_rates_df.describe().T

### Preprocess Data

Create new df for function expenditure per student 

In [3]:
# Put enrollment next to the function expenditures (both frames share the
# District_Id/Year index), divide every column by it row-wise, then discard
# the enrollment column itself.
functions_with_enrollment = pd.concat([functions_df, enrollment_df], axis=1)
fall_enrollment = functions_with_enrollment['Fall_Enrollment']
funct_per_student = (
    functions_with_enrollment
    .div(fall_enrollment, axis=0)
    .drop(columns=['Fall_Enrollment'])
)
funct_per_student.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Instruction_All_Funds,Instruction_General_Funds,Instructional_Res_Media_All_Funds,Instructional_Res_Media_General_Funds,Curriculum_Staff_Develop_All_Funds,Curriculum_Staff_Develop_General_Funds,Instructional_Leadership_All_Funds,Instructional_Leadership_General_Funds,School_Administration_All_Funds,School_Administration_General_Funds,...,Plant_Maintenance_Operation_All_Funds,Plant_Maintenance_Operation_General_Funds,Security_Monitoring_All_Funds,Security_Monitoring_General_Funds,Data_Processing_Services_All_Funds,Data_Processing_Services_General_Funds,Community_Services_All_Funds,Community_Services_General_Funds,Total_Expenditure_By_Function_All_Funds,Total_Expenditure_By_Function_General_Funds
District_Id,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1902,2005,4092.806507,3783.885274,103.359589,81.119863,1.919521,1.077055,9.458904,0.0,409.648973,407.044521,...,699.385274,693.847603,2.873288,2.873288,113.123288,112.255137,0.0,0.0,7169.780822,6448.922945
1902,2006,4229.204467,3854.426117,137.572165,135.142612,7.309278,2.637457,27.67354,0.0,420.606529,417.919244,...,853.317869,846.747423,4.790378,4.790378,115.632302,114.761168,0.0,0.0,7593.774914,6777.001718
1902,2007,4644.550088,4307.441125,136.209139,133.720562,8.732865,1.441125,7.029877,0.0,448.697715,448.697715,...,950.420035,950.420035,3.26362,3.26362,136.427065,136.427065,0.0,0.0,8247.968366,7495.776801
1902,2008,7083.765517,4727.293103,137.248276,93.82069,29.255172,7.881034,522.131034,0.0,445.346552,445.346552,...,1045.543103,954.832759,2.643103,2.643103,157.198276,157.198276,0.0,0.0,12788.051724,8023.825862
1902,2009,7393.742424,4988.651515,142.424242,101.846801,19.149832,2.149832,366.493266,0.0,446.126263,446.126263,...,1144.734007,1067.262626,1.464646,1.464646,157.636364,157.636364,0.0,0.0,12902.824916,8302.456229


Create new df for program expenditure per student 

In [4]:
# Put enrollment next to the program expenditures (both frames share the
# District_Id/Year index), divide every column by it row-wise, then discard
# the enrollment column itself.
programs_with_enrollment = pd.concat([programs_df, enrollment_df], axis=1)
fall_enrollment = programs_with_enrollment['Fall_Enrollment']
program_per_student = (
    programs_with_enrollment
    .div(fall_enrollment, axis=0)
    .drop(columns=['Fall_Enrollment'])
)
program_per_student.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Regular_All_Funds,Regular_General_Funds,Gifted_Talented_All_Funds,Gifted_Talented_General_Funds,Career_Technology_All_Funds,Career_Technology_General_Funds,Students_With_Disabilities_All_Funds,Students_With_Disabilities_General_Funds,Compensatory_Education_All_Funds,Compensatory_Education_General_Funds,...,Pre_K_Regular_All_Funds,Pre_K_Regular_General_Funds,Pre_K_Bilingual_All_Funds,Pre_K_Bilingual_General_Funds,Pre_K_Comp_Ed_All_Funds,Pre_K_Comp_Ed_General_Funds,Pre_K_Total_All_Funds,Pre_K_Total_General_Funds,Total_Program_Expenditures_All_Funds,Total_Program_Expenditures_General_Funds
District_Id,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1902,2005,3859.306507,3790.794521,17.511986,17.347603,161.032534,159.804795,510.321918,502.47089,371.265411,104.974315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7169.780822,6448.922945
1902,2006,3834.027491,3762.843643,17.798969,17.634021,168.982818,167.675258,588.969072,582.180412,525.262887,193.261168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7593.774914,6777.001718
1902,2007,4186.446397,4162.520211,17.249561,17.249561,201.260105,201.260105,630.653779,630.653779,551.328647,221.335677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8247.968366,7495.776801
1902,2008,4480.765517,4426.168966,17.682759,17.682759,194.972414,194.972414,4636.396552,672.124138,585.068966,310.767241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12788.051724,8023.825862
1902,2009,4782.83165,4702.267677,17.259259,17.259259,224.781145,224.781145,4313.175084,633.122896,614.392256,306.36532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12902.824916,8302.456229


Create new df for revenue per student

In [5]:
# Put enrollment next to the revenue columns (both frames share the
# District_Id/Year index), divide every column by it row-wise, then discard
# the enrollment column itself.
revenue_with_enrollment = pd.concat([revenue_df, enrollment_df], axis=1)
fall_enrollment = revenue_with_enrollment['Fall_Enrollment']
revenue_per_student = (
    revenue_with_enrollment
    .div(fall_enrollment, axis=0)
    .drop(columns=['Fall_Enrollment'])
)
revenue_per_student.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total_Revenue_All_Funds,Total_Revenue_General_Funds,Total_Federal_Revenue_All_Funds,Total_Federal_Revenue_General_Funds,Total_State_Revenue_All_Funds,Total_State_Revenue_General_Funds,Total_Local_Revenue_All_Funds,Total_Local_Revenue_General_Funds
District_Id,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1902,2005,9583.652397,8857.416096,410.268836,0.0,1341.828767,1234.905822,6462.398973,6462.398973
1902,2006,8880.910653,8057.941581,507.532646,0.0,1004.886598,898.350515,6604.07732,6604.07732
1902,2007,10726.945518,9933.328647,439.662566,0.0,2067.281195,1958.135325,7245.258348,7245.258348
1902,2008,16678.003448,10166.601724,3193.484483,0.0,4051.95,3181.434483,6278.191379,6278.191379
1902,2009,13931.47138,9493.080808,2887.185185,0.0,4163.927609,3283.112795,5587.818182,5587.818182


Combine budget data and test score data into one df
- Avg_Act has the fewest missing values, which is why it is used here
- Could have used program_per_student data here but funct_per_student provides a more general view of where money is spent

In [6]:
# Column of test_scores_df used as the prediction target.
# Other options include Avg_Sat, Above_Crit_Rate_Sat_Act, etc.
test_score_column = 'Avg_Act'

# Place the chosen score column to the right of the per-student function
# spending, then turn the (District_Id, Year) index back into ordinary columns.
score_frame = test_scores_df[[test_score_column]]
dataset = pd.concat([funct_per_student, score_frame], axis=1).reset_index()

Clean up the data

In [7]:
# Tunable limits for the cleaning step.
FIRST_YEAR = 2005           # rows before this year are discarded
LAST_YEAR = 2019            # last year a district is expected to report
MIN_SCORE_COVERAGE = .6     # minimum share of rows with a reported test score

# Number of distinct years a district must have to be kept (2005-2019 -> 15).
expected_year_count = len(range(FIRST_YEAR, LAST_YEAR + 1))

# Keep 2005 onwards only (the budget data does not cover earlier years).
dataset = dataset[dataset.Year >= FIRST_YEAR]

# Keep only districts that have a row for every expected year.
# `transform` broadcasts the per-district count back onto each row, so the
# filter is a single vectorised mask instead of one pass per district.
years_per_district = dataset.groupby('District_Id')['Year'].transform('nunique')
dataset = dataset[years_per_district == expected_year_count]

# Remove districts with less than 60% of test scores reported.
score_coverage = (
    dataset[test_score_column].notna()
    .groupby(dataset['District_Id'])
    .transform('mean')
)
# .copy() so the column assignment below writes to an independent frame
# rather than to a slice of the previous one.
dataset = dataset[score_coverage >= MIN_SCORE_COVERAGE].copy()

# Interpolate missing values within each district, never across districts.
# limit_direction='both' also fills gaps at the start and end of a district's
# series. Rows are interpolated in their existing order.
value_columns = [col for col in dataset.columns if col not in ('District_Id', 'Year')]
dataset[value_columns] = (
    dataset.groupby('District_Id')[value_columns]
    .transform(lambda series: series.interpolate(limit_direction='both'))
)

Choose which funds to look at:
- General Funds are funds that districts can spend in categories they choose
- All funds include general funds but also include funds that have to be used for specific purposes

In [90]:
# Which family of expenditure columns to model.
fund_suffix = '_General_Funds'
# fund_suffix = '_All_Funds'

# Expenditure functions left out of the feature set (given without the fund
# suffix so the list works for either family).
excluded_functions = [
    'Social_Work_Services',
    'Food',
    'Community_Services',
    'Health_Services',
    'Transportation',
    'Plant_Maintenance_Operation',
]

# Keep the identifiers, the chosen fund columns and the label. The label name
# comes from `test_score_column` so that changing the metric there does not
# silently drop it here. Iterating over dataset.columns preserves the
# original column order.
id_and_label_columns = {'District_Id', 'Year', test_score_column}
kept_columns = [
    col for col in dataset.columns
    if col in id_and_label_columns or col.endswith(fund_suffix)
]

#remove any other features
filtered_dataset = dataset[kept_columns].drop(
    columns=[name + fund_suffix for name in excluded_functions]
)

# Later cells slice by position: identifiers first, label last.
assert list(filtered_dataset.columns[:2]) == ['District_Id', 'Year']
assert filtered_dataset.columns[-1] == test_score_column

filtered_dataset.head()

Unnamed: 0,District_Id,Year,Instruction_General_Funds,Instructional_Res_Media_General_Funds,Curriculum_Staff_Develop_General_Funds,Instructional_Leadership_General_Funds,School_Administration_General_Funds,Guidance_Counseling_Services_General_Funds,Cocurricular_General_Funds,General_Administration_General_Funds,Security_Monitoring_General_Funds,Data_Processing_Services_General_Funds,Total_Expenditure_By_Function_General_Funds,Avg_Act
4,1902,2005,3783.885274,81.119863,1.077055,0.0,407.044521,227.037671,299.875,466.0,2.873288,112.255137,6448.922945,19.2
5,1902,2006,3854.426117,135.142612,2.637457,0.0,417.919244,230.134021,302.065292,488.630584,4.790378,114.761168,6777.001718,19.5
6,1902,2007,4307.441125,133.720562,1.441125,0.0,448.697715,246.123023,327.903339,527.41652,3.26362,136.427065,7495.776801,19.8
7,1902,2008,4727.293103,93.82069,7.881034,0.0,445.346552,254.143103,378.896552,534.712069,2.643103,157.198276,8023.825862,19.1
8,1902,2009,4988.651515,101.846801,2.149832,0.0,446.126263,252.594276,452.66835,435.112795,1.464646,157.636364,8302.456229,18.4


Break the dataset into windows so we can attempt to predict the next year's test score based on previous results and budget data features

In [91]:
def split_windows(df, window_size=4):
    """Cut a district/year table into fixed-length sliding windows.

    For every run of ``window_size`` consecutive years, and for every district,
    one sample of shape ``(window_size, n_columns)`` is produced. The value in
    the last column of the window's final year is taken as that sample's label
    and is overwritten with 1 in the inputs so it is not visible as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``District_Id`` and ``Year`` columns; the last column is
        treated as the label. Years are assumed to be consecutive, and every
        district must have exactly one row per year inside each window.
    window_size : int, default 4
        Number of consecutive years per sample.

    Returns
    -------
    inputs : numpy.ndarray, shape (n_samples, window_size, n_columns)
        Samples ordered by window start year, then by district.
    labels : numpy.ndarray, shape (n_samples,)
        The masked-out last-year value of the last column for each sample.

    Raises
    ------
    ValueError
        If some district does not have exactly ``window_size`` rows in a window.
    """
    n_columns = len(df.columns)
    inputs = np.empty((0, window_size, n_columns))
    labels = np.array([])

    years = df.Year.unique()
    if len(years) < window_size:
        # Not enough distinct years for even a single window.
        return inputs, labels

    # Make each district's rows contiguous and chronological so that a plain
    # reshape groups them correctly regardless of the caller's row order.
    ordered = df.sort_values(['District_Id', 'Year'], kind='mergesort')

    # Derive the first year from the data instead of assuming it is 2005.
    first_year = years.min()

    window_blocks = []
    label_blocks = []
    for offset in range(len(years) - window_size + 1):
        window_start = first_year + offset
        in_window = (ordered.Year >= window_start) & (ordered.Year < window_start + window_size)
        window_df = ordered[in_window]

        n_districts = window_df.District_Id.nunique()
        if len(window_df) != n_districts * window_size:
            raise ValueError(
                'every district needs exactly {} rows for years {}-{}'.format(
                    window_size, window_start, window_start + window_size - 1))

        # np.array() makes a fresh copy, so masking below never touches df.
        window_inputs = np.array(window_df).reshape(n_districts, window_size, n_columns)

        # Copy the labels out before hiding them from the inputs.
        label_blocks.append(window_inputs[:, -1, -1].copy())
        window_inputs[:, -1, -1] = 1
        window_blocks.append(window_inputs)

    # Concatenate once at the end rather than growing the arrays per window.
    inputs = np.concatenate([inputs] + window_blocks, axis=0)
    labels = np.concatenate([labels] + label_blocks)

    return inputs, labels

# Scale every non-identifier column into the [0, 1] range.
# NOTE(review): both scalers are fit on all rows, i.e. before the train/test
# split that happens in a later cell, so test-set ranges influence the scaling.
raw_values = np.array(filtered_dataset)

id_values = raw_values[:, :2]                     # District_Id, Year (left unscaled)
feature_values = raw_values[:, 2:-1]              # everything between ids and label
label_values = raw_values[:, -1].reshape(-1, 1)   # last column, as a 2-D column

# The label gets its own scaler so predictions can be mapped back later.
label_scaler = MinMaxScaler(feature_range=(0, 1))
scaled_labels = label_scaler.fit_transform(label_values)

feature_scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = feature_scaler.fit_transform(feature_values)

# Reassemble a frame with the original column names and order.
scaled_dataset = pd.DataFrame(
    np.hstack((id_values, scaled_features, scaled_labels)),
    columns=filtered_dataset.columns,
)

# Number of years per sample.
window = 8
inputs, labels = split_windows(scaled_dataset, window)

### Build the Model

Split train and test data

In [92]:
# Positional 60/40 split: the first 60% of the samples train the model and the
# remainder is held out for testing.
splits = (np.array([.6, 1]) * inputs.shape[0]).astype(int)
train_end, test_end = splits

# The first two columns of every sample are District_Id and Year; they are
# dropped here so that only budget features and the score column remain.
train_inputs = inputs[:train_end, :, 2:]
test_inputs = inputs[train_end:test_end, :, 2:]

train_labels = labels[:train_end]
test_labels = labels[train_end:test_end]

# Number of per-year features fed to the full model.
features = train_inputs.shape[2]

# Inputs made of past scores only: the last column, kept as a trailing axis of size 1.
scores_train_inputs = train_inputs[:, :, -1:]
scores_test_inputs = test_inputs[:, :, -1:]

# Inputs made of budget data only: every column except the last one.
budget_train_inputs = train_inputs[:, :, :-1]
budget_test_inputs = test_inputs[:, :, :-1]

Create model architecture

In [93]:
# Training hyper-parameters shared by every experiment.
EPOCHS = 10
BATCH_SIZE = 32

def create_model(window, features):
    """Build and compile a two-layer LSTM regressor.

    Parameters
    ----------
    window : int
        Number of time steps in each input sample.
    features : int
        Number of values per time step.

    Returns
    -------
    A compiled Keras ``Sequential`` model with a single output unit, using the
    Adam optimizer, mean-squared-error loss, and mean absolute error as an
    extra metric.
    """
    network = tf.keras.Sequential([
        # First recurrent layer returns the full sequence for the next LSTM.
        layers.LSTM(units=64, return_sequences=True, input_shape=(window, features)),
        layers.Dropout(0.2),
        # Further LSTM(64, return_sequences=True) + Dropout(0.2) pairs can be
        # stacked here.
        # Last recurrent layer collapses the sequence into one vector.
        layers.LSTM(units=80, return_sequences=False),
        layers.Dropout(0.2),
        layers.Dense(units=1),
    ])

    network.compile(
        optimizer=tf.optimizers.Adam(),
        loss=tf.losses.MeanSquaredError(),
        metrics=[tf.metrics.MeanAbsoluteError()],
    )

    return network

def _rmse(pairs):
    """Root-mean-squared error between column 0 (actual) and column 1 (predicted).

    Returns ``nan`` for an empty array instead of emitting a
    "Mean of empty slice" warning.
    """
    if len(pairs) == 0:
        return np.nan
    return np.sqrt(np.mean((pairs[:, 0] - pairs[:, 1]) ** 2))


def fit_predict(model, name, train_data, train_labels, test_data, test_labels,
                low_cutoff=15, high_cutoff=25):
    """Train ``model``, predict on the test set and summarise the error.

    Labels and predictions are mapped back to the original score units with
    the module-level ``label_scaler`` before the errors are computed. Training
    uses the module-level ``EPOCHS`` and ``BATCH_SIZE``.

    Parameters
    ----------
    model : object with ``fit`` and ``predict``
        Model to train; ``predict`` must return one column of predictions.
    name : str
        Row label of the returned one-row frame.
    train_data, train_labels
        Passed straight to ``model.fit``.
    test_data
        Passed to ``model.predict``.
    test_labels : numpy.ndarray
        Scaled true labels for ``test_data`` (one value per sample).
    low_cutoff, high_cutoff : number, default 15 and 25
        Band limits in original score units: "low" is ``actual < low_cutoff``,
        "high" is ``actual > high_cutoff``, "mid" is everything in between
        (both limits included).

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with columns ``'Total MSE'``, ``'High MSE'``,
        ``'Mid MSE'`` and ``'Low MSE'``. NOTE: despite the column names the
        values are *root* mean squared errors; the names are kept so existing
        result tables stay compatible. A band with no samples is ``nan``.

    Side effects
    ------------
    Prints the first five actual/predicted pairs overall, for the high band
    and for the low band.
    """
    model.fit(train_data, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)

    predictions = label_scaler.inverse_transform(model.predict(test_data))

    # One column of samples, the same layout the scaler was fit on
    # (rather than a single row that only works through broadcasting).
    actual = label_scaler.inverse_transform(test_labels.reshape(-1, 1))

    comp = pd.DataFrame(np.hstack((actual, predictions)), columns=['actual', 'predicted'])

    # A single combined mask for the middle band avoids chained boolean
    # indexing, which makes pandas warn about reindexing the second mask.
    high = comp[comp['actual'] > high_cutoff]
    mid = comp[(comp['actual'] >= low_cutoff) & (comp['actual'] <= high_cutoff)]
    low = comp[comp['actual'] < low_cutoff]

    total_error = _rmse(comp.to_numpy())
    high_error = _rmse(high.to_numpy())
    mid_error = _rmse(mid.to_numpy())
    low_error = _rmse(low.to_numpy())

    print(comp.head(5))
    print(high.head(5))
    print(low.head(5))

    return pd.DataFrame(
        data=[[total_error, high_error, mid_error, low_error]],
        index=[name],
        columns=['Total MSE', 'High MSE', 'Mid MSE', 'Low MSE'],
    )

Run model with all data

In [94]:
# Full experiment: train and evaluate on every feature column
# (budget columns plus the score column).
real_model = create_model(window=window, features=features)

real_results = fit_predict(
    model=real_model,
    name='Real',
    train_data=train_inputs,
    train_labels=train_labels,
    test_data=test_inputs,
    test_labels=test_labels,
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
   actual  predicted
0    15.4  16.728285



Run model with garbage data

In [95]:
# Noise baseline: train on uniform random inputs with the real training labels,
# then evaluate on the real test inputs.
garbage_model = create_model(window, features)

# Fixed seed so the random training inputs (and this baseline row) do not
# change from one notebook run to the next.
GARBAGE_SEED = 0
garbage_rng = np.random.RandomState(GARBAGE_SEED)

# Uniform values in [0, 1) with the same shape as the real training inputs.
rand_data = garbage_rng.rand(train_inputs.shape[0], train_inputs.shape[1], train_inputs.shape[2])
garbage_results = fit_predict(garbage_model, 'Garbage', rand_data, train_labels, test_inputs, test_labels)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
   actual  predicted
0    15.4  18.541382



Run model with score data only

In [96]:
# Scores-only experiment: each time step carries a single value, the score column.
scores_only_model = create_model(window=window, features=1)

scores_results = fit_predict(
    model=scores_only_model,
    name='Scores_Only',
    train_data=scores_train_inputs,
    train_labels=train_labels,
    test_data=scores_test_inputs,
    test_labels=test_labels,
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
   actual  predicted
0    15.4  17.104103



Run model with budget data only

In [97]:
# Budget-only experiment: the score column is excluded from the inputs, so the
# feature count is read from the budget-only array.
budget_feature_count = budget_train_inputs.shape[2]
budget_only_model = create_model(window=window, features=budget_feature_count)

budget_results = fit_predict(
    model=budget_only_model,
    name='Budget_Only',
    train_data=budget_train_inputs,
    train_labels=train_labels,
    test_data=budget_test_inputs,
    test_labels=test_labels,
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Bad argument number for Name: 4, expecting 3
   actual  predicted
0    15.4  19.125879



In [98]:
# Stack the one-row summaries of the four experiments into one comparison table.
experiment_summaries = [real_results, garbage_results, scores_results, budget_results]
results = pd.concat(experiment_summaries)
results

Unnamed: 0,Total MSE,High MSE,Mid MSE,Low MSE
Real,1.308936,3.069342,1.240489,2.469654
Garbage,2.507757,7.503552,2.303446,4.086674
Scores_Only,1.242136,2.872925,1.173338,2.661373
Budget_Only,1.982687,6.046416,1.804375,3.851373
