# Random Forest Models

**J - This notebook contains random forest models. There are four target variables - percentage ks2 expectations, percentage ks4 expectations, percentage substance misuse and percentage convictions. All of these are modelled individually, one after the other.**

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install imbalanced-learn
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint
from sklearn.tree import export_graphviz
import graphviz
from IPython.display import Image



# Reading in data

In [None]:
# Load the prepared dataset, storing the local-authority name as a
# categorical and the reporting year as an integer.
data = pd.read_csv("final_data.csv").assign(
    la_name=lambda frame: pd.Categorical(frame['la_name']),
    reporting_year=lambda frame: frame['reporting_year'].astype(int),
)
data.head()

Unnamed: 0.1,Unnamed: 0,la_code,reporting_year,academic_year,region_code,region_name,la_name,pt_ks2_expectations,pt_ks4_expectations,avg_p8score,num_convictions,num_placed_inside_la,pt_convictions,pt_placed_inside_la,num_substance,num_private_provis,pt_substance,pt_private_provis,num_3_more_placemts,pt_3_more_placemts,inspection_date,ofsted_overall,ofsted_leaders,ofsted_help_protection,ofsted_care,cla_budget,number_in_care,number_in_care_filled,num_convictions_filled,num_placed_inside_la_filled,num_substance_filled,num_private_provis_filled,num_3_more_placemts_filled,pt_convictions_filled,pt_placed_inside_la_filled,pt_substance_filled,pt_private_provis_filled,pt_3_more_placemts_filled,pt_ks2_expectations_filled,pt_ks4_expectations_filled,avg_p8score_filled,per_child_spend,cla_budget_filled,ofsted_overall_filled,ofsted_leaders_filled,ofsted_care_filled,ofsted_help_protection_filled,budget_per_child
0,729,E09000002,2018,201718.0,E13000002,Outer London,Barking and Dagenham,43.0,,-1.12,11.0,153.0,5.0,37.0,0.0,138.0,0.0,34.0,44.0,11.0,,,,,,21178.0,409.0,409.0,11.0,153.0,0.0,138.0,44.0,5.0,37.0,0.0,34.0,11.0,43.0,26.04,-1.12,51.779951,21178.0,2.0,2.0,2.0,2.0,51.779951
1,580,E09000002,2019,201819.0,E13000002,Outer London,Barking and Dagenham,59.0,28.6,-0.74,7.0,163.0,3.0,39.0,,129.0,,31.0,41.0,10.0,,,,,,20019.0,414.0,414.0,7.0,163.0,1.0,129.0,41.0,3.0,39.0,0.241546,31.0,10.0,59.0,28.6,-0.74,48.355072,20019.0,2.0,2.0,2.0,2.0,48.355072
2,431,E09000002,2020,201920.0,E13000002,Outer London,Barking and Dagenham,,37.9,,,155.0,,39.0,,137.0,,34.0,36.0,9.0,,,,,,19372.0,402.0,402.0,3.0,155.0,6.0,137.0,36.0,0.746269,39.0,1.492537,34.0,9.0,48.777778,37.9,-1.315192,48.189055,19372.0,2.0,2.0,2.0,2.0,48.189055
3,280,E09000002,2021,202021.0,E13000002,Outer London,Barking and Dagenham,,30.8,,,147.0,,38.0,0.0,144.0,0.0,37.0,43.0,11.0,,,,,,18399.0,391.0,391.0,5.0,147.0,0.0,144.0,43.0,1.278772,38.0,0.0,37.0,11.0,48.777778,30.8,-1.315192,47.056266,18399.0,2.0,2.0,2.0,2.0,47.056266
4,130,E09000002,2022,202122.0,E13000002,Outer London,Barking and Dagenham,50.0,,-1.29,,163.0,,39.0,0.0,153.0,0.0,37.0,52.0,13.0,10/07/2023,2.0,2.0,2.0,2.0,21009.0,413.0,413.0,4.0,163.0,0.0,153.0,52.0,0.968523,39.0,0.0,37.0,13.0,50.0,31.557143,-1.29,50.869249,21009.0,2.0,2.0,2.0,2.0,50.869249


# Scaling & checking categorical variables

In [None]:
# Min-max scale the listed numeric columns so each spans the 0-1 range.
# NOTE(review): the scaler is fitted on every row before the train/test
# split made later in the notebook, so held-out rows influence the scaling
# (the target columns are scaled here too). Consider fitting the scaler on
# the training rows only.

scaler = MinMaxScaler()
data_scaled = data.copy()
scalable_columns = ['number_in_care_filled', 'budget_per_child', 'pt_ks2_expectations', 'pt_ks4_expectations', 'avg_p8score', 'pt_convictions', 'pt_placed_inside_la', 'pt_substance', 'pt_private_provis', 'pt_3_more_placemts', 'pt_convictions_filled', 'pt_placed_inside_la_filled', 'pt_substance_filled', 'pt_private_provis_filled', 'pt_3_more_placemts_filled', 'pt_ks2_expectations_filled', 'pt_ks4_expectations_filled', 'avg_p8score_filled', 'per_child_spend']
data_scaled[scalable_columns] = scaler.fit_transform(data_scaled[scalable_columns])
print(data_scaled[scalable_columns].describe())


# Make sure the categorical variables are stored with the category dtype.

categorical_columns = ['la_name', 'la_code', 'ofsted_overall', 'ofsted_leaders', 'ofsted_care', 'ofsted_help_protection']
for column in categorical_columns:
    data_scaled[column] = data_scaled[column].astype('category')


# Separate the imputed ("_filled") columns from the original columns.

#will need to ensure we include budget_per_child filled once available
filled_cols = ['number_in_care_filled', 'region_name', 'ofsted_leaders_filled', 'per_child_spend','ofsted_care_filled', 'ofsted_help_protection_filled','la_name', 'reporting_year','ofsted_overall_filled' ,'pt_convictions_filled', 'pt_placed_inside_la_filled', 'pt_substance_filled', 'pt_private_provis_filled', 'pt_3_more_placemts_filled', 'pt_ks2_expectations_filled', 'pt_ks4_expectations_filled', 'avg_p8score_filled']
original_cols = ['number_in_care', 'region_name','la_name', 'reporting_year', 'ofsted_overall','ofsted_care', 'ofsted_help_protection', 'ofsted_leaders', 'budget_per_child', 'pt_ks2_expectations', 'pt_ks4_expectations', 'avg_p8score', 'pt_convictions', 'pt_placed_inside_la', 'pt_substance', 'pt_private_provis', 'pt_3_more_placemts']

# Take explicit copies and reset the index by reassignment rather than with
# inplace=True on a frame derived from data_scaled; the result is the same
# 0..n-1 index, without mutating a derived selection in place.
filleddata = data_scaled[filled_cols].copy().reset_index(drop=True)
originaldata = data_scaled[original_cols].copy().reset_index(drop=True)

       number_in_care_filled  budget_per_child  pt_ks2_expectations  \
count             750.000000        750.000000           229.000000   
mean                0.246364          0.352640             0.338638   
std                 0.165839          0.126835             0.173686   
min                 0.000000          0.000000             0.000000   
25%                 0.140736          0.260079             0.210526   
50%                 0.202375          0.339699             0.315789   
75%                 0.303919          0.425106             0.438596   
max                 1.000000          1.000000             1.000000   

       pt_ks4_expectations  avg_p8score  pt_convictions  pt_placed_inside_la  \
count           469.000000   431.000000      482.000000           747.000000   
mean              0.449708     0.605523        0.237621             0.613078   
std               0.146512     0.138589        0.156554             0.189203   
min               0.000000     0.000000 

# **1) Target - pt_ks2_expectations filled**

Creating Data Frame, setting target, splitting data

In [None]:
# Predictor columns used by the models, copied out of the imputed data.
predictor_columns = [
    'pt_placed_inside_la_filled', 'pt_private_provis_filled', 'pt_3_more_placemts_filled',
    'per_child_spend', 'reporting_year', 'number_in_care_filled',
    'ofsted_overall_filled', 'ofsted_leaders_filled',
    'ofsted_help_protection_filled', 'ofsted_care_filled',
]
df = filleddata[predictor_columns].copy()

In [None]:
# Attach the (scaled, imputed) KS2 expectations column as the modelling target.
df = df.assign(target=filleddata['pt_ks2_expectations_filled'])

In [None]:
# 75/25 train/test split with a fixed seed, then separate the predictors
# from the target column.
feature_columns = [
    'pt_placed_inside_la_filled', 'pt_private_provis_filled', 'pt_3_more_placemts_filled',
    'per_child_spend', 'reporting_year', 'number_in_care_filled',
    'ofsted_overall_filled', 'ofsted_leaders_filled',
    'ofsted_help_protection_filled', 'ofsted_care_filled',
]
train, test = train_test_split(df, test_size=0.25, random_state=7)
X_train, X_test = train[feature_columns], test[feature_columns]
y_train, y_test = train['target'], test['target']

Initial model

In [None]:
# Baseline forest with default hyperparameters. The seed is fixed so the
# baseline errors computed from it are the same on every run.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Baseline error on the rows the forest was fitted on.
y_pred_train = reg.predict(X_train)
mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

0.03308173403665586

In [None]:
# Baseline error on the held-out rows.
y_pred_test = reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

0.08354686342698993

In [None]:
# Names of the ten predictor columns, in the order they are passed to the models.
input_features = [
    'pt_placed_inside_la_filled',
    'pt_private_provis_filled',
    'pt_3_more_placemts_filled',
    'per_child_spend',
    'reporting_year',
    'number_in_care_filled',
    'ofsted_overall_filled',
    'ofsted_leaders_filled',
    'ofsted_help_protection_filled',
    'ofsted_care_filled',
]

RandomizedSearchCV

In [None]:
# Broad random search over the forest hyperparameters:
# 100 sampled candidates, each scored with 5-fold CV on negative MSE.
param_distributions = {
    'n_estimators': randint(100, 1000),
    # 1.0 (use every feature) replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': [1.0, 'sqrt'],
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 50),
    'bootstrap': [True, False]
}

# Seed the estimator as well: random_state on the search only fixes which
# candidates are drawn, not the randomness inside each forest.
rf_regressor = RandomForestRegressor(random_state=42)

rnd_search_cv = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_distributions,
                                   n_iter=100,
                                   cv=5, scoring='neg_mean_squared_error',
                                   random_state=42, n_jobs=-1)

rnd_search_cv.fit(X_train, y_train)

print("Best parameters found:", rnd_search_cv.best_params_)


  warn(


Best parameters found: {'bootstrap': True, 'max_depth': 24, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 620}


First round of GridSearchCV

In [None]:
# Narrow grid centred on the best candidate from the randomized search.
param_grid_1 = dict(
    bootstrap=[True],
    max_depth=[18, 24, 30],
    max_features=[1.0],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[6, 8, 10],
    n_estimators=[600, 620, 640],
)

model = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search scored on negative MSE.
grid_search_1 = GridSearchCV(
    estimator=model,
    param_grid=param_grid_1,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_1.fit(X_train, y_train)

print("Best parameters from round 1:", grid_search_1.best_params_)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters from round 1: {'bootstrap': True, 'max_depth': 18, 'max_features': 1.0, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 640}


Second round of GridSearchCV

In [None]:
# Finer grid around the round-1 winner.
# The stray trailing comma after the closing brace has been removed: it
# turned param_grid_2 into a one-element tuple instead of a dict.
param_grid_2 = {
    'bootstrap': [True],
    'max_depth': [16, 18, 20],
    'max_features': [1.0],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [5, 6, 7],
    'n_estimators': [625, 640, 655]
}

model = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search scored on negative MSE.
grid_search_2 = GridSearchCV(estimator=model, param_grid=param_grid_2, cv=5,
                             scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

grid_search_2.fit(X_train, y_train)

print("Best parameters from round 2:", grid_search_2.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters from round 2: {'bootstrap': True, 'max_depth': 16, 'max_features': 1.0, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 625}


Fitting model to final parameters

In [None]:
# Refit on the training split using the parameters reported by the second
# grid search.
final_params = {
    'bootstrap': True,
    'max_depth': 16,
    'max_features': 1.0,
    'min_samples_leaf': 4,
    'min_samples_split': 5,
    'n_estimators': 625,
    'random_state': 42,
}
optimized_rf = RandomForestRegressor(**final_params)

optimized_rf.fit(X_train, y_train)

MSE, RMSE, standardised RMSE, feature importances

In [None]:
# Held-out predictions from the tuned forest.
y_pred = optimized_rf.predict(X_test)

# Note the units: the MSE is multiplied by 10000 (100 squared), while the
# MAE is reported on the scaled target's own 0-1 range.
test_mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred) * 10000
print("Mean Absolute Error:", test_mae)
print("Mean Squared Error:", mse)

Mean Absolute Error: 0.08350755262540153
Mean Squared Error: 139.7079145207643


In [None]:
# RMSE from the x10000 MSE, then divided by the spread of the x100 test
# target to give a unit-free error.
rmse = np.sqrt(mse)
target_spread = np.std(y_test * 100)
standardised_rmse = rmse / target_spread
print("RMSE:", rmse)
print("standardised RMSE", standardised_rmse)

RMSE: 11.819810257392641
standardised RMSE 0.7707110135355077


In [None]:
# Pair each predictor name with its importance, then show the importances
# from largest to smallest.
raw_importances = optimized_rf.feature_importances_
features_and_importances = list(zip(input_features, raw_importances))

feature_importances = pd.Series(
    optimized_rf.feature_importances_, index=X_train.columns
).sort_values(ascending=False)

print(feature_importances)

# Not used by any later cell visible in this notebook.
sorted_indices = np.argsort(feature_importances)[::-1]


number_in_care_filled            0.284843
pt_placed_inside_la_filled       0.220099
reporting_year                   0.165607
per_child_spend                  0.131583
pt_private_provis_filled         0.082672
pt_3_more_placemts_filled        0.045725
ofsted_leaders_filled            0.025882
ofsted_care_filled               0.023696
ofsted_help_protection_filled    0.010456
ofsted_overall_filled            0.009438
dtype: float64


# **2) Target - pt_ks4_expectations filled**

Adjusting target and splitting data

In [None]:
# Swap the target over to the (scaled, imputed) KS4 expectations column.
df = df.assign(target=filleddata['pt_ks4_expectations_filled'])
df.head()

Unnamed: 0,pt_placed_inside_la_filled,pt_private_provis_filled,pt_3_more_placemts_filled,per_child_spend,reporting_year,number_in_care_filled,ofsted_overall_filled,ofsted_leaders_filled,ofsted_help_protection_filled,ofsted_care_filled,target
0,0.425287,0.339797,0.275,0.29344,2018,0.189549,2.0,2.0,2.0,2.0,0.456042
1,0.448276,0.307328,0.25,0.270335,2019,0.191924,2.0,2.0,2.0,2.0,0.500876
2,0.448276,0.339797,0.225,0.269215,2020,0.186223,2.0,2.0,2.0,2.0,0.663748
3,0.436782,0.372266,0.275,0.261573,2021,0.180998,2.0,2.0,2.0,2.0,0.539405
4,0.448276,0.372266,0.325,0.287296,2022,0.191449,2.0,2.0,2.0,2.0,0.552664


In [None]:
# 75/25 train/test split with a fixed seed, then separate the predictors
# from the target column.
feature_columns = [
    'pt_placed_inside_la_filled', 'pt_private_provis_filled', 'pt_3_more_placemts_filled',
    'per_child_spend', 'reporting_year', 'number_in_care_filled',
    'ofsted_overall_filled', 'ofsted_leaders_filled',
    'ofsted_help_protection_filled', 'ofsted_care_filled',
]
train, test = train_test_split(df, test_size=0.25, random_state=7)
X_train, X_test = train[feature_columns], test[feature_columns]
y_train, y_test = train['target'], test['target']

Initial model

In [None]:
# Baseline forest with default hyperparameters. The seed is fixed so the
# baseline errors computed from it are the same on every run.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Baseline error on the rows the forest was fitted on.
y_pred_train = reg.predict(X_train)
mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

0.032129524232452994

In [None]:
# Baseline error on the held-out rows.
y_pred_test = reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

0.07575134204853311

RandomizedSearchCV

In [None]:
# Broad random search over the forest hyperparameters:
# 100 sampled candidates, each scored with 5-fold CV on negative MSE.
param_distributions = {
    'n_estimators': randint(100, 1000),
    # 1.0 (use every feature) replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': [1.0, 'sqrt'],
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 50),
    'bootstrap': [True, False]
}

# Seed the estimator as well: random_state on the search only fixes which
# candidates are drawn, not the randomness inside each forest.
rf_regressor = RandomForestRegressor(random_state=42)

rnd_search_cv = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_distributions,
                                   n_iter=100,
                                   cv=5, scoring='neg_mean_squared_error',
                                   random_state=42, n_jobs=-1)

rnd_search_cv.fit(X_train, y_train)

print("Best parameters found:", rnd_search_cv.best_params_)


  warn(


Best parameters found: {'bootstrap': True, 'max_depth': 44, 'max_features': 'auto', 'min_samples_leaf': 17, 'min_samples_split': 9, 'n_estimators': 704}


First round of GridSearchCV

In [None]:
# Narrow grid centred on the best candidate from the randomized search.
param_grid_1 = dict(
    bootstrap=[True],
    max_depth=[40, 44, 48],
    max_features=[1.0],
    min_samples_leaf=[15, 17, 19],
    min_samples_split=[7, 9, 11],
    n_estimators=[695, 704, 715],
)

model = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search scored on negative MSE.
grid_search_1 = GridSearchCV(
    estimator=model,
    param_grid=param_grid_1,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_1.fit(X_train, y_train)

print("Best parameters found:", grid_search_1.best_params_)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found: {'bootstrap': True, 'max_depth': 40, 'max_features': 1.0, 'min_samples_leaf': 15, 'min_samples_split': 7, 'n_estimators': 704}


Second round of GridSearchCV

In [None]:
# Finer grid around the round-1 winner. Reuses the `model` estimator
# created in the round-1 cell.
param_grid_2 = dict(
    bootstrap=[True],
    max_depth=[38, 40, 42],
    max_features=[1.0],
    min_samples_leaf=[14, 15, 16],
    min_samples_split=[6, 7, 8],
    n_estimators=[700, 704, 708],
)

grid_search_2 = GridSearchCV(
    estimator=model,
    param_grid=param_grid_2,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_2.fit(X_train, y_train)

print("Best parameters from round 2:", grid_search_2.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters from round 2: {'bootstrap': True, 'max_depth': 38, 'max_features': 1.0, 'min_samples_leaf': 14, 'min_samples_split': 6, 'n_estimators': 700}


Final model on best parameters

In [None]:
# Refit on the training split using the parameters reported by the second
# grid search.
final_params = {
    'bootstrap': True,
    'max_depth': 38,
    'max_features': 1.0,
    'min_samples_leaf': 14,
    'min_samples_split': 6,
    'n_estimators': 700,
    'random_state': 42,
}
optimized_rf = RandomForestRegressor(**final_params)

optimized_rf.fit(X_train, y_train)

MSE, RMSE, standardised RMSE, feature importances

In [None]:
# Held-out predictions from the tuned forest.
y_pred = optimized_rf.predict(X_test)

# Note the units: the MSE is multiplied by 10000 (100 squared), while the
# MAE is reported on the scaled target's own 0-1 range.
test_mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred) * 10000
print("Mean Absolute Error:", test_mae)
print("Mean Squared Error:", mse)


Mean Absolute Error: 0.0708653261119299
Mean Squared Error: 97.31210815722952


In [None]:
# RMSE from the x10000 MSE, then divided by the spread of the x100 test
# target to give a unit-free error.
rmse = np.sqrt(mse)
target_spread = np.std(y_test * 100)
standardised_rmse = rmse / target_spread
print("Root MSE:", rmse)
print("Standardised RMSE:", standardised_rmse)

Root MSE: 9.86468996761832
Standardised RMSE: 0.7931229899376395


In [None]:
# Pair each predictor name with its importance, then show the importances
# from largest to smallest.
raw_importances = optimized_rf.feature_importances_
features_and_importances = list(zip(input_features, raw_importances))
feature_importances = pd.Series(
    optimized_rf.feature_importances_, index=X_train.columns
).sort_values(ascending=False)

print(feature_importances)

# Not used by any later cell visible in this notebook.
sorted_indices = np.argsort(feature_importances)[::-1]

reporting_year                   0.462883
number_in_care_filled            0.289974
pt_placed_inside_la_filled       0.078572
per_child_spend                  0.062490
pt_private_provis_filled         0.053359
pt_3_more_placemts_filled        0.033768
ofsted_leaders_filled            0.006725
ofsted_care_filled               0.004705
ofsted_help_protection_filled    0.004605
ofsted_overall_filled            0.002918
dtype: float64


# **3) Target - substance misuse filled**

Adjusting target

In [None]:
# Swap the target over to the (scaled, imputed) substance-misuse column.
df = df.assign(target=filleddata['pt_substance_filled'])
df.head()

Unnamed: 0,pt_placed_inside_la_filled,pt_private_provis_filled,pt_3_more_placemts_filled,per_child_spend,reporting_year,number_in_care_filled,ofsted_overall_filled,ofsted_leaders_filled,ofsted_help_protection_filled,ofsted_care_filled,target
0,0.425287,0.339797,0.275,0.29344,2018,0.189549,2.0,2.0,2.0,2.0,0.0
1,0.448276,0.307328,0.25,0.270335,2019,0.191924,2.0,2.0,2.0,2.0,0.007792
2,0.448276,0.339797,0.225,0.269215,2020,0.186223,2.0,2.0,2.0,2.0,0.048146
3,0.436782,0.372266,0.275,0.261573,2021,0.180998,2.0,2.0,2.0,2.0,0.0
4,0.448276,0.372266,0.325,0.287296,2022,0.191449,2.0,2.0,2.0,2.0,0.0


In [None]:
# 75/25 train/test split with a fixed seed, then separate the predictors
# from the target column.
feature_columns = [
    'pt_placed_inside_la_filled', 'pt_private_provis_filled', 'pt_3_more_placemts_filled',
    'per_child_spend', 'reporting_year', 'number_in_care_filled',
    'ofsted_overall_filled', 'ofsted_leaders_filled',
    'ofsted_help_protection_filled', 'ofsted_care_filled',
]
train, test = train_test_split(df, test_size=0.25, random_state=7)
X_train, X_test = train[feature_columns], test[feature_columns]
y_train, y_test = train['target'], test['target']

Initial model

In [None]:
# Baseline forest with default hyperparameters. The seed is fixed so the
# baseline errors computed from it are the same on every run.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Baseline error on the rows the forest was fitted on.
y_pred_train = reg.predict(X_train)
mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

0.02973178194079212

In [None]:
# Baseline error on the held-out rows.
y_pred_test = reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

0.07719988077752211

RandomizedSearchCV

In [None]:
# Broad random search over the forest hyperparameters:
# 100 sampled candidates, each scored with 5-fold CV on negative MSE.
param_distributions = {
    'n_estimators': randint(100, 1000),
    # 1.0 (use every feature) replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': [1.0, 'sqrt'],
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 50),
    'bootstrap': [True, False]
}

# Seed the estimator as well: random_state on the search only fixes which
# candidates are drawn, not the randomness inside each forest.
rf_regressor = RandomForestRegressor(random_state=42)

rnd_search_cv = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_distributions,
                                   n_iter=100,
                                   cv=5, scoring='neg_mean_squared_error',
                                   random_state=42, n_jobs=-1)

rnd_search_cv.fit(X_train, y_train)

print("Best parameters found:", rnd_search_cv.best_params_)

Best parameters found: {'bootstrap': False, 'max_depth': 39, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 559}


First round of GridSearchCV

In [None]:
# Narrow grid centred on the best candidate from the randomized search.
param_grid_1 = dict(
    bootstrap=[False],
    max_depth=[34, 39, 44],
    max_features=['sqrt'],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2],
    n_estimators=[545, 559, 575],
)

model = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search scored on negative MSE.
grid_search_1 = GridSearchCV(
    estimator=model,
    param_grid=param_grid_1,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_1.fit(X_train, y_train)

print("Best parameters found:", grid_search_1.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters found: {'bootstrap': False, 'max_depth': 34, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 545}


Second round of GridSearchCV

In [None]:
# Finer grid around the round-1 winner. Reuses the `model` estimator
# created in the round-1 cell.
param_grid_2 = dict(
    bootstrap=[False],
    max_depth=[32, 34, 36],
    max_features=['sqrt'],
    min_samples_leaf=[2],
    min_samples_split=[2],
    n_estimators=[540, 545, 550],
)

grid_search_2 = GridSearchCV(
    estimator=model,
    param_grid=param_grid_2,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_2.fit(X_train, y_train)

print("Best parameters from round 2:", grid_search_2.best_params_)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters from round 2: {'bootstrap': False, 'max_depth': 32, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 545}


Final model on best parameters

In [None]:
# Refit on the training split using the parameters reported by the second
# grid search.
final_params = {
    'bootstrap': False,
    'max_depth': 32,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 2,
    'n_estimators': 545,
    'random_state': 42,
}
optimized_rf = RandomForestRegressor(**final_params)

optimized_rf.fit(X_train, y_train)

MSE, RMSE, standardised RMSE and feature importances

In [None]:
# Held-out predictions from the tuned forest.
y_pred = optimized_rf.predict(X_test)

# Note the units: the MSE is multiplied by 10000 (100 squared), while the
# MAE is reported on the scaled target's own 0-1 range.
test_mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred) * 10000
print("Mean Absolute Error:", test_mae)
print("Mean Squared Error:", mse)


Mean Absolute Error: 0.07721754087492529
Mean Squared Error: 98.35625223305394


In [None]:
# RMSE from the x10000 MSE, then divided by the spread of the x100 test
# target to give a unit-free error.
rmse = np.sqrt(mse)
standardised_rmse = rmse / np.std(y_test*100)
print("Root MSE:", rmse)
# Label corrected: the value printed is the standardised RMSE, not an MSE.
print("Standardised RMSE:", standardised_rmse)

Root MSE: 9.9174720686803
Standardised MSE: 0.9167099984048486


In [None]:
# Pair each predictor name with its importance, then show the importances
# from largest to smallest.
raw_importances = optimized_rf.feature_importances_
features_and_importances = list(zip(input_features, raw_importances))

feature_importances = pd.Series(
    optimized_rf.feature_importances_, index=X_train.columns
).sort_values(ascending=False)

print(feature_importances)

# Not used by any later cell visible in this notebook.
sorted_indices = np.argsort(feature_importances)[::-1]

number_in_care_filled            0.197061
pt_placed_inside_la_filled       0.181094
pt_private_provis_filled         0.162463
per_child_spend                  0.155432
pt_3_more_placemts_filled        0.099596
reporting_year                   0.057238
ofsted_leaders_filled            0.053144
ofsted_care_filled               0.033102
ofsted_help_protection_filled    0.030823
ofsted_overall_filled            0.030046
dtype: float64


# **4) Target - convictions filled**

Adjusting target and splitting data

In [None]:
# Swap the target over to the (scaled, imputed) convictions column.
df = df.assign(target=filleddata['pt_convictions_filled'])

In [None]:
# 75/25 train/test split with a fixed seed, then separate the predictors
# from the target column.
feature_columns = [
    'pt_placed_inside_la_filled', 'pt_private_provis_filled', 'pt_3_more_placemts_filled',
    'per_child_spend', 'reporting_year', 'number_in_care_filled',
    'ofsted_overall_filled', 'ofsted_leaders_filled',
    'ofsted_help_protection_filled', 'ofsted_care_filled',
]
train, test = train_test_split(df, test_size=0.25, random_state=7)
X_train, X_test = train[feature_columns], test[feature_columns]
y_train, y_test = train['target'], test['target']

In [None]:
# Baseline forest with default hyperparameters. The seed is fixed so the
# baseline errors computed from it are the same on every run.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

Initial model

In [None]:
# Baseline error on the rows the forest was fitted on.
y_pred_train = reg.predict(X_train)
mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

0.030509524475719287

In [None]:
# Baseline error on the held-out rows.
y_pred_test = reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred_test)

0.08960476361580672

RandomizedSearchCV

In [None]:
# Broad random search over the forest hyperparameters:
# 100 sampled candidates, each scored with 5-fold CV on negative MSE.
param_distributions = {
    'n_estimators': randint(100, 1000),
    # 1.0 (use every feature) replaces 'auto', which meant the same thing for
    # regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': [1.0, 'sqrt'],
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 50),
    'bootstrap': [True, False]
}

# Seed the estimator as well: random_state on the search only fixes which
# candidates are drawn, not the randomness inside each forest.
rf_regressor = RandomForestRegressor(random_state=42)

rnd_search_cv = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_distributions,
                                   n_iter=100,
                                   cv=5, scoring='neg_mean_squared_error',
                                   random_state=42, n_jobs=-1)

rnd_search_cv.fit(X_train, y_train)

print("Best parameters found:", rnd_search_cv.best_params_)

Best parameters found: {'bootstrap': True, 'max_depth': 91, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 12, 'n_estimators': 572}


First round of GridSearchCV

In [None]:
# Narrow grid centred on the best candidate from the randomized search.
param_grid_1 = dict(
    bootstrap=[True],
    max_depth=[88, 91, 94],
    max_features=['sqrt'],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[11, 12, 13],
    n_estimators=[564, 572, 580],
)

model = RandomForestRegressor(random_state=42)

# Exhaustive 5-fold search scored on negative MSE.
grid_search_1 = GridSearchCV(
    estimator=model,
    param_grid=param_grid_1,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_1.fit(X_train, y_train)

print("Best parameters found:", grid_search_1.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found: {'bootstrap': True, 'max_depth': 88, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 13, 'n_estimators': 572}


Second round of GridSearchCV

In [None]:
# Finer grid around the round-1 winner. Reuses the `model` estimator
# created in the round-1 cell.
param_grid_2 = dict(
    bootstrap=[True],
    max_depth=[88, 90, 92],
    max_features=['sqrt'],
    min_samples_leaf=[1],
    min_samples_split=[12, 13, 14],
    n_estimators=[570, 572, 574],
)

grid_search_2 = GridSearchCV(
    estimator=model,
    param_grid=param_grid_2,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_2.fit(X_train, y_train)

print("Best parameters from round 2:", grid_search_2.best_params_)



Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters from round 2: {'bootstrap': True, 'max_depth': 88, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 13, 'n_estimators': 572}


Final model on best parameters

In [None]:
# Refit on the training split using the parameters reported by the second
# grid search.
optimized_rf = RandomForestRegressor(
    # Was False, which contradicted the tuned value: the randomized search
    # and both grid searches for this target report bootstrap=True.
    bootstrap=True,
    max_depth=88,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=13,
    n_estimators=572,
    random_state=42
)

optimized_rf.fit(X_train, y_train)

MSE, RMSE, standardised RMSE and feature importances

In [None]:
# Held-out predictions from the tuned forest.
y_pred = optimized_rf.predict(X_test)

# Note the units: the MSE is multiplied by 10000 (100 squared), while the
# MAE is reported on the scaled target's own 0-1 range.
test_mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred) * 10000
print("Mean Absolute Error:", test_mae)
print("Mean Squared Error:", mse)

Mean Absolute Error: 0.08687656081330349
Mean Squared Error: 142.72352078695656


In [None]:
# RMSE from the x10000 MSE, then divided by the spread of the x100 test
# target to give a unit-free error.
rmse = np.sqrt(mse)
standardised_rmse = rmse / np.std(y_test*100)
print("Root MSE:", rmse)
# Label corrected: the value printed is the standardised RMSE, not an MSE.
print("Standardised RMSE:", standardised_rmse)

Root MSE: 11.946694973378895
Standardised MSE: 1.0155741770454818


In [None]:
# Pair each predictor name with its importance, then show the importances
# from largest to smallest.
raw_importances = optimized_rf.feature_importances_
features_and_importances = list(zip(input_features, raw_importances))

feature_importances = pd.Series(
    optimized_rf.feature_importances_, index=X_train.columns
).sort_values(ascending=False)

print(feature_importances)

# Not used by any later cell visible in this notebook.
sorted_indices = np.argsort(feature_importances)[::-1]

per_child_spend                  0.190234
number_in_care_filled            0.174126
pt_placed_inside_la_filled       0.155935
pt_3_more_placemts_filled        0.140667
reporting_year                   0.108806
pt_private_provis_filled         0.107185
ofsted_leaders_filled            0.031742
ofsted_overall_filled            0.030875
ofsted_care_filled               0.030478
ofsted_help_protection_filled    0.029951
dtype: float64


Example of decision tree

In [None]:
# Export the first tree of the most recently fitted forest to DOT, render
# it to a PNG on disk, and display that image.
first_tree = optimized_rf.estimators_[0]
dot_data = export_graphviz(
    first_tree,
    out_file=None,
    feature_names=input_features,
    filled=True,
    rounded=True,
)

graph = graphviz.Source(dot_data, format='png')
graph.render('decision_tree_default', view=False)  # writes decision_tree_default.png

Image(filename='decision_tree_default.png')


Output hidden; open in https://colab.research.google.com to view.