# Summary of this page so far:

This page contains the Linear Regressions of the unfilled data for the four outcome variables; KS2, K24, Substance and Convictions.


# Importing packages

In [46]:
import pandas as pd
import numpy as np
!pip install statsmodels
import statsmodels.api as sm
from patsy import dmatrices
import statsmodels.formula.api as smf
data = pd.read_csv("final_data.csv")
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



# Scale numerical variables and make sure categorical ones are set as categories

In [47]:
scaler = MinMaxScaler()
data_scaled = data.copy()
scalable_columns = ['number_in_care', 'number_in_care_filled', 'pt_placed_inside_la', 'pt_private_provis', 'pt_3_more_placemts', 'pt_placed_inside_la_filled','pt_private_provis_filled', 'pt_3_more_placemts_filled', 'per_child_spend', 'budget_per_child']
data_scaled[scalable_columns] = scaler.fit_transform(data_scaled[scalable_columns])
print(data_scaled[scalable_columns].describe())

#check categorical variables are set as categories
data['ofsted_overall'] =data['ofsted_overall'].astype("category")
data['ofsted_overall_filled'] =data['ofsted_overall_filled'].astype("category")
data['ofsted_care'] =data['ofsted_care'].astype("category")
data['ofsted_care_filled'] =data['ofsted_care_filled'].astype("category")
data['ofsted_leaders'] =data['ofsted_leaders'].astype("category")
data['ofsted_leaders_filled'] =data['ofsted_leaders_filled'].astype("category")
data['ofsted_help_protection'] =data['ofsted_help_protection'].astype("category")
data['ofsted_help_protection_filled'] =data['ofsted_help_protection_filled'].astype("category")

       number_in_care  number_in_care_filled  pt_placed_inside_la  \
count      750.000000             750.000000           747.000000   
mean         0.246364               0.246364             0.613078   
std          0.165839               0.165839             0.189203   
min          0.000000               0.000000             0.000000   
25%          0.140736               0.140736             0.482759   
50%          0.202375               0.202375             0.632184   
75%          0.303919               0.303919             0.747126   
max          1.000000               1.000000             1.000000   

       pt_private_provis  pt_3_more_placemts  pt_placed_inside_la_filled  \
count         747.000000          739.000000                  750.000000   
mean            0.361365            0.515785                    0.611913   
std             0.153186            0.140677                    0.189822   
min             0.000000            0.000000                    0.000000  

# Mask the data to avoid missing values

In [48]:
#this is currently not ideal as we have so many rows with a missing value, hopefully should be fine after imputation
mask = data.notnull().all(axis =1)
data = data[mask]

# Split into testing and training data

In [49]:
train_data, test_data = train_test_split(data, test_size = 0.25, random_state=1, shuffle = True)


# Linear regressions

# KS4 OUTCOME

Modelling pt_ks4_expectations on budget, la_name, reporting year, pt 3 or more placements, pt private provision, pt placed inside la

In [62]:
model1 = smf.ols('pt_ks4_expectations ~ budget_per_child + number_in_care  + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())
#would use mse here from sklearn but its struggling with missing values
mse = ((model1.predict(test_data) - test_data['pt_ks4_expectations'])**2).mean()
mse

                             OLS Regression Results                            
Dep. Variable:     pt_ks4_expectations   R-squared:                       0.825
Model:                             OLS   Adj. R-squared:                  0.228
Method:                  Least Squares   F-statistic:                     1.382
Date:                 Wed, 24 Apr 2024   Prob (F-statistic):              0.385
Time:                         14:08:50   Log-Likelihood:                -50.320
No. Observations:                   23   AIC:                             136.6
Df Residuals:                        5   BIC:                             157.1
Df Model:                           17                                         
Covariance Type:             nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Inte

180.29853913342328

Same as above, but without including reporting_year and la_name as predictors

In [63]:
model1 = smf.ols('pt_ks4_expectations ~  budget_per_child + number_in_care + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())
mse = ((model1.predict(test_data)- test_data['pt_ks4_expectations'])**2).mean()
mse

                             OLS Regression Results                            
Dep. Variable:     pt_ks4_expectations   R-squared:                       0.825
Model:                             OLS   Adj. R-squared:                  0.228
Method:                  Least Squares   F-statistic:                     1.382
Date:                 Wed, 24 Apr 2024   Prob (F-statistic):              0.385
Time:                         14:08:54   Log-Likelihood:                -50.320
No. Observations:                   23   AIC:                             136.6
Df Residuals:                        5   BIC:                             157.1
Df Model:                           17                                         
Covariance Type:             nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Inte

180.29853913342328

In [64]:
rmse = np.sqrt(mse)
standardised_rsme = rmse / np.std(test_data['pt_ks4_expectations'])

In [65]:
rmse

13.427529152209027

In [66]:
standardised_rsme

2.551333571025344

# KS2 OUTCOME

Repeat the above two models for other outcome variables: pt ks2 expectations, pt convicted and pt substance misuse problem.

In [67]:
#models here
model1 = smf.ols('pt_ks2_expectations ~  budget_per_child + number_in_care  + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())
mse = ((model1.predict(test_data) - test_data['pt_ks2_expectations'])**2).mean()
mse

                             OLS Regression Results                            
Dep. Variable:     pt_ks2_expectations   R-squared:                       0.778
Model:                             OLS   Adj. R-squared:                  0.022
Method:                  Least Squares   F-statistic:                     1.030
Date:                 Wed, 24 Apr 2024   Prob (F-statistic):              0.537
Time:                         14:09:05   Log-Likelihood:                -70.136
No. Observations:                   23   AIC:                             176.3
Df Residuals:                        5   BIC:                             196.7
Df Model:                           17                                         
Covariance Type:             nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Inte

366.5282585861903

In [68]:
model1 = smf.ols('pt_ks2_expectations ~  budget_per_child + number_in_care  + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())


                             OLS Regression Results                            
Dep. Variable:     pt_ks2_expectations   R-squared:                       0.778
Model:                             OLS   Adj. R-squared:                  0.022
Method:                  Least Squares   F-statistic:                     1.030
Date:                 Wed, 24 Apr 2024   Prob (F-statistic):              0.537
Time:                         14:09:14   Log-Likelihood:                -70.136
No. Observations:                   23   AIC:                             176.3
Df Residuals:                        5   BIC:                             196.7
Df Model:                           17                                         
Covariance Type:             nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Inte

In [69]:
rmse = np.sqrt(mse)
standardised_rsme = rmse / np.std(test_data['pt_ks2_expectations'])

In [70]:
rmse

19.14492775087413

In [71]:
standardised_rsme

2.2218027642089058

# CONVICTED OUTCOME

In [72]:
model1 = smf.ols('pt_convictions ~ budget_per_child + number_in_care  + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())
mse = ((model1.predict(test_data) - test_data['pt_convictions'])**2).mean()
mse

                            OLS Regression Results                            
Dep. Variable:         pt_convictions   R-squared:                       0.520
Model:                            OLS   Adj. R-squared:                 -1.113
Method:                 Least Squares   F-statistic:                    0.3184
Date:                Wed, 24 Apr 2024   Prob (F-statistic):              0.965
Time:                        14:09:24   Log-Likelihood:                -32.533
No. Observations:                  23   AIC:                             101.1
Df Residuals:                       5   BIC:                             121.5
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     

3.1083050800792016

In [73]:
model1 = smf.ols('pt_convictions ~  budget_per_child + number_in_care  + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())


                            OLS Regression Results                            
Dep. Variable:         pt_convictions   R-squared:                       0.520
Model:                            OLS   Adj. R-squared:                 -1.113
Method:                 Least Squares   F-statistic:                    0.3184
Date:                Wed, 24 Apr 2024   Prob (F-statistic):              0.965
Time:                        14:09:31   Log-Likelihood:                -32.533
No. Observations:                  23   AIC:                             101.1
Df Residuals:                       5   BIC:                             121.5
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     

In [74]:
rmse = np.sqrt(mse)
standardised_rsme = rmse / np.std(test_data['pt_convictions'])

In [75]:
rmse

1.763038592906917

In [76]:
standardised_rsme

1.2929398626564026

# SUBSTANCE OUTCOME

In [77]:
model1 = smf.ols('pt_substance ~  budget_per_child + number_in_care  + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:           pt_substance   R-squared:                       0.766
Model:                            OLS   Adj. R-squared:                 -0.030
Method:                 Least Squares   F-statistic:                    0.9622
Date:                Wed, 24 Apr 2024   Prob (F-statistic):              0.573
Time:                        14:09:42   Log-Likelihood:                -33.825
No. Observations:                  23   AIC:                             103.7
Df Residuals:                       5   BIC:                             124.1
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     

In [78]:
model1 = smf.ols('pt_substance ~ budget_per_child + number_in_care  + ofsted_overall +ofsted_leaders +ofsted_care +ofsted_help_protection+ reporting_year + pt_3_more_placemts +pt_private_provis +pt_placed_inside_la ', data = train_data).fit()
print(model1.summary())
mse = ((model1.predict(test_data) - test_data['pt_substance'])**2).mean()
mse

                            OLS Regression Results                            
Dep. Variable:           pt_substance   R-squared:                       0.766
Model:                            OLS   Adj. R-squared:                 -0.030
Method:                 Least Squares   F-statistic:                    0.9622
Date:                Wed, 24 Apr 2024   Prob (F-statistic):              0.573
Time:                        14:09:47   Log-Likelihood:                -33.825
No. Observations:                  23   AIC:                             103.7
Df Residuals:                       5   BIC:                             124.1
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Intercept     

11.773742784193956

In [79]:
rmse = np.sqrt(mse)
standardised_rsme = rmse / np.std(test_data['pt_substance'])

In [80]:
rmse

3.4312887934701672

In [81]:
standardised_rsme

2.02918533033696