In [1]:
import pathlib

import numpy as np
import pandas as pd

import patsy
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error 

In [2]:
cwd = pathlib.Path.cwd()
data_path = cwd / 'data'
pkl_path = data_path / 'pkl'
observations_pkl = pkl_path /'observations.pkl'

df = pd.read_pickle(observations_pkl).astype(float)

All raw features excluding multicollinear ones such as Percent_Below_Standard.  
Just keep Percentage_Standard_Met_and_Above

In [3]:
raw = ['Test_Year', 'Total_Tested_At_Entity_Level', 'Total_Tested_with_Scores',
       'Grade', 'Test_Id', 'CAASPP_Reported_Enrollment', 'Students_Tested',
       'Mean_Scale_Score', 'Percentage_Standard_Met_and_Above','Type_Id', 'Zip']

### Uncomment out respective naive implementations to see results

In [4]:
#: Naive OLS model of throwing all original features at target
y, X = patsy.dmatrices((f'Median_Income ~ {" + ".join(raw)} + 0'), data=df)

#: Naive Simple Linear Regression with just Mean Score
# y, X = patsy.dmatrices('np.log(Median_Income) ~ Mean_Scale_Score + 0', data=df)

#: Naive Simple Linear Regression with just Median Income
# y, X = patsy.dmatrices('np.log(Median_Income) ~ Percentage_Standard_Met_and_Above + 0', data=df)

linreg = LinearRegression()
fit = linreg.fit(X, y)
r_squared = linreg.score(X, y)
adjusted_r_squared = 1 - (1 - r_squared) * len(y) / (len(y) - X.shape[1] - 1)
rmse = np.sqrt(mean_squared_error(y, fit.predict(X)))

y_df = pd.DataFrame(y, columns=y.design_info.column_names)
X_df = pd.DataFrame(X, columns=X.design_info.column_names)
interactions = pd.concat([X_df, y_df], axis=1)

print('Intercept:', fit.intercept_)
print('R^2 Score:', r_squared)
print('Adj R^2 Score:', adjusted_r_squared)
print('RMSE Score:', rmse)

Intercept: [-157671.56813038]
R^2 Score: 0.3192627946174227
Adj R^2 Score: 0.3191192700116072
RMSE Score: 20189.28668419213


In [5]:
lm = sm.OLS(y, sm.add_constant(X))
fit = lm.fit()
fit.summary()

0,1,2,3
Dep. Variable:,Median_Income,R-squared:,0.319
Model:,OLS,Adj. R-squared:,0.319
Method:,Least Squares,F-statistic:,2669.0
Date:,"Fri, 25 Jan 2019",Prob (F-statistic):,0.0
Time:,15:29:25,Log-Likelihood:,-645100.0
No. Observations:,56928,AIC:,1290000.0
Df Residuals:,56917,BIC:,1290000.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Test_Year,-78.1326,4.809,-16.246,0.000,-87.559,-68.706
Total_Tested_At_Entity_Level,201.6205,43.434,4.642,0.000,116.490,286.751
Total_Tested_with_Scores,-202.8167,43.493,-4.663,0.000,-288.063,-117.570
Grade,-815.4109,94.785,-8.603,0.000,-1001.190,-629.632
Test_Id,5379.5464,182.553,29.468,0.000,5021.742,5737.351
CAASPP_Reported_Enrollment,240.0734,12.930,18.567,0.000,214.730,265.416
Students_Tested,-228.7238,13.214,-17.310,0.000,-254.622,-202.825
Mean_Scale_Score,40.0326,3.893,10.284,0.000,32.403,47.662
Percentage_Standard_Met_and_Above,5.337e+04,978.592,54.534,0.000,5.14e+04,5.53e+04

0,1,2,3
Omnibus:,8407.11,Durbin-Watson:,0.301
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19727.684
Skew:,0.858,Prob(JB):,0.0
Kurtosis:,5.319,Cond. No.,1090000.0
