In [1]:
from collections import defaultdict
import pathlib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import patsy

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
cwd = pathlib.Path.cwd()
data_path = cwd / 'data'
pkl_path = data_path / 'pkl'
observations_pkl = pkl_path /'observations.pkl'

df = pd.read_pickle(observations_pkl).astype(float)
df['Percent_Passed'] = df['Percentage_Standard_Met_and_Above']
df['Math'] = df['Test_Id'] - 1
df['Mean_Score'] = df['Mean_Scale_Score']
df['Percent_Tested'] = df['Students_Tested'] / df['CAASPP_Reported_Enrollment']
df['Participation_95'] = (df['Percent_Tested'] > 0.95).astype(int)
# df = df[['Median_Income', 'Percent_Passed', 'Math', 'Participation_95', 'Percent_Tested', 'Grade', 'Mean_Score']]

In [17]:
# df.columns

In [4]:
raw = ['Test_Year', 'Total_Tested_At_Entity_Level', 'Total_Tested_with_Scores',
       'Grade', 'Test_Id', 'CAASPP_Reported_Enrollment', 'Students_Tested',
       'Mean_Scale_Score', 'Percentage_Standard_Met_and_Above','Type_Id', 'Zip']

In [40]:
    # 19598         0.35835
# y, X = patsy.dmatrices(('Median_Income ~ Math * np.log(Grade) * Mean_Score * np.log(Percent_Passed + 1) + np.log(1.01-Percent_Tested) - Math:np.log(Grade):Mean_Score:np.log(Percent_Passed + 1) + 0'), data=df)
    # log 0.3102    0.31990
# y, X = patsy.dmatrices(('np.log(Median_Income) ~ Math * np.log(Grade) * Mean_Score * np.log(Percent_Passed + 1) + np.log(1.01-Percent_Tested) - Math:np.log(Grade):Mean_Score:np.log(Percent_Passed + 1) + 0'), data=df)
    # 19675         0.35325
# y, X = patsy.dmatrices(('Median_Income ~ Math * Grade * Mean_Score * Percent_Passed + Percent_Tested - Math:Grade:Mean_Score:Percent_Passe + 0'), data=df)
    # 20353         0.30807
# y, X = patsy.dmatrices(('Median_Income ~ Math + Grade + Mean_Score + Percent_Passed + Percent_Tested + 0'), data=df)
#     20189     0.31911
# y, X = patsy.dmatrices((f'Median_Income ~ {" + ".join(raw)} + 0'), data=df)

y, X = patsy.dmatrices('np.log(Median_Income) ~ Mean_Scale_Score + 0', data=df)

# y, X = patsy.dmatrices('np.log(Median_Income) ~ Percentage_Standard_Met_and_Above + 0', data=df)


linreg = LinearRegression()
fit = linreg.fit(X, y)
r_squared = linreg.score(X, y)
adjusted_r_squared = 1 - (1 - r_squared) * len(y) / (len(y) - X.shape[1] - 1)
rmse = np.sqrt(mean_squared_error(y, fit.predict(X)))


y_df = pd.DataFrame(y, columns=y.design_info.column_names)
X_df = pd.DataFrame(X, columns=X.design_info.column_names)
interactions = pd.concat([X_df, y_df], axis=1)

print('Intercept:', fit.intercept_)
print('R^2 Score:', r_squared)
print('Adj R^2 Score:', adjusted_r_squared)
print('RMSE Score:', rmse)

# predictions = fit.predict(X)
# residuals = y - fit.predict(X)
# plot = sns.scatterplot(x=predictions.ravel(), y=residuals.ravel(), alpha=0.2)
# plt.title('Residual Plot')
# plt.xlabel('Predictions')
# plt.ylabel('Residuals')

Intercept: [5.58585322]
R^2 Score: 0.14235400929418696
Adj R^2 Score: 0.1423238773337222
RMSE Score: 0.34850155128667776


In [41]:
import statsmodels.api as sm

lm = sm.OLS(y, sm.add_constant(X))
fit = lm.fit()
fit.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.142
Model:,OLS,Adj. R-squared:,0.142
Method:,Least Squares,F-statistic:,9449.0
Date:,"Fri, 25 Jan 2019",Prob (F-statistic):,0.0
Time:,08:37:45,Log-Likelihood:,-20769.0
No. Observations:,56928,AIC:,41540.0
Df Residuals:,56926,BIC:,41560.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.5859,0.055,100.965,0.000,5.477,5.694
x1,0.0022,2.22e-05,97.204,0.000,0.002,0.002

0,1,2,3
Omnibus:,28.927,Durbin-Watson:,0.203
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.264
Skew:,-0.048,Prob(JB):,4.42e-07
Kurtosis:,3.056,Cond. No.,94300.0
