In [17]:
import sys
sys.path.append("../")


import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from patsy import dmatrices
from openpyxl import load_workbook
import engarde.decorators as ed

from linearmodels import PanelOLS

from library import test_data
@ed.verify(test_data.allyearsandtpsd)
@ed.verify(test_data.alldois)
def load(data):
    return data

from library import print_statistics

In [18]:
def coef_with_stars(coef, pvalue):
    if pvalue >.05:
        coef = str(coef)
    if pvalue <= .05:
        coef = str(coef) + '*'
    if pvalue <= .01:
        coef = coef + '*'
    if pvalue <= .001:
        coef = coef + '*'
    return(coef)
test = coef_with_stars(.1, .005)

In [19]:
data_path = '/Users/kylieleblancKylie/domino/dofis/data/'
table_path = '/Users/kylieleblancKylie/domino/dofis/results/impact/'
data = pd.read_csv(os.path.join(data_path, 'clean', 'gdid.csv'),
                  sep=",", low_memory= False)
#load(data)
data.tail()
print(data[data.doi == True].district.nunique())
data = data[data.doi == True]
data.sample()

824


Unnamed: 0.1,Unnamed: 0,year,campus,campname,campischarter,district,distname,distischarter,rating_academic,rating_financial,...,type_urban,type_suburban,type_town,type_rural,eligible,teachers_nodegree,teachers_badegree,teachers_msdegree,teachers_phddegree,treatpost
31908,31908,2015,169901041,BOWIE J H,N,169901,BOWIE ISD,N,M,Pass,...,0,0,1,0,1,0.0,0.843416,0.156584,0.0,False


In [20]:
list(data.columns)

['Unnamed: 0',
 'year',
 'campus',
 'campname',
 'campischarter',
 'district',
 'distname',
 'distischarter',
 'rating_academic',
 'rating_financial',
 'rating_academic_c',
 'type',
 'type_description',
 'cntyname_c',
 'students_amind_num',
 'students_asian_num',
 'students_black_num',
 'students_frpl_num',
 'students_hisp_num',
 'students_num',
 'students_paci_num',
 'students_tworaces_num',
 'students_white_num',
 'teachers_badegree_num',
 'teachers_exp_ave',
 'teachers_msdegree_num',
 'teachers_new_num',
 'teachers_nodegree_num',
 'teachers_num',
 'teachers_phddegree_num',
 'teachers_tenure_ave',
 'alg_avescore',
 'bio_avescore',
 'eng1_avescore',
 'eng2_avescore',
 'm_3rd_avescore',
 'm_4th_avescore',
 'm_5th_avescore',
 'm_6th_avescore',
 'm_7th_avescore',
 'm_8th_avescore',
 'r_3rd_avescore',
 'r_4th_avescore',
 'r_5th_avescore',
 'r_6th_avescore',
 'r_7th_avescore',
 'r_8th_avescore',
 's_8th_avescore',
 'us_avescore',
 'alg_numtakers',
 'bio_numtakers',
 'eng1_numtakers',
 'eng

In [21]:
data['yearpost'] = data.year - data.doi_year
data.yearpost.value_counts()

 0.0    6347
-1.0    6330
-2.0    6294
-3.0    6276
-4.0    6264
 1.0    6130
-5.0    5357
 2.0    5191
-6.0    1152
 3.0     938
-7.0     201
Name: yearpost, dtype: int64

In [22]:
#convert year to datetime
df = data.reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
#add column year to index
df = data.set_index(['year', 'campus'])
#swap indexes
df.index = df.index.swaplevel(0,1)
df[['district', 'doi_year','treatpost',]].tail(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,district,doi_year,treatpost
campus,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
253901105,2019,253901,2018.0,True
253901106,2019,253901,2018.0,True
253901107,2019,253901,2018.0,True
254902001,2019,254902,2018.0,True
254902101,2019,254902,2018.0,True


In [24]:
mod = PanelOLS.from_formula('avescores ~ + 1 + treatpost + students_hisp + students_num + TimeEffects + EntityEffects', df)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

Inputs contain missing values. Dropping rows with missing observations.


                          PanelOLS Estimation Summary                           
Dep. Variable:              avescores   R-squared:                        0.0050
Estimator:                   PanelOLS   R-squared (Between):              0.1334
No. Observations:               45962   R-squared (Within):              -0.0245
Date:                Wed, Oct 09 2019   R-squared (Overall):              0.1189
Time:                        18:25:58   Log-likelihood                   -4541.7
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      67.058
Entities:                        6121   P-value                           0.0000
Avg Obs:                       7.5089   Distribution:                 F(3,39831)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             15.556
                            

# Heterogenous effects

In [25]:
### Generate variable for number of hispanic students in year before the district declares

In [26]:
data_pre = data.loc[data.yearpost == -1]
data_pre = data_pre.rename(columns = {'students_hisp': 'students_hisp_pre'})
data_pre['low_avescores_pre'] = np.where(data_pre.avescores < data_pre.avescores.quantile(.25),1, 0)
data_pre['high_avescores_pre'] = np.where(data_pre.avescores > data_pre.avescores.quantile(.75),1,0)
data_pre = data_pre[['campus','students_hisp_pre', 'low_avescores_pre', 'high_avescores_pre']]
data_hte = data.merge(data_pre, on = 'campus', how = 'left')
data_hte['majority_hisp_pre'] = np.where(data_hte.students_hisp_pre > .75, 1, 0)
#df_hte[['students_hisp', 'students_hisp_pre', 'yearpost']].sample(5)

In [27]:
data_pre = data.loc[data.year == 2016]
data_pre['rural_pre'] = np.where(data_pre.type_description == "RURAL", 1, 0)
data_pre['urban_pre'] = np.where(data_pre.type_description == "URBAN", 1, 0)
data_pre = data_pre[['campus','rural_pre', 'urban_pre']]
data_hte = data_hte.merge(data_pre, on = 'campus', how = 'left')
#df_hte[['students_hisp', 'students_hisp_pre', 'yearpost']].sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [28]:
#convert year to datetime
df_hte = data_hte.reset_index()
df_hte['year'] = pd.to_datetime(df_hte['year'], format='%Y')
#add column year to index
df_hte = data_hte.set_index(['year', 'campus'])
#swap indexes
df_hte.index = df_hte.index.swaplevel(0,1)
df_hte[['district', 'doi_year','treatpost',]].tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,district,doi_year,treatpost
campus,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
251901101,2019,251901,2017.0,True
251901104,2019,251901,2017.0,True
252901001,2019,252901,2017.0,True
252901002,2019,252901,2017.0,True
252901041,2019,252901,2017.0,True
252901101,2019,252901,2017.0,True
252901104,2019,252901,2017.0,True
252901105,2019,252901,2017.0,True
252902002,2019,252902,2017.0,True
252903001,2019,252903,2017.0,True


In [31]:
df_hte['treatpost_hisp'] = df_hte.treatpost * df_hte.majority_hisp_pre
mod = PanelOLS.from_formula('avescores ~ + 1 + treatpost + treatpost_hisp + students_hisp + students_num + TimeEffects + EntityEffects', df_hte)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

Inputs contain missing values. Dropping rows with missing observations.


                          PanelOLS Estimation Summary                           
Dep. Variable:              avescores   R-squared:                        0.0107
Estimator:                   PanelOLS   R-squared (Between):              0.1206
No. Observations:               45962   R-squared (Within):              -0.0219
Date:                Wed, Oct 09 2019   R-squared (Overall):              0.1061
Time:                        18:26:53   Log-likelihood                   -4410.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      107.78
Entities:                        6121   P-value                           0.0000
Avg Obs:                       7.5089   Distribution:                 F(4,39830)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             31.479
                            

In [32]:
df_hte['treatpost_rural'] = df_hte.treatpost * df_hte.rural_pre
mod = PanelOLS.from_formula('avescores ~ + 1 + treatpost + treatpost_rural + students_hisp + students_num  + TimeEffects + EntityEffects', df_hte)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

Inputs contain missing values. Dropping rows with missing observations.


                          PanelOLS Estimation Summary                           
Dep. Variable:              avescores   R-squared:                        0.0051
Estimator:                   PanelOLS   R-squared (Between):              0.1352
No. Observations:               45349   R-squared (Within):              -0.0242
Date:                Wed, Oct 09 2019   R-squared (Overall):              0.1184
Time:                        18:26:56   Log-likelihood                   -4602.8
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      50.926
Entities:                        5870   P-value                           0.0000
Avg Obs:                       7.7256   Distribution:                 F(4,39468)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             12.025
                            

In [33]:
df_hte['treatpost_low'] = df_hte.treatpost * df_hte.low_avescores_pre
mod = PanelOLS.from_formula('avescores ~ + 1 + treatpost + treatpost_low + students_hisp + students_num  + TimeEffects + EntityEffects', df_hte)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

Inputs contain missing values. Dropping rows with missing observations.


                          PanelOLS Estimation Summary                           
Dep. Variable:              avescores   R-squared:                        0.0054
Estimator:                   PanelOLS   R-squared (Between):              0.1295
No. Observations:               44915   R-squared (Within):              -0.0286
Date:                Wed, Oct 09 2019   R-squared (Overall):              0.1122
Time:                        18:27:00   Log-likelihood                   -4575.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      52.599
Entities:                        5818   P-value                           0.0000
Avg Obs:                       7.7200   Distribution:                 F(4,39086)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             12.700
                            

In [34]:
df_hte['treatpost_high'] = df_hte.treatpost * df_hte.high_avescores_pre
mod = PanelOLS.from_formula('avescores ~ + 1 + treatpost + treatpost_high+ students_hisp + students_num  + TimeEffects + EntityEffects', df_hte)
res = mod.fit(cov_type='clustered', cluster_entity=True)
print(res)

Inputs contain missing values. Dropping rows with missing observations.


                          PanelOLS Estimation Summary                           
Dep. Variable:              avescores   R-squared:                        0.0050
Estimator:                   PanelOLS   R-squared (Between):              0.1348
No. Observations:               44915   R-squared (Within):              -0.0286
Date:                Wed, Oct 09 2019   R-squared (Overall):              0.1175
Time:                        18:27:03   Log-likelihood                   -4583.2
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      49.075
Entities:                        5818   P-value                           0.0000
Avg Obs:                       7.7200   Distribution:                 F(4,39086)
Min Obs:                       1.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             11.855
                            