In [1]:
import os
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from random import randint
from start import data_path

In [2]:
# Load the scraped DOI date table and show how many rows it contains.
scraped_csv = os.path.join(data_path, 'doi_dates_scraped.csv')
data = pd.read_csv(scraped_csv, sep=",")
data.shape[0]

824

There are two ways we can determine dates: the term put forward in the plan (this is almost always just a school year) or the date a plan was finalized. We prefer the term date. However, when the term is not noted, we count the implementation date as the latest date in the DOI timeline (hopefully, the date the plan was adopted - finalize_year).

In [3]:
# Preview the scraped term/finalize fields for the first few rows.
preview_cols = ['title', 'term_phrase', 'term_year', 'term_month',
                'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
data.loc[:, preview_cols].head()

Unnamed: 0,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"(999, '')","hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"(2018, 'September')","go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"(999, '')","27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,"(999, '')",,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"(999, '')","of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...


In [4]:
# Scraped links
# Keep the scraped date fields and give the year/month columns a
# "_scraped" suffix.
scraped_cols = ['title', 'term_phrase', 'term_year',
                'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
scraped_names = {col: col + '_scraped'
                 for col in ('term_year', 'finalize_year', 'finalize_month')}
fixdates_scraped = data[scraped_cols].rename(columns=scraped_names)
fixdates_scraped

Unnamed: 0,title,term_phrase,term_year_scraped,finalize_phrase,finalize_year_scraped,finalize_month_scraped,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...
...,...,...,...,...,...,...,...
819,Adrian ISD,term of up to five years beginning June 2018 a...,2018,,-999,,http://www.adrianisd.net/UserFiles/Servers/Ser...
820,Academy ISD,2018-2019 school year and concluding at the en...,2018,"regularly scheduled board meeting on June 28, ...",2018,June,https://4.files.edl.io/1a8f/06/29/18/204245-44...
821,Abilene ISD,ABILENE INDEPENDENT SCHOOL DISTRICT Local Inno...,2018,"the AISD Board of Trustees April 9, 2018 1 Boa...",2018,April,https://www.abileneisd.org/wp-content/uploads/...
822,Abernathy ISD,of Innovation Plan – Draft Introduction House ...,-999,"the Texas Education Code. On Thursday, January...",2017,January,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...


In [5]:
# Manual links
# Read the hand-corrected file and keep only the title, the correction flag
# and the corrected date fields.
manual_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
manual_cols = ['title', 'corrected', 'term_year',
               'finalize_year', 'finalize_month']
fixdates_manual = pd.read_csv(manual_csv, encoding='latin-1')[manual_cols]
print(len(fixdates_manual))
fixdates_manual.sample(10)

824


Unnamed: 0,title,corrected,term_year,finalize_year,finalize_month
299,Mumford ISD,1,2017.0,,
578,Fairfield ISD,1,,2017.0,June
647,Corrigan-Camden ISD,1,2018.0,2018.0,May
762,Bland ISD,1,2017.0,2017.0,April
442,Jim Ned CISD,1,2019.0,2019.0,March
772,Bellevue ISD,1,2017.0,2017.0,April
257,Palo Pinto ISD,1,2017.0,2017.0,April
363,Lovelady ISD,1,2018.0,,
513,Groesbeck ISD,1,2019.0,2019.0,February
298,Munday CISD,1,2017.0,2017.0,March


In [6]:
# Merge scraped with manual
# The left merge keeps every scraped row and attaches the manual columns
# (corrected, term_year, finalize_year, finalize_month) by title.
fixdates = fixdates_scraped.merge(fixdates_manual, how = 'left', on = 'title')

# Rows with corrected > 0 already hold the manual values brought in by the
# merge, so nothing has to be copied for them. Rows with corrected == 0 take
# the scraped values instead. The values are read from the merged frame itself
# rather than from the pre-merge frames: assigning a pre-merge Series aligns on
# its own index, which silently puts values on the wrong rows whenever the
# merge changes the row count (e.g. a title listed twice in the manual file).
use_scraped = fixdates.corrected == 0
for col in ['term_year', 'finalize_year', 'finalize_month']:
    fixdates.loc[use_scraped, col] = fixdates.loc[use_scraped, col + '_scraped']

fixdates = fixdates[['corrected',  'title', 'term_phrase', 'term_year', 'finalize_phrase', 'finalize_year', 'finalize_month', 'link']]
fixdates.to_csv(os.path.join(data_path,'fix_dates.csv'))
fixdates.sample(10)

Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link
8,1.0,Wylie ISD (043914),INNOVATION PLAN INTRODUCTION House Bill (HB) 1...,2017.0,". The Committee met on December 14, 2016 and J...",,March,http://www.wylieisd.net/cms/lib09/TX01918453/C...
602,1.0,East Bernard ISD,"August 1, 2017 and ending July 31, 2022, unles...",2017.0,"Adopted by EBISD Board: March 6, 2017 East Ber...",2017.0,March,https://1.cdn.edl.io/m2Y7mQEE9cI2UzC82atSqsOMQ...
616,1.0,Devers ISD,"will be in effect from July 1, 2017 through Ju...",2017.0,,,,https://drive.google.com/file/d/0B0nU_nqsdRSKZ...
21,1.0,Willis ISD,,,,2017.0,April,https://tx50000123.schoolwires.net//cms/lib/TX...
473,1.0,Honey Grove ISD,in effect for the next five years (2017-2022)....,2017.0,for the next five years (2017-2022). Timeline ...,2017.0,February,https://s3.amazonaws.com/scschoolfiles/1549/di...
67,1.0,Uvalde CISD,,2017.0,,2017.0,March,https://www.ucisd.net/domain/82
670,1.0,Clifton ISD,2018-2019 school year and concluding at the en...,2018.0,,,,https://1.cdn.edl.io/gwZxglY39ZfMkIchj28gRYDXK...
816,1.0,Albany ISD,The Local Innovation Plan Committee met Februa...,2017.0,"Local Innovation Plan at the March 20, 2017 re...",2017.0,March,http://www.albanyisd.net/uploads/4/4/4/1/44419...
757,1.0,Blum ISD,BLUM Independent School District District of I...,2017.0,"to the Commissioner of Education May 15, 2017 ...",2017.0,May,http://blumisd.net/UserFiles/Servers/Server_77...
762,1.0,Bland ISD,,2017.0,,2017.0,April,https://drive.google.com/file/d/0B0CSbOD5Zf1md...


### Check term and finalize year and month. Edit if incorrect or incorrectly missing. Then change corrected to 1 if corrected, zero if correct (2 if corrected from TEA). Delete -999 if correctly missing.

In [7]:
# Load the manually corrected dates file.
corrected_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
fixed_dates = pd.read_csv(corrected_csv, encoding='latin-1')

In [8]:
# Pick a random row label for the order spot-check below.
# randint includes both endpoints, so the upper bound has to be the last valid
# row label (row count - 1); using the row count itself can yield a label that
# does not exist and make the .loc lookups fail.
value = randint(0, len(fixed_dates) - 1)
value

267

In [9]:
# Ensure order does not change between data and fixed_dates

In [10]:
# Title of the sampled row in the corrected file.
fixed_dates.loc[value, 'title']

'Olton ISD'

In [11]:
# Title of the same row in the scraped data, for comparison.
data.loc[value, 'title']

'Olton ISD'

In [12]:
# If order does not change,
# overwrite the corrected file's titles with the scraped titles (aligned on
# the row index).
fixed_dates['title'] = data['title']

In [13]:
# Treat the -999 placeholder in term_year as missing.
term_is_sentinel = fixed_dates['term_year'] == -999
fixed_dates.loc[term_is_sentinel, 'term_year'] = np.nan

# Distribution of term years, followed by the number of rows without one.
print(fixed_dates['term_year'].value_counts())
int(pd.isnull(fixed_dates['term_year']).sum())

2017.0    491
2018.0    115
2016.0     46
2019.0     18
2015.0      1
Name: term_year, dtype: int64


153

In [14]:
# Treat the -999 placeholder in finalize_year as missing.
finalize_is_sentinel = fixed_dates['finalize_year'] == -999
fixed_dates.loc[finalize_is_sentinel, 'finalize_year'] = np.nan

# Distribution of finalize years, followed by the number of rows without one.
print(fixed_dates['finalize_year'].value_counts())
int(pd.isnull(fixed_dates['finalize_year']).sum())

2017.0    484
2018.0    125
2016.0     60
2019.0     17
Name: finalize_year, dtype: int64


138

Set doi_year to the term year if available; otherwise, use the finalize year.

In [15]:
# doi_year is the term year where one exists; rows without a term year take
# the finalize year instead (and stay missing if both are missing).
fixed_dates['doi_year'] = fixed_dates['term_year'].fillna(fixed_dates['finalize_year'])
fixed_dates.sample(10)

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link,doi_year
54,54,1,Waco ISD,,2017.0,March,,2017.0,March,https://www.wacoisd.org/doi,2017.0
341,341,1,May ISD,District of Innovation Plan Start Date August ...,2017.0,August,"the May ISD Board of Trustees. On March 26, 20...",2017.0,May,http://toolbox1.s3-website-us-west-2.amazonaws...,2017.0
198,198,1,Riesel ISD,,2017.0,April,,2017.0,April,https://www.rieselisd.org/cms/lib/TX02215234/C...,2017.0
242,242,1,Perryton ISD,,2017.0,August,,2017.0,April,http://s3.amazonaws.com/scschoolfiles/1204/doi...,2017.0
403,403,1,Lago Vista ISD,"Therefore, this plan will begin with the 2017-...",2017.0,June,"approved the final plan on March 22, 2017. On ...",2017.0,,http://www.lagovistaisd.net/upload/page/0030/d...,2017.0
723,723,1,Burkburnett ISD,District of Innovation Plan February 2017 - Fe...,2017.0,February,District of Innovation Plan February 2017 - Fe...,2017.0,February,https://1.cdn.edl.io/NmmgpAdINIeQ8MlcQQx6MB0zP...,2017.0
514,514,1,Gregory-Portland ISD,,,,,,,http://www.g-pisd.org/uploaded/_District/Publi...,
176,176,1,Royse City ISD,"before the 4th Monday of August. For 2017-18, ...",2017.0,,"proposed plan. Monday, January 23, 2017, Speci...",2017.0,January,https://www.rcisd.org/doi/,2017.0
495,495,1,Harrold ISD,"a new teacher appraisal system in 2016-2017, c...",,,"of Innovation Plan Approved / March 30, 2017 I...",2017.0,March,https://harroldisd.socs.net/vimages/shared/vne...,2017.0
584,584,1,Eustace ISD,its recommendation for teacher evaluation begi...,,,"started at 4:00 PM. February 14, 2017 â A pu...",2017.0,February,https://s3.amazonaws.com/scschoolfiles/1670/di...,2017.0


In [16]:
# Count the rows that ended up with a doi_year, then show the distribution.
n_with_doi_year = int(pd.notnull(fixed_dates['doi_year']).sum())
print('District not missing doi year:', n_with_doi_year)
fixed_dates['doi_year'].value_counts()

District not missing doi year: 812


2017.0    588
2018.0    133
2016.0     68
2019.0     22
2015.0      1
Name: doi_year, dtype: int64

## Save

In [19]:
# Write the cleaned term/finalize fields to doi_dates.csv.
# NOTE(review): doi_year is computed earlier in this notebook but is not among
# the exported columns - confirm whether downstream code expects it here.
export_cols = ['title', 'term_year', 'term_month', 'finalize_year', 'finalize_month']
export_path = os.path.join(data_path, 'doi_dates.csv')
fixed_dates[export_cols].to_csv(export_path)

In [18]:
# Row count of the corrected table.
fixed_dates.shape[0]

824