In [33]:
import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from random import randint
from start import data_path

In [2]:
# Load the scraped DOI plan dates and report how many rows were read.
scraped_csv = os.path.join(data_path, 'doi_dates_scraped.csv')
data = pd.read_csv(scraped_csv, sep=",")
len(data)

824

There are two ways we can determine dates: the term put forward in the plan (this is almost always just a school year) or the date a plan was finalized. We prefer the term date. However, when the term is not noted, we count the implementation date as the latest date in the DOI timeline (hopefully the date the plan was adopted, `finalize_year`).

In [3]:
# Preview the title, the extracted term/finalize fields, and the source link.
preview_columns = [
    'title',
    'term_phrase', 'term_year', 'term_month',
    'finalize_phrase', 'finalize_year', 'finalize_month',
    'link',
]
data[preview_columns].head()

Unnamed: 0,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"(999, '')","hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"(2018, 'September')","go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"(999, '')","27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,"(999, '')",,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"(999, '')","of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...


In [6]:
# Scraped links
# Keep the scraped fields and tag the year/month columns with a `_scraped`
# suffix so they stay distinguishable from the manually corrected columns.
scraped_columns = ['title', 'term_phrase', 'term_year',
                   'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
scraped_renames = {
    'term_year': 'term_year_scraped',
    'finalize_year': 'finalize_year_scraped',
    'finalize_month': 'finalize_month_scraped',
}
fixdates_scraped = data[scraped_columns].rename(columns=scraped_renames)
fixdates_scraped

Unnamed: 0,title,term_phrase,term_year_scraped,finalize_phrase,finalize_year_scraped,finalize_month_scraped,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...
5,Yoakum ISD,,-999,,-999,,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
6,Yantis ISD,The District of Innovation Plan will become ef...,2017,"presented to the SBDM team on June, 2018. Yant...",2018,June,http://www.yantisisd.net/users/2017-2018/Distr...
7,Wylie ISD (221912),,-999,,-999,,http://www.wyliebulldogs.org/cms/One.aspx?port...
8,Wylie ISD (043914),INNOVATION PLAN INTRODUCTION House Bill (HB) 1...,-999,". The Committee met on December 14, 2016 and J...",2017,March,http://www.wylieisd.net/cms/lib09/TX01918453/C...
9,Wortham ISD,W Wortham ISD District of Innovation Plan 2017...,2017,,-999,,https://s3.amazonaws.com/scschoolfiles/888/wis...


In [12]:
# Manual links
# Read the hand-checked file and keep only the title, the `corrected` flag,
# and the year/month columns.
manual_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
manual_columns = ['title', 'corrected', 'term_year',
                  'finalize_year', 'finalize_month']
fixdates_manual = pd.read_csv(manual_csv, encoding='latin-1')[manual_columns]
print(len(fixdates_manual))
fixdates_manual.sample(10)

824


Unnamed: 0,title,corrected,term_year,finalize_year,finalize_month
771,Bells ISD,1.0,2017.0,,
50,Warren ISD,1.0,2017.0,2017.0,May
58,Vernon ISD,0.0,2017.0,2017.0,April
647,Corrigan-Camden ISD,1.0,2018.0,2018.0,May
454,Iowa Park CISD,1.0,2017.0,2017.0,March
138,Skidmore-Tynan ISD,1.0,,2016.0,April
754,Boling ISD,1.0,2019.0,2018.0,November
293,Nederland ISD,1.0,2017.0,2017.0,May
264,Overton ISD,1.0,,2017.0,April
137,Slaton ISD,1.0,2017.0,,


In [10]:
# Merge scraped with manual
# Left-join on title so every scraped row is kept and the manual columns
# (corrected, term_year, finalize_year, finalize_month) are attached to the
# row with the matching title.
fixdates = fixdates_scraped.merge(fixdates_manual, how='left', on='title')

# For each date field keep the manual value where the row is flagged
# corrected == 1, otherwise fall back to the scraped value. Both values are
# read from the merged frame itself: assigning from the un-merged
# fixdates_manual / fixdates_scraped frames would align on row position
# instead of title and could attach a value to the wrong district whenever
# the two files differ in row order or the merge changes the row count.
use_manual = fixdates['corrected'] == 1
for column in ['term_year', 'finalize_year', 'finalize_month']:
    fixdates[column] = fixdates[column].where(use_manual,
                                              fixdates[column + '_scraped'])

fixdates = fixdates[['corrected', 'title', 'term_phrase', 'term_year',
                     'finalize_phrase', 'finalize_year', 'finalize_month', 'link']]
# Write the combined table to fix_dates.csv for review.
fixdates.to_csv(os.path.join(data_path, 'fix_dates.csv'))
fixdates.sample(10)

Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link
132,0.0,Snyder ISD,’s Local Innovation Plan will begin with the 2...,2018.0,"for consideration and approval. February 9, 20...",2018.0,February,https://1.cdn.edl.io/7b21ktMGjSuFIU9LdTLlk0PHz...
190,1.0,Robstown ISD,,2017.0,,,,https://1.cdn.edl.io/lMSBCMGlcKq6ndXrmyK2LZdW5...
522,1.0,Granbury ISD,,2017.0,,2017.0,January,http://www.granburyisd.org/innovation
507,1.0,Hallettsville ISD,HJISD District of Innovation 2017-2021 HALLETT...,2017.0,"the HISD District of Innovation Plan. Monday, ...",2017.0,December,https://1.cdn.edl.io/FvsO0HQMGiiTwMHhA1rbf3N0m...
719,1.0,Burton ISD,Burton Independent School Distr ict Distr ict ...,2017.0,. Appoint District of Innovation Committee. Ap...,2017.0,April,https://drive.google.com/file/d/0B4mAdWnHSvKsc...
667,1.0,Coahoma ISD,Monday in August. For the 2017-2018 school yea...,2017.0,CISD Board of Trustees Public Hearing - DOI No...,2017.0,March,https://core-docs.s3.amazonaws.com/documents/a...
820,0.0,Academy ISD,2018-2019 school year and concluding at the en...,2018.0,"regularly scheduled board meeting on June 28, ...",2018.0,June,https://4.files.edl.io/1a8f/06/29/18/204245-44...
398,1.0,Lancaster ISD,EC LEGAL & EB LEGAL) Currently: House Bill 261...,,or designee for approval. Connection to Vision...,2017.0,June,https://1.cdn.edl.io/137mVbKToE1MPNCk5H3SX2QEA...
381,1.0,Lindale ISD,,,,2017.0,February,https://docs.wixstatic.com/ugd/038cbd_b30fbd78...
596,1.0,Edgewood ISD (015905),"Monday of August. Under current law, for the 2...",,,,,http://www.edgewood-isd.net/upload/page/0257/d...


### Check term and finalize year and month. Edit a value if it is incorrect or incorrectly missing. Then set `corrected` to 1 if you corrected the row, or 0 if it was already correct. Delete -999 if the value is correctly missing.

In [37]:
# Load the hand-checked dates file (read as latin-1).
corrected_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
fixed_dates = pd.read_csv(corrected_csv, encoding='latin-1')

In [38]:
# Pick a random row label for the order spot-check below.
# randint includes BOTH endpoints, so the upper bound must be the last valid
# label (len - 1); a hard-coded 824 could return 824, which does not exist in
# a frame whose default integer index runs 0..823.
value = randint(0, len(fixed_dates) - 1)
value

561

In [39]:
# Ensure order does not change between data and fixed_dates

In [40]:
# Title at the sampled row in the hand-checked file.
fixed_dates.loc[value, 'title']

'Fort Sam Houston ISD'

In [41]:
# Title at the same row in the scraped data, for comparison.
data.loc[value, 'title']

'Fort Sam Houston ISD'

In [42]:
# If order does not change,
# replace the titles in fixed_dates with the titles from data (aligned on index).
fixed_dates['title'] = data['title']

In [43]:
# Turn the -999 placeholder in term_year into NaN.
term_is_placeholder = fixed_dates['term_year'] == -999
fixed_dates.loc[term_is_placeholder, 'term_year'] = np.nan

# Distribution of term years, then the number of rows with no term year.
print(fixed_dates['term_year'].value_counts())
len(fixed_dates[fixed_dates['term_year'].isnull()])

2017.0    474
2018.0    112
2016.0     46
2019.0     18
2015.0      1
Name: term_year, dtype: int64


173

In [44]:
# Turn the -999 placeholder in finalize_year into NaN.
finalize_is_placeholder = fixed_dates['finalize_year'] == -999
fixed_dates.loc[finalize_is_placeholder, 'finalize_year'] = np.nan

# Distribution of finalize years, then the number of rows with no finalize year.
print(fixed_dates['finalize_year'].value_counts())
len(fixed_dates[fixed_dates['finalize_year'].isnull()])

2017.0    484
2018.0    126
2016.0     58
2019.0     17
Name: finalize_year, dtype: int64


139

Set `doi_year` to the term year if available; otherwise, use the finalize year.

In [46]:
# doi_year is the term year where one exists, else the finalize year.
# The float64 cast keeps the dtype of a column that was initialised with NaN.
fixed_dates['doi_year'] = (
    fixed_dates['term_year']
    .fillna(fixed_dates['finalize_year'])
    .astype('float64')
)
fixed_dates.sample(10)

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,doi_year
329,329,1.0,Mesquite ISD,,,,,,
443,443,1.0,Jim Hogg County ISD,be implemented in the proceeding academic year...,2017.0,"by Board of Trustees on June 21st, 2017 JHCISD...",2017.0,June,2017.0
204,204,1.0,Redwater ISD,Redwater Independent School District District ...,2017.0,of Redwater ISD Local Innovation Plan begins M...,2017.0,January,2017.0
789,789,1.0,Atlanta ISD,of Innovation Plan will become effective in Ma...,2017.0,"the District Site Based Committee on March 16,...",2017.0,March,2017.0
404,404,0.0,Lackland ISD,2017-2018 school year and ending in the 2021-2...,2017.0,Lackland Independent School District District ...,2017.0,March,2017.0
258,258,1.0,Palmer ISD,,2016.0,,,,2016.0
371,371,1.0,Lohn ISD,state issued a new teacher appraisal system in...,,,2017.0,June,2017.0
760,760,1.0,Blooming Grove ISD,"five years, in effect for the 2018-2019 school...",2018.0,the Blooming Grove ISD Board of Trustees: Augu...,2018.0,August,2018.0
81,81,0.0,Tom Bean ISD,Innovation Plan will become effective January ...,2018.0,"the DOIC on Wednesday, January 3, 2018. Tom Be...",2018.0,January,2018.0
336,336,0.0,McLean ISD,", Principal District of Innovation Plan 2017 â...",2017.0,"the Texas Education Code. On January 16, 2017,...",2017.0,January,2017.0


In [47]:
# Number of rows that have a doi_year, followed by the count per year.
n_with_doi_year = len(fixed_dates[fixed_dates['doi_year'].notnull()])
print('District not missing doi year:', n_with_doi_year)
fixed_dates['doi_year'].value_counts()

District not missing doi year: 784


2017.0    564
2018.0    130
2016.0     67
2019.0     22
2015.0      1
Name: doi_year, dtype: int64

## Save

In [48]:
# Write only the title and doi_year columns to doi_dates.csv.
doi_dates_csv = os.path.join(data_path, 'doi_dates.csv')
doi_dates = fixed_dates[['title', 'doi_year']]
doi_dates.to_csv(doi_dates_csv)