In [5]:
import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from random import randint
from start import data_path

In [6]:
# Load the scraped DOI date table (comma-separated, the read_csv default)
scraped_csv_path = os.path.join(data_path, 'doi_dates_scraped.csv')
data = pd.read_csv(scraped_csv_path)
data.shape[0]

824

There are two ways we can determine dates: the term put forward in the plan (this is almost always just a school year) or the date a plan was finalized. We prefer the term date. However, when the term is not noted, we count the implementation date as the latest date in the DOI timeline (hopefully, the date the plan was adopted - finalize_year).

In [7]:
# Preview the scraped term / finalize fields next to each district's source link
preview_columns = [
    'title',
    'term_phrase', 'term_year', 'term_month',
    'finalize_phrase', 'finalize_year', 'finalize_month',
    'link',
]
data.loc[:, preview_columns].head()

Unnamed: 0,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"(999, '')","hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"(2018, 'September')","go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"(999, '')","27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,"(999, '')",,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"(999, '')","of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...


In [8]:
# Scraped links
# Keep the scraped fields and tag the date columns with a `_scraped` suffix so
# they can sit alongside same-named columns from another source.
scraped_columns = ['title', 'term_phrase', 'term_year',
                   'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
scraped_renames = {name: name + '_scraped'
                   for name in ('term_year', 'finalize_year', 'finalize_month')}
fixdates_scraped = data[scraped_columns].rename(columns=scraped_renames)
fixdates_scraped

Unnamed: 0,title,term_phrase,term_year_scraped,finalize_phrase,finalize_year_scraped,finalize_month_scraped,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...
5,Yoakum ISD,,-999,,-999,,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
6,Yantis ISD,The District of Innovation Plan will become ef...,2017,"presented to the SBDM team on June, 2018. Yant...",2018,June,http://www.yantisisd.net/users/2017-2018/Distr...
7,Wylie ISD (221912),,-999,,-999,,http://www.wyliebulldogs.org/cms/One.aspx?port...
8,Wylie ISD (043914),INNOVATION PLAN INTRODUCTION House Bill (HB) 1...,-999,". The Committee met on December 14, 2016 and J...",2017,March,http://www.wylieisd.net/cms/lib09/TX01918453/C...
9,Wortham ISD,W Wortham ISD District of Innovation Plan 2017...,2017,,-999,,https://s3.amazonaws.com/scschoolfiles/888/wis...


In [9]:
# Manual links
# Read the hand-corrected file (latin-1 encoded) and keep only the fields
# needed to reconcile against the scraped values.
manual_columns = ['title', 'corrected', 'term_year',
                  'finalize_year', 'finalize_month']
manual_csv_path = os.path.join(data_path, 'fix_dates_corrected.csv')
fixdates_manual = pd.read_csv(manual_csv_path, encoding='latin-1')[manual_columns]
print(len(fixdates_manual))
fixdates_manual.sample(10)

824


Unnamed: 0,title,corrected,term_year,finalize_year,finalize_month
783,Axtell ISD,1.0,2017.0,2017.0,May
381,Lindale ISD,1.0,,2017.0,February
710,Cameron ISD,1.0,2018.0,2018.0,March
702,Carrollton Farmers Branch ISD,1.0,2018.0,,
301,Muleshoe ISD,0.0,2017.0,2017.0,March
50,Warren ISD,1.0,2017.0,2017.0,May
693,Centerville ISD (145902),1.0,2017.0,2017.0,April
583,Evadale ISD,1.0,2018.0,,
262,Paint Creek ISD,2.0,2017.0,,
814,Aledo ISD,1.0,,2019.0,February


In [10]:
# Merge scraped with manual
# The left merge on `title` puts both versions of each date field on the same
# row: the manual values keep their original names (term_year, finalize_year,
# finalize_month) and the scraped values carry the `_scraped` suffix.
fixdates = fixdates_scraped.merge(fixdates_manual, how='left', on='title')

# Rows flagged corrected > 0 (and rows with no flag) keep the manual values that
# the merge already placed in the unsuffixed columns. Rows flagged corrected == 0
# take the scraped values instead. Both sides are read from the merged frame
# itself, so each row is matched by title. Assigning from the separate
# fixdates_manual / fixdates_scraped frames would align on positional index and
# could silently pick up another district's values if row order or row count
# differs after the merge.
uncorrected = fixdates['corrected'] == 0
for date_col in ['term_year', 'finalize_year', 'finalize_month']:
    fixdates.loc[uncorrected, date_col] = fixdates.loc[uncorrected, date_col + '_scraped']

fixdates = fixdates[['corrected', 'title', 'term_phrase', 'term_year',
                     'finalize_phrase', 'finalize_year', 'finalize_month', 'link']]
# The row index is written as the first CSV column.
fixdates.to_csv(os.path.join(data_path, 'fix_dates.csv'))
fixdates.sample(10)

Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link
488,0.0,Hays CISD,District of Innovation Plan Effective December...,2018.0,District of Innovation Plan Effective December...,2018.0,December,https://www.hayscisd.net/cms/lib/TX02204837/Ce...
215,1.0,Prosper ISD,"five years, beginning at the start of the 2017...",2017.0,"November 28, 2016 • December 12, 2016 • Januar...",2017.0,January,http://www.prosper-isd.net/cms/lib5/TX01918217...
153,1.0,Scurry-Rosser ISD,,2017.0,,2017.0,February,http://www.scurry-rosser.com/Common/News2/Home...
599,1.0,Ector County ISD,term of the Plan will begin with the 2017-2018...,2017.0,"final Local Innovation Plan on April 11, 2017*...",2017.0,April,http://www.ectorcountyisd.org/cms/lib011/tx010...
412,0.0,La Gloria ISD,"Plan July 25, 2018 Notify Commissioner August ...",2018.0,"School Board Votes to Approve Plan July 25, 20...",2018.0,August,http://www.lagloriaisd.esc2.net/pdf/La%20Glori...
222,1.0,Pottsboro ISD,,2017.0,,2017.0,March,https://www.pottsboroisd.org/district-of-innov...
340,1.0,Maypearl ISD,,,,2017.0,April,http://www.maypearlisd.org/cms/lib6/TX01918083...
543,0.0,Garrison ISD,ISD’s Innovation Plan will begin with the 2019...,2019.0,"public input for 30 days February 25, 2019 GIS...",2019.0,February,http://www.garrisonisd.com/uploads/1/3/7/6/137...
202,0.0,Rice CISD,Beginning with the start of the 2018-2019 scho...,2018.0,District of Innovation approved by board. Augu...,2018.0,August,http://www.ricecisd.org/upload/page/0035/docs/...
115,0.0,Stephenville ISD,in effect for five years beginning in January ...,2017.0,the SISD District of Innovation Plan. ▸ Januar...,2017.0,January,https://1.cdn.edl.io/RvEnFuQMk19kWAtfNLoJR4h6K...


### Check term and finalize year and month. Edit if incorrect or incorrectly missing. Then change corrected to 1 if corrected, 0 if correct (2 if corrected from TEA). Delete -999 if correctly missing.

In [11]:
# Load the hand-corrected dates file with every column (latin-1 encoded)
corrected_csv_path = os.path.join(data_path, 'fix_dates_corrected.csv')
fixed_dates = pd.read_csv(corrected_csv_path, encoding='latin-1')

In [12]:
# Pick a random row label for a spot check of row order.
# randint includes both endpoints, so the upper bound must be the last valid
# label (len - 1); a bound equal to the row count can raise KeyError on lookup.
value = randint(0, len(data) - 1)
value

682

In [13]:
# Ensure order does not change between data and fixed_dates

In [14]:
# Title at the sampled row in the corrected file
fixed_dates.loc[value, 'title']

'China Spring ISD'

In [15]:
# Title at the same row in the scraped data, for comparison
data.loc[value, 'title']

'China Spring ISD'

In [16]:
# If order does not change, overwrite the corrected file's titles with the
# scraped titles (assignment aligns on row index).
fixed_dates['title'] = data['title']

In [17]:
# Treat the -999 sentinel as missing
term_is_sentinel = fixed_dates['term_year'].eq(-999)
fixed_dates.loc[term_is_sentinel, 'term_year'] = np.nan

# Distribution of known term years, then the number of rows still missing one
print(fixed_dates['term_year'].value_counts())
int(fixed_dates['term_year'].isnull().sum())

2017.0    496
2018.0    115
2016.0     48
2019.0     19
2015.0      1
Name: term_year, dtype: int64


145

In [18]:
# Treat the -999 sentinel as missing
finalize_is_sentinel = fixed_dates['finalize_year'].eq(-999)
fixed_dates.loc[finalize_is_sentinel, 'finalize_year'] = np.nan

# Distribution of known finalize years, then the number of rows still missing one
print(fixed_dates['finalize_year'].value_counts())
int(fixed_dates['finalize_year'].isnull().sum())

2017.0    484
2018.0    126
2016.0     58
2019.0     17
Name: finalize_year, dtype: int64


139

Set doi_year to the term year if available; otherwise, use the finalize year.

In [20]:
# doi_year is the term year where present, otherwise the finalize year
# (it stays missing when both are missing).
fixed_dates['doi_year'] = fixed_dates['term_year'].fillna(fixed_dates['finalize_year'])
fixed_dates.sample(10)

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link,doi_year
309,309,1.0,Morton ISD,. The Plan will take effect for the 2017-2018 ...,2017.0,"the Texas Education Code. On January 19, 2017 ...",2017.0,April,https://drive.google.com/file/d/0ByTbbvh_1OW_c...,2017.0
507,507,1.0,Hallettsville ISD,HJISD District of Innovation 2017-2021 HALLETT...,2017.0,"the HISD District of Innovation Plan. Monday, ...",2017.0,December,https://1.cdn.edl.io/FvsO0HQMGiiTwMHhA1rbf3N0m...,2017.0
784,784,1.0,Avinger ISD,The District of Innovation Plan will become ef...,2017.0,"to Commissioner of Education ï£ May 11, 2017 ...",2017.0,May,https://s3.amazonaws.com/scschoolfiles/887/dis...,2017.0
336,336,0.0,McLean ISD,", Principal District of Innovation Plan 2017 â...",2017.0,"the Texas Education Code. On January 16, 2017,...",2017.0,January,http://www.mcleanisd.com/vimages/shared/vnews/...,2017.0
445,445,1.0,Jayton-Girard ISD,"is for five years, beginning in August 2017 an...",2017.0,"Board Final Vote of Acceptance April 25, 2017 ...",2017.0,April,https://4.files.edl.io/95e6/11/01/18/153754-f1...,2017.0
554,554,1.0,Friendswood ISD,"which was adopted and implemented in August, 2...",2017.0,"for five years, beginning February 14, 2017, p...",2017.0,February,http://myfisd.com/wp-content/uploads/2017/02/F...,2017.0
587,587,1.0,Era ISD,ERA ISD DISTRICT OF INNOVATION PLAN 2016-2021 ...,2016.0,"Era ISD Website. â¢ March 18, 2019 Board of T...",2016.0,July,http://www.eraisd.net/site/handlers/filedownlo...,2016.0
370,370,1.0,Lometa ISD,,2018.0,,2018.0,January,https://drive.google.com/file/d/1DqPY5G88CnmpB...,2018.0
267,267,1.0,Olton ISD,,2017.0,,2017.0,February,http://www.oltonisd.net/vimages/shared/vnews/s...,2017.0
18,18,2.0,Windthorst ISD,,2017.0,,,,http://www.windthorstisd.net/Uploads/50/misc/f...,2017.0


In [21]:
# Count rows with a resolved doi_year, then show the distribution by year
districts_with_year = fixed_dates['doi_year'].notnull().sum()
print('District not missing doi year:', districts_with_year)
fixed_dates['doi_year'].value_counts()

District not missing doi year: 812


2017.0    586
2018.0    133
2016.0     69
2019.0     23
2015.0      1
Name: doi_year, dtype: int64

## Save

In [22]:
# Write the district title and resolved DOI year (row index included as the first column)
doi_output_path = os.path.join(data_path, 'doi_dates.csv')
fixed_dates.loc[:, ['title', 'doi_year']].to_csv(doi_output_path)

In [23]:
# Row count of the final table
fixed_dates.shape[0]

824