In [1]:
import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from random import randint
from start import data_path

In [2]:
# Load the scraped DOI date table and show its row count.
scraped_csv = os.path.join(data_path, 'doi_dates_scraped.csv')
data = pd.read_csv(scraped_csv, sep=",")
len(data)

824

There are two ways we can determine dates - the term put forward in the plan (this is almost always just a school year) or the date a plan was finalized. We prefer the term date. However, when the term is not noted, we count the implementation date as the latest date in the DOI timeline (hopefully, the date the plan was adopted - finalize_year).

In [3]:
# Preview the title, term, finalize and link columns for the first rows.
preview_cols = ['title', 'term_phrase', 'term_year', 'term_month',
                'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
data[preview_cols].head()

Unnamed: 0,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"(999, '')","hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"(2018, 'September')","go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"(999, '')","27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,"(999, '')",,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"(999, '')","of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...


In [4]:
# Scraped links
# Keep the scraped fields and tag the year/month columns with a "_scraped"
# suffix so they do not share names with the manual columns loaded later.
scraped_cols = ['title', 'term_phrase', 'term_year',
                'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
scraped_renames = {
    'term_year': 'term_year_scraped',
    'finalize_year': 'finalize_year_scraped',
    'finalize_month': 'finalize_month_scraped',
}
fixdates_scraped = data[scraped_cols].rename(columns=scraped_renames)
fixdates_scraped

Unnamed: 0,title,term_phrase,term_year_scraped,finalize_phrase,finalize_year_scraped,finalize_month_scraped,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...
5,Yoakum ISD,,-999,,-999,,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
6,Yantis ISD,The District of Innovation Plan will become ef...,2017,"presented to the SBDM team on June, 2018. Yant...",2018,June,http://www.yantisisd.net/users/2017-2018/Distr...
7,Wylie ISD (221912),,-999,,-999,,http://www.wyliebulldogs.org/cms/One.aspx?port...
8,Wylie ISD (043914),INNOVATION PLAN INTRODUCTION House Bill (HB) 1...,-999,". The Committee met on December 14, 2016 and J...",2017,March,http://www.wylieisd.net/cms/lib09/TX01918453/C...
9,Wortham ISD,W Wortham ISD District of Innovation Plan 2017...,2017,,-999,,https://s3.amazonaws.com/scschoolfiles/888/wis...


In [5]:
# Manual links
# Read the corrected-dates file (latin-1 encoded) and keep only the title,
# the `corrected` flag and the year/month fields.
manual_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
manual_cols = ['title', 'corrected', 'term_year',
               'finalize_year', 'finalize_month']
fixdates_manual = pd.read_csv(manual_csv, encoding = 'latin-1')[manual_cols]
print(len(fixdates_manual))
fixdates_manual.sample(10)

824


Unnamed: 0,title,corrected,term_year,finalize_year,finalize_month
464,Hull-Daisetta ISD,1.0,,2017.0,May
525,Grady ISD,1.0,2019.0,2019.0,March
274,Nursery ISD,1.0,2017.0,2017.0,May
398,Lancaster ISD,1.0,,2017.0,June
283,Nixon-Smiley CISD,1.0,2018.0,2018.0,February
68,Utopia ISD,1.0,2018.0,2018.0,November
5,Yoakum ISD,1.0,2018.0,,
20,Wills Point ISD,1.0,2018.0,2018.0,January
194,Rivercrest ISD,1.0,2017.0,2017.0,January
818,Agua Dulce ISD,1.0,2017.0,2017.0,June


In [6]:
# Merge scraped with manual
# The left merge on 'title' attaches the manual `corrected`, `term_year`,
# `finalize_year` and `finalize_month` to the scraped row with the same title.
fixdates = fixdates_scraped.merge(fixdates_manual, how = 'left',
                                  left_on = 'title', right_on = 'title')

# For each date field keep the manual value where corrected == 1 and fall back
# to the scraped value otherwise (including rows with no manual match, where
# `corrected` is NaN).
# Both candidates are read from the merged row itself. Assigning columns of the
# pre-merge frames would align on the row index instead of on title, pairing
# the wrong rows whenever the two files are ordered differently or the merge
# changes the number of rows.
use_manual = fixdates['corrected'] == 1
for date_col in ['term_year', 'finalize_year', 'finalize_month']:
    fixdates[date_col] = fixdates[date_col].where(use_manual,
                                                  fixdates[date_col + '_scraped'])

fixdates = fixdates[['corrected',  'title', 'term_phrase', 'term_year', 'finalize_phrase', 'finalize_year', 'finalize_month', 'link']]
# The row index is written as the first CSV column (pandas default).
fixdates.to_csv(os.path.join(data_path,'fix_dates.csv'))
fixdates.sample(10)

Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link
291,0.0,New Caney ISD,1 District of Innovation Plan 2017-2022 2 I. I...,2017.0,"presented to Board for approval October 5, 201...",2018.0,October,https://tx50000191.schoolwires.net/cms/lib/TX5...
395,1.0,Latexo ISD,"a new teacher appraisal system in 2016-2017, t...",,TIMELINE 4/12/17 Adopting Resolution On April ...,2017.0,May,http://www.latexoisd.net/files/user/82/file/La...
460,0.0,Hurst-Euless-Bedford ISD,1 District of Innovation Plan 2017-2018 2 I. I...,2017.0,Board appointed Innovation Committee. October ...,2016.0,October,http://www.hebisd.edu/uploaded/Departments/Str...
566,0.0,Forney ISD,,-999.0,,-999.0,,http://media.wix.com/ugd/93fba2_25946c8c17ba44...
548,0.0,Galena Park ISD,,-999.0,,-999.0,,https://www.galenaparkisd.com/Domain/799
121,1.0,Springlake-Earth ISD,,2017.0,,2017.0,January,http://www.springlake-earth.org/publications/d...
147,1.0,Shelbyville ISD,SHELBYVILLE INDEPENDENT SCHOOL DISTRICT Distri...,2018.0,,2018.0,February,https://s3.amazonaws.com/scschoolfiles/464/dis...
410,0.0,La Joya ISD,PBMAS District Report Texas Education Agency 2...,-999.0,2011-12 -------| |------- 2010-11 -------| |--...,-999.0,,https://s3.amazonaws.com/scschoolfiles/1121/pb...
449,1.0,Italy ISD,,2017.0,,2017.0,May,http://www.italyisd.org/sites/italyisd.org/fil...
376,1.0,Littlefield ISD,LISD’s Local Innovation Plan will begin with t...,2017.0,"1842 Approved by DOI Committee May 4, 2017 App...",2017.0,May,http://www.littlefield.k12.tx.us/UserFiles/Ser...


### Check term and finalize year and month. Edit if incorrect or incorrectly missing. Then set corrected to 1 if you corrected the row, 0 if it was already correct. Delete -999 if correctly missing.

In [7]:
# Reload the corrected file with all of its columns; the cells below edit this table.
corrected_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
fixed_dates = pd.read_csv(corrected_csv, encoding = 'latin-1')

In [8]:
# Pick a random row label for the order spot check in the following cells.
# randint includes both endpoints, so the upper bound must be the last valid
# label (row count - 1). A bound equal to the row count can produce a label
# that does not exist and makes .loc raise a KeyError.
value = randint(0, len(fixed_dates) - 1)
value

168

In [9]:
# Ensure order does not change between data and fixed_dates

In [10]:
# Title at the sampled row of the corrected table.
fixed_dates.loc[value, 'title']

'Sam Rayburn ISD'

In [11]:
# Title at the same row of the scraped table; it should match the one above.
data.loc[value, 'title']

'Sam Rayburn ISD'

In [12]:
# If order does not change, take the titles from the scraped table.
# NOTE(review): presumably this replaces titles altered by the latin-1 read - confirm.
fixed_dates['title'] = data['title']

In [13]:
# Treat the -999 placeholder as missing, then show the distribution of term
# years and the number of rows without one.
term_placeholder = fixed_dates['term_year'] == -999
fixed_dates.loc[term_placeholder, 'term_year'] = np.nan

print(fixed_dates['term_year'].value_counts())
len(fixed_dates[fixed_dates['term_year'].isnull()])

2017.0    474
2018.0    112
2016.0     46
2019.0     18
2015.0      1
Name: term_year, dtype: int64


173

In [14]:
# Treat the -999 placeholder as missing, then show the distribution of finalize
# years and the number of rows without one.
finalize_placeholder = fixed_dates['finalize_year'] == -999
fixed_dates.loc[finalize_placeholder, 'finalize_year'] = np.nan

print(fixed_dates['finalize_year'].value_counts())
len(fixed_dates[fixed_dates['finalize_year'].isnull()])

2017.0    484
2018.0    126
2016.0     58
2019.0     17
Name: finalize_year, dtype: int64


139

Set doi_year to the term year if available; otherwise, use the finalize year.

In [15]:
# doi_year takes the term year where one exists and the finalize year
# otherwise; it stays missing when both are missing.
has_term = fixed_dates['term_year'].notnull()
fixed_dates['doi_year'] = np.nan
fixed_dates.loc[has_term, 'doi_year'] = fixed_dates.loc[has_term, 'term_year']
fixed_dates.loc[~has_term, 'doi_year'] = fixed_dates.loc[~has_term, 'finalize_year']
fixed_dates.sample(10)

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link,doi_year
633,633,1.0,Cumby ISD,at some point in a summer before the 2021-2022...,2017.0,"for five years, April 2017 through March 2022,...",2017.0,April,https://s3.amazonaws.com/scschoolfiles/904/dis...,2017.0
767,767,1.0,Big Sandy ISD,Independent School District Local District of ...,2017.0,,,,https://s3.amazonaws.com/scschoolfiles/1550/lo...,2017.0
112,112,0.0,Stratford ISD,Stratford ISD District of Innovation 2017-2022...,2017.0,,,,https://s3.amazonaws.com/scschoolfiles/1795/st...,2017.0
646,646,0.0,Corsicana ISD,Corsicana ISD Innovation Plan 2017-2022 Corsic...,2017.0,"final plan Board of Trustees April 3, 2017 Not...",2017.0,April,http://cisd.org/cms/lib6/TX01917765/Centricity...,2017.0
228,228,0.0,Ponder ISD,Ponder Independent School District Local Innov...,2017.0,"commissioned an Innovation Committee. March 2,...",2017.0,April,http://www.ponderisd.net/cms/lib/TX01001056/Ce...,2017.0
668,668,0.0,Clyde ISD,CLYDE CISD DISTRICT OF INNOVATION PLAN 2018 - ...,2018.0,"Clyde CISD DOI Plan 2 May 21, 2018 â Recomme...",2018.0,May,http://www.clydeisd.org/upload/page/0075/docs/...,2018.0
292,292,1.0,New Boston ISD,"the Board of Trustees on February 27, 2017 and...",,"were approved by board on December 12, 2016 Co...",2017.0,February,https://www.nbschools.net/cms/lib/TX01918140/C...,2017.0
413,413,0.0,Krum ISD,2017-2018 KRUM INDEPENDENT SCHOOL DISTRICT,2017.0,"is posted to district website April 12, 2017 P...",2017.0,April,https://1.cdn.edl.io/dSxiQnu27rRakGne1yRexOQfk...,2017.0
608,608,1.0,Dripping Springs ISD,"2016-2021 Vision 1: Village By 2021, Dripping ...",2016.0,Texas Education Code Section 28.0216. DSISD Di...,,,https://www.dsisdtx.us/cms/lib/TX02204855/Cent...,2016.0
74,74,0.0,Tulia ISD,the Professional Development and Appraisal Sys...,,,,,https://1.cdn.edl.io/SI3O9hf2zcQSivYKVORh42oTl...,


In [16]:
# Count the rows that have a doi_year, then tabulate the years.
n_with_year = fixed_dates['doi_year'].notnull().sum()
print('District not missing doi year:', n_with_year)
fixed_dates['doi_year'].value_counts()

District not missing doi year: 784


2017.0    564
2018.0    130
2016.0     67
2019.0     22
2015.0      1
Name: doi_year, dtype: int64

## Save

In [17]:
# Write title and doi_year to doi_dates.csv; the row index is written as the
# first column (pandas default).
doi_dates_out = os.path.join(data_path, 'doi_dates.csv')
fixed_dates[['title', 'doi_year']].to_csv(doi_dates_out)

In [18]:
# Row count of the table after processing.
fixed_dates.shape[0]

824