In [32]:
import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from random import randint
import ast
from datetime import datetime
from start import data_path

In [33]:
# Load the scraped DOI date table and report how many rows it holds.
scraped_csv_path = os.path.join(data_path, 'doi_dates_scraped.csv')
data = pd.read_csv(scraped_csv_path)
data.shape[0]

824

In [34]:
def _second_element(raw):
    """Evaluate the literal stored in `raw` and return its item at index 1."""
    return ast.literal_eval(raw)[1]


# Each raw term_month cell holds a Python literal; keep only its second item.
# Note: this cell is not re-runnable, since the parsed values are no longer literals.
data['term_month'] = data['term_month'].apply(_second_element)

There are two ways to determine a plan's date: the term put forward in the plan (almost always just a school year) or the date the plan was finalized. We prefer the term date. When the term is not noted, we count the implementation date as the latest date in the DOI timeline, which is hopefully the date the plan was adopted (`finalize_year`).

In [35]:
# Preview the scraped term/finalize fields for the first few districts.
preview_columns = ['title', 'term_phrase', 'term_year', 'term_month',
                   'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
data.loc[:, preview_columns].head()

Unnamed: 0,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,September,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...


# Import Previous Dates

In [36]:
# Scraped links
# Keep the scraped date fields and tag the year/month columns with a
# `_scraped` suffix so they stay distinguishable from the manual columns.
scraped_columns = ['title', 'term_phrase', 'term_year', 'term_month',
                   'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
suffixed_columns = ('term_year', 'term_month', 'finalize_year', 'finalize_month')
scraped_renames = {name: name + '_scraped' for name in suffixed_columns}

fixdates_scraped = data[scraped_columns].rename(columns=scraped_renames)
fixdates_scraped

Unnamed: 0,title,term_phrase,term_year_scraped,term_month_scraped,finalize_phrase,finalize_year_scraped,finalize_month_scraped,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,September,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...
5,Yoakum ISD,,-999,,,-999,,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
6,Yantis ISD,The District of Innovation Plan will become ef...,2017,June,"presented to the SBDM team on June, 2018. Yant...",2018,June,http://www.yantisisd.net/users/2017-2018/Distr...
7,Wylie ISD (221912),,-999,,,-999,,http://www.wyliebulldogs.org/cms/One.aspx?port...
8,Wylie ISD (043914),INNOVATION PLAN INTRODUCTION House Bill (HB) 1...,-999,,". The Committee met on December 14, 2016 and J...",2017,March,http://www.wylieisd.net/cms/lib09/TX01918453/C...
9,Wortham ISD,W Wortham ISD District of Innovation Plan 2017...,2017,,,-999,,https://s3.amazonaws.com/scschoolfiles/888/wis...


In [37]:
# Manual links
# Load the hand-corrected dates and keep only the columns used in the merge.
# NOTE(review): text read with latin-1 shows mojibake in the rendered output
# (e.g. a stray leading character on the first title); the file may actually
# be UTF-8 -- confirm before changing the encoding.
manual_csv_path = os.path.join(data_path, 'fix_dates_corrected.csv')
manual_columns = ['title', 'corrected', 'term_year', 'term_month',
                  'finalize_year', 'finalize_month']
fixdates_manual = pd.read_csv(manual_csv_path, encoding='latin-1')[manual_columns]
print(len(fixdates_manual))
fixdates_manual.head(10)

824


Unnamed: 0,title,corrected,term_year,term_month,finalize_year,finalize_month
0,Â Lake Travis ISD,1,2016.0,December,2016.0,December
1,Zephyr ISD,1,2018.0,September,2018.0,April
2,Zavalla ISD,1,2017.0,,2017.0,February
3,Zapata County ISD,1,2018.0,,2018.0,May
4,Yorktown ISD,1,2017.0,,2017.0,May
5,Yoakum ISD,1,2018.0,,,
6,Yantis ISD,1,2018.0,June,2018.0,June
7,Wylie ISD (221912),1,2017.0,August,2017.0,April
8,Wylie ISD (043914),1,2017.0,March,,March
9,Wortham ISD,1,2017.0,,,


In [38]:
# Merge scraped with manual
# The left merge keeps every scraped row and attaches the manual columns
# (corrected, term_year, term_month, finalize_year, finalize_month) by title.
fixdates = fixdates_scraped.merge(fixdates_manual, how='left', on='title')

# Rows with corrected > 0 already carry the manual values from the merge, so
# nothing needs to be copied for them. Rows with corrected == 0 fall back to
# the scraped values. The fallback is read from the merged frame's own
# `*_scraped` columns: assigning from the pre-merge frames would align on row
# labels rather than on title, which is only right when the merge happens to
# preserve a one-to-one positional correspondence.
# Rows whose title found no manual match have corrected == NaN and are left
# with missing values.
uncorrected = fixdates['corrected'] == 0
for column in ('term_year', 'finalize_year', 'finalize_month'):
    fixdates.loc[uncorrected, column] = fixdates.loc[uncorrected, column + '_scraped']

# Keep the columns needed for the manual review pass and write them out.
review_columns = ['corrected', 'title', 'term_phrase', 'term_year',
                  'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
fixdates = fixdates[review_columns]
fixdates.to_csv(os.path.join(data_path, 'fix_dates.csv'))
fixdates.sample(10)

Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link
465,1.0,Hughes Springs ISD,"starting school earlier, 5 days earlier for 20...",2017.0,Hughes Springs ISD District of Innovation Plan...,2017.0,April,http://www.hsisd.net/upload/page/0061/docs/HSI...
636,1.0,Crowley ISD,and will terminate at the end of the 2021-2022...,,"board for final approval on March 30, 2017. Th...",2017.0,March,https://www.crowleyisdtx.org/cms/lib5/TX019177...
755,1.0,Boles ISD,"up to five years. Wednesday June 1, 2016 SBDM ...",2017.0,"by ⅔ majority vote on February 7, 2017 March 2...",2017.0,March,https://s3.amazonaws.com/scschoolfiles/982/dis...
433,1.0,Karnes City ISD,84th Legislature passed House Bill (HB) 1842 w...,2017.0,"committee approved the plan on April 3, 2017. ...",2017.0,April,http://s3.amazonaws.com/scschoolfiles/1013/app...
574,1.0,Farwell ISD,FARWELL ISD District of Innovation Plan School...,2017.0,"hearing regarding the Plan on March 20, 2017 a...",2017.0,March,http://www.farwellschools.org/upload/page/0033...
718,1.0,Bushland ISD,2017 – 2018 school year through the 2021 – 202...,2017.0,"December 9 th , 2016 to January 7 th , 2017. T...",2017.0,January,http://www.bushlandisd.net/UserFiles/Servers/S...
773,1.0,Beeville ISD,,2017.0,,2017.0,March,https://s3.amazonaws.com/scschoolfiles/380/bis...
176,1.0,Royse City ISD,"before the 4th Monday of August. For 2017-18, ...",2017.0,"proposed plan. Monday, January 23, 2017, Speci...",2017.0,January,https://www.rcisd.org/doi/
75,1.0,Troup ISD,,2017.0,,,,https://www.troupisd.org/1617/innovation.pdf
220,1.0,Prairiland ISD,Term The District of Innovation Plan will beco...,2017.0,Post District of Innovation Plan on PISD websi...,2017.0,March,https://s3.amazonaws.com/scschoolfiles/233/dis...


### Check term and finalize year and month. Edit if incorrect or incorrectly missing. Then change corrected to 1 if corrected. Delete -999 if correctly missing.

In [39]:
# Reload the hand-corrected file, this time keeping every column.
corrected_csv_path = os.path.join(data_path, 'fix_dates_corrected.csv')
fixed_dates = pd.read_csv(corrected_csv_path, encoding='latin-1')

In [40]:
# Pick a random row label for the order spot-check in the following cells.
# random.randint includes both endpoints, so the upper bound must be the last
# valid label (len - 1); randint(0, 824) could return 824, which does not
# exist in an 824-row frame and would make the .loc lookups raise KeyError.
value = randint(0, len(fixed_dates) - 1)
value

325

In [41]:
# Ensure order does not change between data and fixed_dates

In [42]:
# Title of the sampled row in the hand-corrected table.
fixed_dates.loc[value, 'title']

'Midland ISD'

In [43]:
# Title of the same row label in the scraped table, for comparison.
data.loc[value, 'title']

'Midland ISD'

In [44]:
# If order does not change, take the titles from the scraped table.
# The assignment aligns on the row index, so a length or index mismatch would
# silently leave NaN titles; fail loudly instead. This guard does not replace
# the order spot-check: it only verifies that the two indexes are identical.
assert fixed_dates.index.equals(data.index), (
    'fixed_dates and data must share the same index before copying titles')
fixed_dates['title'] = data['title']

In [45]:
# Treat the -999 placeholder as a missing term year.
term_placeholder_rows = fixed_dates['term_year'].eq(-999)
fixed_dates.loc[term_placeholder_rows, 'term_year'] = np.nan

# Distribution of known term years, then the number of rows still missing one.
term_year_counts = fixed_dates['term_year'].value_counts()
print(term_year_counts)
int(fixed_dates['term_year'].isna().sum())

2017.0    491
2018.0    115
2016.0     46
2019.0     18
2015.0      1
Name: term_year, dtype: int64


153

In [46]:
# Treat the -999 placeholder as a missing finalize year.
finalize_placeholder_rows = fixed_dates['finalize_year'].eq(-999)
fixed_dates.loc[finalize_placeholder_rows, 'finalize_year'] = np.nan

# Distribution of known finalize years, then the number of rows still missing one.
finalize_year_counts = fixed_dates['finalize_year'].value_counts()
print(finalize_year_counts)
int(fixed_dates['finalize_year'].isna().sum())

2017.0    484
2018.0    125
2016.0     60
2019.0     17
Name: finalize_year, dtype: int64


138

# Create Time Stamp Variables

In [47]:
def str_int_converter(x):
    """Return `x` unchanged when it is NaN, otherwise its integer part as a string.

    For example 2017.0 becomes '2017'.
    """
    if np.isnan(x):
        return x
    return str(int(x))


def date_converter(x):
    """Parse a '<day> <Month name>, <year>' string into a datetime.

    Anything that is not a string (e.g. NaN) is passed through untouched.
    """
    if isinstance(x, str):
        return datetime.strptime(x, '%d %B, %Y')
    return x

In [48]:
# A term year without a month defaults to August.
needs_default_month = fixed_dates['term_year'].notnull() & fixed_dates['term_month'].isnull()
fixed_dates['term_month'] = np.where(needs_default_month, 'August', fixed_dates['term_month'])

# Build "1 <Month>, <Year>" strings; a missing month or year propagates as NaN.
term_text = '1 ' + fixed_dates['term_month'] + ', ' + fixed_dates['term_year'].apply(str_int_converter)

# Parse with an explicit format so the column is always datetime64 (NaN -> NaT)
# instead of depending on dtype inference over per-row datetime objects.
fixed_dates['term_date'] = pd.to_datetime(term_text, format='%d %B, %Y')
fixed_dates.sample()

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link,term_date
537,537,1,Giddings ISD,", 2019- 2020, 2020-2021, 2021-2022, and 2022-2...",2018.0,August,"District. March 20 â April 18, 2018 â The ...",2018.0,April,https://4.files.edl.io/aa1d/05/03/18/132155-6c...,2018-08-01


In [49]:
# Build "1 <Month>, <Year>" strings; a missing month or year propagates as NaN.
finalize_text = ('1 ' + fixed_dates['finalize_month'] + ', '
                 + fixed_dates['finalize_year'].apply(str_int_converter))

# One explicit-format parse replaces the per-row strptime plus a second
# to_datetime pass; the `infer_datetime_format` argument used before is
# deprecated in pandas 2.x. NaN entries become NaT.
fixed_dates['finalize_date'] = pd.to_datetime(finalize_text, format='%d %B, %Y')

fixed_dates.sample()

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link,term_date,finalize_date
405,405,1,LaPoynor ISD,,2017.0,August,,,,https://1.cdn.edl.io/lG8iShCiFwO4dVHrm9O0ryw1e...,2017-08-01,NaT


Set `doi_date` to the term date if available; otherwise, use the finalize date.

In [50]:
# Use the term date where present; fall back to the finalize date otherwise.
fixed_dates['doi_date'] = fixed_dates['term_date'].fillna(fixed_dates['finalize_date'])
fixed_dates.sample(10)

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link,term_date,finalize_date,doi_date
525,525,1,Grady ISD,,2019.0,August,,2019.0,March,https://core-docs.s3.amazonaws.com/documents/a...,2019-08-01,2019-03-01,2019-08-01
459,459,1,Hutto ISD,,,,,2016.0,January,https://d3jc3ahdjad7x7.cloudfront.net/RqRUMg6y...,NaT,2016-01-01,2016-01-01
4,4,1,Yorktown ISD,Yorktown Independent School District Final Dis...,2017.0,August,"of the letter. On May 15, 2017 the District Co...",2017.0,May,http://www.yisd.org/userfiles/57/my%20files/fi...,2017-08-01,2017-05-01,2017-08-01
386,386,1,Lewisville ISD,Innovation Lewisville ISD Local Innovation Pla...,2017.0,May,"Local Innovation Plan begins on May 16, 2017, ...",2017.0,February,https://www.lisd.net/cms/lib/TX01918037/Centri...,2017-05-01,2017-02-01,2017-05-01
429,429,1,Keller ISD,"for five years, beginning January 1, 2017, and...",2017.0,January,"of Innovation Committee Approved: November 3, ...",2017.0,January,https://www.kellerisd.net/cms/lib/TX02215599/C...,2017-01-01,2017-01-01,2017-01-01
475,475,1,Holliday ISD,span from the beginning of the 2017-2018 schoo...,2017.0,August,,,,https://s3.amazonaws.com/scschoolfiles/1648/ho...,2017-08-01,NaT,2017-08-01
650,650,1,Cooper ISD,"effective for five years, beginning in the 201...",2017.0,August,"the Board of Trustees on January 2, 2017. A Di...",2017.0,January,https://s3.amazonaws.com/scschoolfiles/821/cis...,2017-08-01,2017-01-01,2017-08-01
416,416,1,Knox City-O’Brien CISD,Consolidated Independent School District Local...,,,The Local Innovation Committee then met on Mar...,2017.0,March,http://www.knoxcityschools.net/vimages/shared/...,NaT,2017-03-01,2017-03-01
389,389,1,Leonard ISD,"DEIC). Thursday, December 1, 2016 â 3:45p.m....",,,"innovation plan. Wednesday, March 1, 2017 - 3:...",2017.0,March,http://www.leonardisd.net/upload/page/0085/LIS...,NaT,2017-03-01,2017-03-01
453,453,1,Ira ISD,for District of Innovation designation August ...,2016.0,August,"of expectations and uses. February 3, 2017 Req...",2017.0,February,http://www.ira.esc14.net/upload/page/0001/imag...,2016-08-01,2017-02-01,2016-08-01


In [51]:
# Confirm the combined column is a datetime dtype.
fixed_dates['doi_date'].dtype

dtype('<M8[ns]')

In [52]:
# Year component of each district's DOI date (NaN where the date is missing).
doi_dates = fixed_dates['doi_date']
doi_dates.dt.year

0      2016.0
1      2018.0
2      2017.0
3      2018.0
4      2017.0
        ...  
819    2018.0
820    2018.0
821    2018.0
822    2017.0
823    2018.0
Name: doi_date, Length: 824, dtype: float64

In [53]:
# Count the districts that ended up with a DOI date, then tabulate by year.
has_doi_date = fixed_dates['doi_date'].notna()
print('District not missing doi date', int(has_doi_date.sum()))
fixed_dates['doi_date'].dt.year.value_counts()

District not missing doi date 812


2017.0    588
2018.0    133
2016.0     68
2019.0     22
2015.0      1
Name: doi_date, dtype: int64

## Save

In [54]:
# Write the title / DOI date pairs (with the row index) to doi_dates.csv.
doi_dates_path = os.path.join(data_path, 'doi_dates.csv')
fixed_dates.loc[:, ['title', 'doi_date']].to_csv(doi_dates_path)

In [55]:
# Final row count of the saved table.
fixed_dates.shape[0]

824