In [1]:
import os
%matplotlib inline
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from random import randint
import ast
from datetime import datetime
from start import data_path

In [2]:
# Load the scraped DOI date table and report how many rows it has.
scraped_csv = os.path.join(data_path, 'doi_dates_scraped.csv')
data = pd.read_csv(scraped_csv, sep=",")
len(data)

824

In [3]:
def _second_of_literal(raw):
    """Evaluate the Python literal stored in `raw` and return its element at index 1."""
    return ast.literal_eval(raw)[1]


# NOTE(review): this overwrites `term_month` in place, so re-running the cell on
# already-parsed values will presumably fail -- reload `data` first.
data['term_month'] = data['term_month'].apply(_second_of_literal)

There are two ways we can determine dates: the term put forward in the plan (this is almost always just a school year) or the date the plan was finalized. We prefer the term date. However, when the term is not noted, we use the latest date in the DOI timeline as the implementation date (hopefully the date the plan was adopted; see `finalize_year`).

In [4]:
# Preview the term / finalize fields next to the source link.
preview_columns = ['title', 'term_phrase', 'term_year', 'term_month',
                   'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
data[preview_columns].head()

Unnamed: 0,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,September,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...


# Import Previous Dates

In [5]:
# Scraped links: keep the date-related columns and tag the scraped year/month
# fields with a `_scraped` suffix so they stay distinguishable from the
# identically named columns of the manual file.
scraped_columns = ['title', 'term_phrase', 'term_year', 'term_month',
                   'finalize_phrase', 'finalize_year', 'finalize_month', 'link']
scraped_renames = {
    'term_year': 'term_year_scraped',
    'term_month': 'term_month_scraped',
    'finalize_year': 'finalize_year_scraped',
    'finalize_month': 'finalize_month_scraped',
}
fixdates_scraped = data[scraped_columns].rename(columns=scraped_renames)
fixdates_scraped

Unnamed: 0,title,term_phrase,term_year_scraped,term_month_scraped,finalize_phrase,finalize_year_scraped,finalize_month_scraped,link
0,Lake Travis ISD,: 1. Beginning with the 2017-2018 academic yea...,2017,,"hold a public meeting on December 13, 2016 to ...",2016,December,https://www.ltisdschools.org//cms/lib/Tx018000...
1,Zephyr ISD,Term The District of Innovation Plan will beco...,2018,September,"go to the Board on April 16th, 2018. Term The ...",2018,April,http://zephyrisd.net/wp-content/uploads/2014/0...
2,Zavalla ISD,This plan will be in effect for the 2017-2018 ...,2017,,"27,2017 Final version plan posted January 27, ...",2017,February,https://s3.amazonaws.com/scschoolfiles/1772/za...
3,Zapata County ISD,,-999,,,-999,,https://1.cdn.edl.io/VZ6fi1M6pn8Qk8tS3GetGewCD...
4,Yorktown ISD,Yorktown Independent School District Final Dis...,2017,,"of the letter. On May 15, 2017 the District Co...",2017,May,http://www.yisd.org/userfiles/57/my%20files/fi...
5,Yoakum ISD,,-999,,,-999,,http://www.yoakumisd.net/cms/lib3/TX01001553/C...
6,Yantis ISD,The District of Innovation Plan will become ef...,2017,June,"presented to the SBDM team on June, 2018. Yant...",2018,June,http://www.yantisisd.net/users/2017-2018/Distr...
7,Wylie ISD (221912),,-999,,,-999,,http://www.wyliebulldogs.org/cms/One.aspx?port...
8,Wylie ISD (043914),INNOVATION PLAN INTRODUCTION House Bill (HB) 1...,-999,,". The Committee met on December 14, 2016 and J...",2017,March,http://www.wylieisd.net/cms/lib09/TX01918453/C...
9,Wortham ISD,W Wortham ISD District of Innovation Plan 2017...,2017,,,-999,,https://s3.amazonaws.com/scschoolfiles/888/wis...


In [6]:
# Manual links: hand-corrected dates, one row per district title.
# NOTE(review): titles such as 'Â Lake Travis ISD' in the output look like UTF-8
# text decoded as latin-1 -- confirm the file's real encoding before relying on
# `title` as a join key.
manual_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
manual_columns = ['title', 'corrected', 'term_year', 'term_month',
                  'finalize_year', 'finalize_month']
fixdates_manual = pd.read_csv(manual_csv, encoding='latin-1')[manual_columns]
print(len(fixdates_manual))
fixdates_manual.head(10)

824


Unnamed: 0,title,corrected,term_year,term_month,finalize_year,finalize_month
0,Â Lake Travis ISD,1,2016.0,December,2016.0,December
1,Zephyr ISD,1,2018.0,September,2018.0,April
2,Zavalla ISD,1,2017.0,,2017.0,February
3,Zapata County ISD,1,2018.0,,2018.0,May
4,Yorktown ISD,1,2017.0,,2017.0,May
5,Yoakum ISD,1,2018.0,,,
6,Yantis ISD,1,2018.0,June,2018.0,June
7,Wylie ISD (221912),1,2017.0,August,2017.0,April
8,Wylie ISD (043914),1,2017.0,March,,March
9,Wortham ISD,1,2017.0,,,


In [7]:
# Merge scraped with manual
fixdates = fixdates_scraped.merge(fixdates_manual, how='left', on='title')

# After the merge, the un-suffixed term_year / finalize_year / finalize_month
# columns already hold the manual values of each merged row, so rows flagged
# corrected > 0 need no further work. Rows flagged corrected == 0 fall back to
# the scraped values.
#
# The fallback is read from the *same merged row* rather than from the
# pre-merge frames: assigning `fixdates_manual.term_year` or
# `fixdates_scraped.term_year_scraped` through .loc aligns on index labels,
# which only matches the merged rows when the merge neither duplicates nor
# reorders anything.
use_scraped = fixdates['corrected'] == 0
for column in ('term_year', 'finalize_year', 'finalize_month'):
    fixdates.loc[use_scraped, column] = fixdates.loc[use_scraped, column + '_scraped']

fixdates = fixdates[['corrected', 'title', 'term_phrase', 'term_year',
                     'finalize_phrase', 'finalize_year', 'finalize_month', 'link']]
fixdates.to_csv(os.path.join(data_path, 'fix_dates.csv'))
fixdates.sample(10)

Unnamed: 0,corrected,title,term_phrase,term_year,finalize_phrase,finalize_year,finalize_month,link
398,1.0,Lancaster ISD,EC LEGAL & EB LEGAL) Currently: House Bill 261...,,or designee for approval. Connection to Vision...,2017.0,June,https://1.cdn.edl.io/137mVbKToE1MPNCk5H3SX2QEA...
285,1.0,Newcastle ISD,,2017.0,,,,https://newcastle-isd.socs.net/pages/uploaded_...
635,1.0,Cuero ISD,District of Innovation Plan will become effect...,2017.0,"District Site Based Committee Meeting June 21,...",2018.0,June,http://www.cueroisd.org/userfiles/10/My Files/...
176,1.0,Royse City ISD,"before the 4th Monday of August. For 2017-18, ...",2017.0,"proposed plan. Monday, January 23, 2017, Speci...",2017.0,January,https://www.rcisd.org/doi/
649,1.0,Coppell ISD,"out this work. On May 31, 2016, a small group of",,the Coppell ISD Board of Trustees on November ...,2016.0,November,http://www.coppellisd.com/cms/lib09/TX01000550...
30,1.0,White Settlement ISD,purpose for tomorrow. District of Innovation P...,2017.0,"Final District of Innovation Plan July 10, 201...",2018.0,July,http://www.wsisd.com/ourpages/users/dcoyle/Cur...
172,1.0,Sabinal ISD,,2017.0,,2017.0,March,http://www.sabinalstorage.tech/requiredposting...
276,1.0,Northwest ISD,guests from campuses and school events. In 201...,,,2019.0,January,https://www.nisdtx.org/UserFiles/Servers/Serve...
39,1.0,West Orange-Cove CISD,"this year’s calendar with August 1, 2017 being...",2017.0,,,,https://1.cdn.edl.io/auJoyA24xmsCNh4FKIed8ce47...
419,1.0,Klein ISD,This plan will be in effect for the 2017-2018 ...,2017.0,")] * Monday, February 13, 2017 – Board Meeting...",2017.0,February,http://www.kleinisd.net/UserFiles/Servers/Serv...


### Check term and finalize year and month. Edit any value that is incorrect or incorrectly missing, then set `corrected` to 1 for that row. Delete -999 if the value is correctly missing.

In [8]:
# Reload the manually corrected file in full (all columns) for the date construction below.
corrected_csv = os.path.join(data_path, 'fix_dates_corrected.csv')
fixed_dates = pd.read_csv(corrected_csv, encoding='latin-1')

In [9]:
# Pick a random row label for the order spot-check below.
# randint() includes BOTH endpoints, so the upper bound must be the last valid
# label (len - 1); the previous hard-coded 824 could be drawn and would raise a
# KeyError in `.loc` on a frame whose labels run 0..823.
value = randint(0, len(fixed_dates) - 1)
value

160

In [10]:
# Spot-check one random row to ensure the row order does not change between data and fixed_dates

In [11]:
# Title of the randomly chosen row in the manually corrected file.
fixed_dates.loc[value, 'title']

'Sanger ISD'

In [12]:
# Title of the same row label in the scraped data; it should match the one above.
data.loc[value, 'title']

'Sanger ISD'

In [13]:
# If order does not change, take the titles from the scraped data
# (the assignment aligns the two frames on their row labels).
fixed_dates['title'] = data['title']

In [14]:
# -999 marks a year that could not be determined; turn it into a real missing value.
term_year_unknown = fixed_dates['term_year'] == -999
fixed_dates['term_year'] = fixed_dates['term_year'].mask(term_year_unknown)

print(fixed_dates['term_year'].value_counts())
# Number of districts without a term year.
len(fixed_dates[fixed_dates['term_year'].isnull()])

2017.0    491
2018.0    115
2016.0     46
2019.0     18
2015.0      1
Name: term_year, dtype: int64


153

In [15]:
# -999 marks a year that could not be determined; turn it into a real missing value.
finalize_year_unknown = fixed_dates['finalize_year'] == -999
fixed_dates['finalize_year'] = fixed_dates['finalize_year'].mask(finalize_year_unknown)

print(fixed_dates['finalize_year'].value_counts())
# Number of districts without a finalize year.
len(fixed_dates[fixed_dates['finalize_year'].isnull()])

2017.0    484
2018.0    125
2016.0     60
2019.0     17
Name: finalize_year, dtype: int64


138

# Create Time Stamp Variables

In [16]:
def str_int_converter(x):
    """Render a numeric year as an integer string, passing missing values through.

    Parameters
    ----------
    x : scalar
        A number such as ``2017.0``, or a missing value (``NaN`` / ``None``).

    Returns
    -------
    str, or ``x`` unchanged
        ``'2017'`` for ``2017.0``; the input itself when it is missing.
    """
    # pd.isna also accepts None, whereas np.isnan raises TypeError on it.
    if pd.isna(x):
        return x
    return str(int(x))


def date_converter(x):
    """Parse a ``'<day> <month name>, <year>'`` string into a ``datetime``.

    Parameters
    ----------
    x : object
        A string such as ``'1 August, 2017'``; any non-string value (for
        example ``NaN``) is returned unchanged.

    Returns
    -------
    datetime.datetime, or ``x`` unchanged

    Raises
    ------
    ValueError
        If ``x`` is a string that does not match ``'%d %B, %Y'``.
    """
    if not isinstance(x, str):
        return x
    return datetime.strptime(x, '%d %B, %Y')

In [17]:
# A term year without a month is given the month 'August'
# (presumably the start of the school year -- confirm).
month_needs_default = fixed_dates.term_year.notnull() & fixed_dates.term_month.isnull()
fixed_dates['term_month'] = np.where(month_needs_default, 'August', fixed_dates.term_month)

# Build '1 <month>, <year>' strings and parse them; rows missing either part
# stay missing because string concatenation with NaN yields NaN.
term_year_text = fixed_dates['term_year'].apply(str_int_converter)
term_date_text = '1 ' + fixed_dates['term_month'] + ', ' + term_year_text
fixed_dates['term_date'] = term_date_text.apply(date_converter)
fixed_dates.sample()

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link,term_date
588,588,1,Ennis ISD,,2017.0,August,,2017.0,April,https://www.ennis.k12.tx.us/cms/lib/TX02216841...,2017-08-01


In [18]:
# Build '1 <month>, <year>' strings from the finalize fields and parse them;
# rows missing either part stay missing because string concatenation with NaN
# yields NaN.
fixed_dates['finalize_date'] = '1 ' + fixed_dates['finalize_month'] + ', ' + fixed_dates['finalize_year'].apply(str_int_converter)
fixed_dates['finalize_date'] = fixed_dates['finalize_date'].apply(date_converter)
# Coerce the column to a datetime dtype. `infer_datetime_format` is dropped:
# it is deprecated in pandas 2.x and has no effect on already-parsed values.
fixed_dates['finalize_date'] = pd.to_datetime(fixed_dates.finalize_date)

fixed_dates.sample()

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link,term_date,finalize_date
703,703,1,Carroll ISD,statute inhibits the plan: Beginning with the ...,,,". The Committee met on September 26, 2017 and ...",2017.0,December,https://www.southlakecarroll.edu/cms/lib/TX022...,NaT,2017-12-01


Set doi_date as term date if available. Otherwise, finalize date

In [19]:
# Prefer the term date; where it is missing, fall back to the finalize date.
fixed_dates['doi_date'] = fixed_dates['term_date'].fillna(fixed_dates['finalize_date'])
fixed_dates.sample(10)

Unnamed: 0.1,Unnamed: 0,corrected,title,term_phrase,term_year,term_month,finalize_phrase,finalize_year,finalize_month,link,term_date,finalize_date,doi_date
599,599,1,Ector County ISD,term of the Plan will begin with the 2017-2018...,2017.0,August,"final Local Innovation Plan on April 11, 2017*...",2017.0,April,http://www.ectorcountyisd.org/cms/lib011/tx010...,2017-08-01,2017-04-01,2017-08-01
619,619,1,Denton ISD,Updated) | Terms of Use Copyright Â© 2002-2019...,,,DOI Public Hearing 12/13/16 DOI Campus Present...,2017.0,January,http://www.dentonisd.org/Page/84561,NaT,2017-01-01,2017-01-01
345,345,1,Martin's Mill ISD,is issuing a new teacher appraisal system in 2...,2019.0,January,"January 14, 2019 through January 14, 2024. Thi...",2019.0,January,http://www.martinsmillisd.net/UserFiles/Server...,2019-01-01,2019-01-01,2019-01-01
360,360,1,Lueders-Avoca ISD,"five-year period running through October 1, 20...",2017.0,September,"by the Board of Trustees September 25, 2017 20...",2017.0,September,http://www.laisd.esc14.net/page/APPROVEDDOI,2017-09-01,2017-09-01,2017-09-01
204,204,1,Redwater ISD,Redwater Independent School District District ...,2017.0,March,of Redwater ISD Local Innovation Plan begins M...,2017.0,January,https://s3.amazonaws.com/scschoolfiles/757/dis...,2017-03-01,2017-01-01,2017-03-01
79,79,1,Tornillo ISD,,2018.0,August,,2018.0,April,http://www.tisd.us/district_accountability/dis...,2018-08-01,2018-04-01,2018-08-01
48,48,1,Water Valley ISD,WILDCAT DRIVE WATER VALLEY. TEXAS 76958 325-48...,,,"District of Innovation. â¢ January 17, 2018 D...",2018.0,January,http://www.wvisd.net/cms/lib/TX01001412/Centri...,NaT,2018-01-01,2018-01-01
44,44,1,Wellington ISD,a school year before May 15th. House Bill 2610...,2017.0,February,". The Committee met on January 3, 2017 to disc...",,,https://s3.amazonaws.com/scschoolfiles/1189/di...,2017-02-01,NaT,2017-02-01
786,786,1,Avalon ISD,in effect for the next five years (2017-2022)....,2017.0,August,"the Avalon Board of Trustees on April 20, 2017...",2017.0,April,https://1.cdn.edl.io/bXdfHD9YuC8GyHmQI2BxjMTHm...,2017-08-01,2017-04-01,2017-08-01
529,529,1,Gonzales ISD,MEMORANDUM Gonzales Independent School Distric...,2017.0,August,"approved the Final Draft on April 10th, 2017 p...",2017.0,April,https://tx02217206.schoolwires.net//cms/lib/TX...,2017-08-01,2017-04-01,2017-08-01


In [20]:
# Confirm doi_date is stored as a datetime column.
fixed_dates['doi_date'].dtype

dtype('<M8[ns]')

In [26]:
# Year component of each district's DOI date.
fixed_dates.doi_date.dt.year

0      2016.0
1      2018.0
2      2017.0
3      2018.0
4      2017.0
        ...  
819    2018.0
820    2018.0
821    2018.0
822    2017.0
823    2018.0
Name: doi_date, Length: 824, dtype: float64

In [28]:
# How many districts ended up with a DOI date, and how they spread across years.
has_doi_date = fixed_dates['doi_date'].notnull()
print('District not missing doi date', len(fixed_dates[has_doi_date]))
fixed_dates.doi_date.dt.year.value_counts()

District not missing doi date 812


2017.0    588
2018.0    133
2016.0     68
2019.0     22
2015.0      1
Name: doi_date, dtype: int64

## Save

In [30]:
# Write the final title -> doi_date table (the row index is written as the first column).
doi_dates_csv = os.path.join(data_path, 'doi_dates.csv')
fixed_dates.loc[:, ['title', 'doi_date']].to_csv(doi_dates_csv)

In [31]:
# Row count of the saved table.
fixed_dates.shape[0]

824