# One-hot encode, clean, and save

In [10]:
import pandas as pd
import csv
import os
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from start import data_path

In [11]:
# Read the DOI dates/exemptions table from the project data directory.
doi_csv_path = os.path.join(data_path, 'doi_dates_and_exemptions.csv')
doi_df = pd.read_csv(doi_csv_path)

## Create one-hot variables for laws

In [12]:
def make_list(string):
    """Turn a bracketed, ', '-separated string into a list of items.

    The input is split on the two-character separator ', ' and every
    '[' and ']' character is removed from each piece. Nothing else is
    stripped (quotes and other whitespace are kept as-is).

    Note that a string with no separator yields a single-element list,
    so '[]' becomes [''] rather than an empty list.
    """
    pieces = []
    for chunk in string.split(', '):
        without_open = chunk.replace('[', '')
        pieces.append(without_open.replace(']', ''))
    return pieces

# Parse each row's 'possible_laws' string into a Python list, in place.
doi_df['possible_laws'] = doi_df['possible_laws'].map(make_list)

In [13]:
# One-hot encode the per-row law lists and attach them to the other columns.
mlb = MultiLabelBinarizer()

# pop() removes 'possible_laws' from doi_df itself, so the join below
# operates on the remaining columns only.
law_lists = doi_df.pop('possible_laws')
law_indicators = pd.DataFrame(
    mlb.fit_transform(law_lists),
    columns=mlb.classes_,   # one indicator column per distinct list item
    index=doi_df.index,     # keep rows aligned with doi_df for the join
)
plans_onehot = doi_df.join(law_indicators)
plans_onehot.head(10)

Unnamed: 0.2,Unnamed: 0,title,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,level,type,link,text,...,551.083,551.084,552.023,552.024,7.056,75.600,8.210,921.406,97.1001,97.1055
0,0,Abbott ISD,780.0,2481.0,2481,2481,Second,pdf,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,...,0,0,0,0,0,0,0,0,0,0
1,1,Abernathy ISD,779.0,871.0,871,871,Second,pdf,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,...,0,0,0,0,0,0,0,0,0,0
2,2,Abilene ISD,778.0,85.0,85,85,First,pdf,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,...,0,0,0,0,0,0,0,0,0,0
3,3,Academy ISD,777.0,2845.0,2845,2845,Second,docx,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,...,0,0,0,0,0,0,0,0,0,0
4,4,Adrian ISD,776.0,169.0,169,169,First,pdf,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,...,0,0,0,0,0,0,0,0,0,0
5,5,Agua Dulce ISD,775.0,1868.0,1868,1868,Second,pdf,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,...,0,0,0,0,0,0,0,0,0,0
6,6,Alamo Heights ISD,774.0,1824.0,1824,1824,Second,pdf,http://www.ahisd.net/UserFiles/Servers/Server_...,1 | P a g e Mission Statement The Alamo Height...,...,0,0,0,0,0,0,0,0,0,0
7,7,Albany ISD,773.0,1886.0,1886,1886,Second,pdf,http://www.albanyisd.net/uploads/4/4/4/1/44419...,Microsoft Word - innovation_plan.docx Albany I...,...,0,0,0,0,0,0,0,0,0,0
8,8,Aldine ISD,772.0,2681.0,2681,2681,Second,pdf,http://insidealdine.com/wp-content/uploads/201...,Aldine District of Innovation Plan Approved by...,...,0,0,0,0,0,0,0,0,0,0
9,9,Alice ISD,771.0,176.0,176,176,First,google,https://drive.google.com/file/d/1v1wysSRUqD5FX...,Microsoft Word - District of Innovation Docume...,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Discard the 'Unnamed: ...' columns.
# NOTE(review): presumably these are index columns left over from earlier
# CSV round-trips -- confirm against the upstream notebooks.
unnamed_columns = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0.1.1',
    'Unnamed: 0.1.1.1',
    'Unnamed: 0.1.1.1.1',
]
plans_onehot = plans_onehot.drop(columns=unnamed_columns)
plans_onehot.head()

Unnamed: 0,title,level,type,link,text,p_innovation,term_year,term_month,term_phrase,term_p,...,551.083,551.084,552.023,552.024,7.056,75.600,8.210,921.406,97.1001,97.1055
0,Abbott ISD,Second,pdf,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,0.999949,2023,March,for the next five years (March 2018-February 2...,0.987736,...,0,0,0,0,0,0,0,0,0,0
1,Abernathy ISD,Second,pdf,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,0.99834,-999,,of Innovation Plan – Draft Introduction House ...,0.025909,...,0,0,0,0,0,0,0,0,0,0
2,Abilene ISD,First,pdf,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,0.999955,2017,April,ABILENE INDEPENDENT SCHOOL DISTRICT Local Inno...,0.846878,...,0,0,0,0,0,0,0,0,0,0
3,Academy ISD,Second,docx,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,0.999836,2017,,2018-2019 school year and concluding at the en...,0.999955,...,0,0,0,0,0,0,0,0,0,0
4,Adrian ISD,First,pdf,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,0.999945,2018,June,term of up to five years beginning June 2018 a...,0.999955,...,0,0,0,0,0,0,0,0,0,0


### Rename exemption columns so they start with `reg`

In [15]:
# Step 1: prefix *every* column with 'reg' and replace '.' with '_',
# e.g. '551.083' -> 'reg551_083'.
plans_onehot.columns = [
    'reg' + str(column).replace('.', '_') for column in plans_onehot.columns
]

# Step 2: drop the column whose name is now exactly 'reg', i.e. the one that
# had an empty name before the prefix was added.
plans_onehot = plans_onehot.drop(columns=['reg'])

# Step 3: give the non-exemption columns their plain names back
# ('title' becomes 'district' and 'p_innovation' becomes 'p_doi').
# Keys that are not present in the frame are ignored by rename().
restored_names = {
    'regtitle': 'district',
    'reglevel': 'level',
    'regtype': 'type',
    'reglink': 'link',
    'regtext': 'text',
    'regp_innovation': 'p_doi',
    'regdate': 'date',
    'regdate_phrase': 'date_phrase',
    'regdate_p': 'date_p',
    'regterm_year': 'term_year',
    'regterm_month': 'term_month',
    'regterm_phrase': 'term_phrase',
    'regterm_p': 'term_p',
    'regdate_ay': 'date_ay',
    'regdate_month': 'date_month',
    'regdate_year': 'date_year',
    'regfinalize_month': 'finalize_month',
    'regfinalize_p': 'finalize_p',
    'regfinalize_phrase': 'finalize_phrase',
    'regfinalize_year': 'finalize_year',
    'regdoi_year': 'doi_year',
}
# index=str also converts the row labels to strings.
plans_onehot = plans_onehot.rename(index=str, columns=restored_names)
#plans_onehot = plans_onehot.drop(columns = ['regindex'])
plans_onehot.head(10)

Unnamed: 0,district,level,type,link,text,p_doi,term_year,term_month,term_phrase,term_p,...,reg551_083,reg551_084,reg552_023,reg552_024,reg7_056,reg75_600,reg8_210,reg921_406,reg97_1001,reg97_1055
0,Abbott ISD,Second,pdf,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,0.999949,2023,March,for the next five years (March 2018-February 2...,0.987736,...,0,0,0,0,0,0,0,0,0,0
1,Abernathy ISD,Second,pdf,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,0.99834,-999,,of Innovation Plan – Draft Introduction House ...,0.025909,...,0,0,0,0,0,0,0,0,0,0
2,Abilene ISD,First,pdf,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,0.999955,2017,April,ABILENE INDEPENDENT SCHOOL DISTRICT Local Inno...,0.846878,...,0,0,0,0,0,0,0,0,0,0
3,Academy ISD,Second,docx,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,0.999836,2017,,2018-2019 school year and concluding at the en...,0.999955,...,0,0,0,0,0,0,0,0,0,0
4,Adrian ISD,First,pdf,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,0.999945,2018,June,term of up to five years beginning June 2018 a...,0.999955,...,0,0,0,0,0,0,0,0,0,0
5,Agua Dulce ISD,Second,pdf,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,0.999955,2017,,. Agua Dulce ISD District of Innovation Plan 2...,0.999309,...,0,0,0,0,0,0,0,0,0,0
6,Alamo Heights ISD,Second,pdf,http://www.ahisd.net/UserFiles/Servers/Server_...,1 | P a g e Mission Statement The Alamo Height...,0.997152,2016,,"a new teacher appraisal system in 2016-2017, c...",0.99821,...,0,0,0,0,0,0,0,0,0,0
7,Albany ISD,Second,pdf,http://www.albanyisd.net/uploads/4/4/4/1/44419...,Microsoft Word - innovation_plan.docx Albany I...,0.999931,2017,February,The Local Innovation Plan Committee met Februa...,0.353087,...,0,0,0,0,0,0,0,0,0,0
8,Aldine ISD,Second,pdf,http://insidealdine.com/wp-content/uploads/201...,Aldine District of Innovation Plan Approved by...,0.999702,2016,,of teacher practice. During the 2016-2017 scho...,0.998446,...,0,0,0,0,0,0,0,0,0,0
9,Alice ISD,First,google,https://drive.google.com/file/d/1v1wysSRUqD5FX...,Microsoft Word - District of Innovation Docume...,0.998974,2017,,Plan for District of Innovation Term 2018-2019...,0.999949,...,0,0,0,0,0,0,0,0,0,0


# Save

In [16]:
# Full table, including the 'text' column.
plans_onehot.to_csv(os.path.join(data_path, 'doi_final_wtext.csv'))

# Same table without 'text', written to both output locations.
# NOTE(review): the second directory is a hard-coded absolute path on one
# machine; this cell will fail wherever that directory does not exist.
# NOTE(review): the row index is written as a column by to_csv's default.
plans_onehot = plans_onehot.drop(columns=['text'])
for output_dir in (data_path, '/Users/kylieleblancKylie/domino/dofis/data/clean'):
    plans_onehot.to_csv(os.path.join(output_dir, 'doi_final.csv'))