# One-hot encode exemptions, clean, and save

In [241]:
import pandas as pd
import csv
import os
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [242]:
# Read the plans table; the path is resolved relative to the current working directory.
doi_csv_path = os.path.join(os.getcwd(), '..', '..', 'data', 'plans', 'doi_dates_and_exemptions.csv')
doi_df = pd.read_csv(doi_csv_path)

## Make list of all collected laws

In [243]:
def make_list(string):
    """Split a bracketed, comma-separated string into a list of items.

    The text is split on ', ' and every '[' and ']' character is removed
    from each resulting piece.

    Note: an empty list literal such as '[]' yields [''] (a single empty
    string), not an empty list.
    """
    bracket_remover = str.maketrans('', '', '[]')
    return [piece.translate(bracket_remover) for piece in string.split(', ')]

# Convert each row's 'possible_laws' string into a Python list of law identifiers.
doi_df['possible_laws'] = doi_df['possible_laws'].map(make_list)

In [244]:
# One-hot encode the per-plan law lists: MultiLabelBinarizer yields one
# indicator column per distinct label found in 'possible_laws'.
mlb = MultiLabelBinarizer()
laws_onehot = pd.DataFrame(
    mlb.fit_transform(doi_df['possible_laws']),
    columns=mlb.classes_,
    index=doi_df.index,
)
# Join against a copy of doi_df without 'possible_laws' rather than popping
# the column off doi_df itself. The resulting plans_onehot is the same, but
# doi_df is left intact, so this cell can be re-run without a KeyError.
plans_onehot = doi_df.drop(columns=['possible_laws']).join(laws_onehot)
plans_onehot.head(10)

Unnamed: 0.2,title,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,level,type,link,text,p_innovation,...,551.083,551.084,552.023,552.024,7.056,75.600,8.210,921.406,97.1001,97.1055
0,Abbott ISD,780.0,2481.0,2481,2481,Second,pdf,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,0.999949,...,0,0,0,0,0,0,0,0,0,0
1,Abernathy ISD,779.0,871.0,871,871,Second,pdf,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,0.99834,...,0,0,0,0,0,0,0,0,0,0
2,Abilene ISD,778.0,85.0,85,85,First,pdf,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,0.999955,...,0,0,0,0,0,0,0,0,0,0
3,Academy ISD,777.0,2845.0,2845,2845,Second,docx,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,0.999836,...,0,0,0,0,0,0,0,0,0,0
4,Adrian ISD,776.0,169.0,169,169,First,pdf,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,0.999945,...,0,0,0,0,0,0,0,0,0,0
5,Agua Dulce ISD,775.0,1868.0,1868,1868,Second,pdf,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,0.999955,...,0,0,0,0,0,0,0,0,0,0
6,Alamo Heights ISD,774.0,1824.0,1824,1824,Second,pdf,http://www.ahisd.net/UserFiles/Servers/Server_...,1 | P a g e Mission Statement The Alamo Height...,0.997152,...,0,0,0,0,0,0,0,0,0,0
7,Albany ISD,773.0,1886.0,1886,1886,Second,pdf,http://www.albanyisd.net/uploads/4/4/4/1/44419...,Microsoft Word - innovation_plan.docx Albany I...,0.999931,...,0,0,0,0,0,0,0,0,0,0
8,Aldine ISD,772.0,2681.0,2681,2681,Second,pdf,http://insidealdine.com/wp-content/uploads/201...,Aldine District of Innovation Plan Approved by...,0.999702,...,0,0,0,0,0,0,0,0,0,0
9,Alice ISD,771.0,176.0,176,176,First,google,https://drive.google.com/file/d/1v1wysSRUqD5FX...,Microsoft Word - District of Innovation Docume...,0.998974,...,0,0,0,0,0,0,0,0,0,0


In [245]:
# Remove the raw plan text, the date phrase, and the 'Unnamed: *' columns
# (presumably indexes left over from earlier CSV round-trips -- confirm).
unwanted_columns = [
    'text',
    'date_phrase',
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0.1.1',
    'Unnamed: 0.1.1.1',
]
plans_onehot = plans_onehot.drop(columns=unwanted_columns)
plans_onehot.head()

Unnamed: 0,title,level,type,link,p_innovation,date,date_p,Unnamed: 8,1.251,1.253,...,551.083,551.084,552.023,552.024,7.056,75.600,8.210,921.406,97.1001,97.1055
0,Abbott ISD,Second,pdf,https://www.abbottisd.org/ourpages/auto/2018/3...,0.999949,2018.0,0.949854,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abernathy ISD,Second,pdf,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,0.99834,2017.0,0.307943,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abilene ISD,First,pdf,https://www.abileneisd.org/wp-content/uploads/...,0.999955,2018.0,0.999955,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Academy ISD,Second,docx,https://4.files.edl.io/1a8f/06/29/18/204245-44...,0.999836,2018.0,0.348899,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adrian ISD,First,pdf,http://www.adrianisd.net/UserFiles/Servers/Ser...,0.999945,2018.0,0.999955,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Rename exemption columns so they start with `reg`

In [246]:
# Prefix every column name with 'reg' and swap '.' for '_'.
prefixed_names = []
for column_name in plans_onehot.columns:
    prefixed_names.append('reg' + str(column_name).replace('.', '_'))
plans_onehot.columns = prefixed_names

# The column whose original name was the empty string is now just 'reg'; drop it.
plans_onehot = plans_onehot.drop(columns=['reg'])

# Give the non-law columns their final names back (without the 'reg' prefix).
# index=str additionally converts every index label to a string.
descriptive_names = {
    'regtitle': 'district',
    'reglevel': 'level',
    'regtype': 'type',
    'reglink': 'link',
    'regp_innovation': 'p_doi',
    'regdate': 'date',
    'regdate_p': 'date_p',
}
plans_onehot = plans_onehot.rename(index=str, columns=descriptive_names)
plans_onehot.head(10)

Unnamed: 0,district,level,type,link,p_doi,date,date_p,reg1_251,reg1_253,reg102_1301,...,reg551_083,reg551_084,reg552_023,reg552_024,reg7_056,reg75_600,reg8_210,reg921_406,reg97_1001,reg97_1055
0,Abbott ISD,Second,pdf,https://www.abbottisd.org/ourpages/auto/2018/3...,0.999949,2018.0,0.949854,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abernathy ISD,Second,pdf,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,0.99834,2017.0,0.307943,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abilene ISD,First,pdf,https://www.abileneisd.org/wp-content/uploads/...,0.999955,2018.0,0.999955,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Academy ISD,Second,docx,https://4.files.edl.io/1a8f/06/29/18/204245-44...,0.999836,2018.0,0.348899,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adrian ISD,First,pdf,http://www.adrianisd.net/UserFiles/Servers/Ser...,0.999945,2018.0,0.999955,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Agua Dulce ISD,Second,pdf,https://tx02206063.schoolwires.net/cms/lib/TX0...,0.999955,2017.0,0.999955,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Alamo Heights ISD,Second,pdf,http://www.ahisd.net/UserFiles/Servers/Server_...,0.997152,2016.0,0.998209,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Albany ISD,Second,pdf,http://www.albanyisd.net/uploads/4/4/4/1/44419...,0.999931,2017.0,0.40463,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Aldine ISD,Second,pdf,http://insidealdine.com/wp-content/uploads/201...,0.999702,2017.0,0.999823,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Alice ISD,First,google,https://drive.google.com/file/d/1v1wysSRUqD5FX...,0.998974,2019.0,0.990232,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [247]:
# Write the cleaned one-hot table next to the input file (index column included).
doi_final_path = os.path.join(os.getcwd(), '..', '..', 'data', 'plans', 'doi_final.csv')
plans_onehot.to_csv(doi_final_path)

In [248]:
# Collect the column labels of plans_onehot as a plain Python list.
col_list = plans_onehot.columns.tolist()

In [249]:
# Display the contents of col_list as the cell output.
col_list

['district',
 'level',
 'type',
 'link',
 'p_doi',
 'date',
 'date_p',
 'reg1_251',
 'reg1_253',
 'reg102_1301',
 'reg102_1303',
 'reg102_1305',
 'reg102_1307',
 'reg102_1309',
 'reg102_1311',
 'reg102_1313',
 'reg102_1315',
 'reg102_309',
 'reg109_1001',
 'reg11_1511',
 'reg11_1512',
 'reg11_162',
 'reg11_164',
 'reg11_251',
 'reg11_252',
 'reg11_253',
 'reg11_255',
 'reg12_003',
 'reg12_0522',
 'reg12_131',
 'reg130_135',
 'reg130_231',
 'reg130_413',
 'reg132_231',
 'reg15_113',
 'reg150_1001',
 'reg150_1003',
 'reg150_1022',
 'reg150_1026',
 'reg158_211',
 'reg16_057',
 'reg19_231',
 'reg21_0003',
 'reg21_002',
 'reg21_003',
 'reg21_0031',
 'reg21_004',
 'reg21_005',
 'reg21_04',
 'reg21_0401',
 'reg21_044',
 'reg21_0442',
 'reg21_048',
 'reg21_0487',
 'reg21_049',
 'reg21_0491',
 'reg21_051',
 'reg21_052',
 'reg21_053',
 'reg21_055',
 'reg21_056',
 'reg21_057',
 'reg21_0583',
 'reg21_081',
 'reg21_0811',
 'reg21_101',
 'reg21_102',
 'reg21_106',
 'reg21_111',
 'reg21_158',
 'reg21