# One-hot encode, clean, and save

In [18]:
import pandas as pd
import csv
import os
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from start import data_path
import extract_laws
import clean_documents
import numpy as np

In [19]:
# Load the exemption lists and the dates table, then attach the matching
# dates row to each exemption record by title (left join on laws_df).
laws_path = os.path.join(data_path, 'doi_exemptions_list.csv')
dates_path = os.path.join(data_path, 'doi_dates.csv')
laws_df = pd.read_csv(laws_path)
dates_df = pd.read_csv(dates_path)
doi_df = pd.merge(laws_df, dates_df, how='left', left_on='title', right_on='title')
# Row counts of both inputs and of the merged frame, printed for comparison.
for frame in (laws_df, dates_df, doi_df):
    print(len(frame))
doi_df.sample(5)

824
824
824


Unnamed: 0,title,Unnamed: 0_x,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,level,type,link,text,p_innovation,possible_laws,Unnamed: 0_y,doi_date
98,Bullard ISD,725.0,2321,2321,2321,2577,Second,pdf,https://www.bullardisd.net//cms/lib6/TX0181769...,BULLARD INDEPENDENT SCHOOL DISTRICT 2017-2018 ...,0.999955,"[11.251, 11.252, 11.253, 25.092, 25.081, 21.05...",725,2017-08-01
378,Jayton-Girard ISD,445.0,2751,2751,2751,3054,Second,pdf,https://4.files.edl.io/95e6/11/01/18/153754-f1...,JAYTON-GIRARD ISD DISTRICT OF INNOVATION PLAN ...,0.999955,"[25.0811, 21.053, 21.102, 21.003, 21.057, 25.0...",445,2017-08-01
406,Knippa ISD,60.0,7,7,7,7,First,google,https://docs.google.com/viewer?a=v&pid=sites&s...,Not a plan,4.5e-05,"[21.003, 21.053, 21.04, 21.005, 25.0811, 25.08...",417,2016-08-01
722,Tarkington ISD,101.0,1915,1915,1915,2110,Second,pdf,https://s3.amazonaws.com/scschoolfiles/565/doi...,Tarkington ISD District of Innovation Local Pl...,0.999955,"[25.113, 25.0811, 25.083, 25.111, 25.081, 21.0...",101,2017-07-01
711,Stratford ISD,112.0,2967,2967,2967,3295,Second,pdf,https://s3.amazonaws.com/scschoolfiles/1795/st...,Stratford ISD District of Innovation 2017-2022...,0.999931,"[25.0811, 28.0214, 25.081, 21.055, 21.401, 25....",112,2017-08-01


## Create one-hot variables for laws

In [20]:
def make_list(string):
    """Parse a bracketed, comma-separated string into a list of labels.

    Parameters
    ----------
    string : str
        Text such as ``'[11.251, 25.081]'``.

    Returns
    -------
    list of str
        One entry per comma-separated item, with ``[`` / ``]`` removed and
        surrounding whitespace trimmed. Splitting on ``','`` (rather than
        the literal ``', '``) keeps items separate even when the spacing
        after a comma is missing or irregular.

        An input with no items (``'[]'``) yields ``['']``, exactly as
        before, so code that expects the empty label keeps working.
    """
    return [item.replace('[', '').replace(']', '').strip()
            for item in string.split(',')]

# Convert each stringified law list into a Python list, then keep only the
# columns used in the rest of the notebook.
keep_cols = ['title', 'link', 'text', 'p_innovation', 'possible_laws', 'doi_date']
doi_df = doi_df.assign(possible_laws=doi_df['possible_laws'].apply(make_list))[keep_cols]
len(doi_df)

824

In [21]:
# One-hot encode the law lists: one 0/1 indicator column per distinct label.
# The list column is read rather than popped, so doi_df is left intact and
# this cell can be re-run without a KeyError on 'possible_laws'.
mlb = MultiLabelBinarizer()
law_indicators = pd.DataFrame(mlb.fit_transform(doi_df['possible_laws']),
                              columns=mlb.classes_,
                              index=doi_df.index)
plans_onehot = doi_df.drop(columns=['possible_laws']).join(law_indicators)
len(plans_onehot)

824

In [22]:
# Discard any column whose name begins with "unnamed" (case-insensitive).
unnamed_cols = [name for name in plans_onehot.columns
                if name.lower().startswith('unnamed')]
plans_onehot = plans_onehot.drop(columns=unnamed_cols)
plans_onehot.head()
len(plans_onehot)

824

### Rename exemption columns so they start with `reg`

In [23]:
# Metadata columns get readable names; every other column is a law label and
# becomes 'reg' + label with '.' replaced by '_' (e.g. '11.251' -> 'reg11_251').
meta_names = {'title': 'district',
              'link': 'link',
              'text': 'text',
              'p_innovation': 'p_doi',
              'doi_date': 'doi_date'}
law_names = {col: 'reg' + str(col).replace('.', '_')
             for col in plans_onehot.columns if col not in meta_names}
# The '' label (produced by empty law lists) names no law. Drop it when it is
# present instead of raising KeyError when no document has an empty list.
# index=str converts the row labels to strings, as before.
plans_onehot = (plans_onehot
                .drop(columns=[''], errors='ignore')
                .rename(index=str, columns={**meta_names, **law_names}))
plans_onehot.head(10)

Unnamed: 0,district,link,text,p_doi,doi_date,reg11_1511,reg11_162,reg11_164,reg11_251,reg11_252,...,reg45_204,reg45_205,reg45_206,reg45_207,reg45_208,reg45_209,reg46_001,reg51_403,reg55_0811,reg97_1001
0,Abbott ISD,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,0.999955,2018-03-01,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,Abernathy ISD,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,0.999904,2017-01-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abilene ISD,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,0.999955,2018-08-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Academy ISD,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,0.999208,2018-08-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adrian ISD,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,0.999955,2018-08-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Agua Dulce ISD,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,0.999955,2017-08-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Alamo Heights ISD,http://www.ahisd.net/UserFiles/Servers/Server_...,1 | P a g e Mission Statement The Alamo Height...,0.999955,2016-08-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Albany ISD,http://www.albanyisd.net/uploads/4/4/4/1/44419...,Microsoft Word - innovation_plan.docx Albany I...,0.999955,2017-06-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Aldine ISD,http://insidealdine.com/wp-content/uploads/201...,Aldine District of Innovation Plan Approved by...,0.999955,2017-08-01,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Aledo ISD,https://docs.google.com/viewerng/viewer?url=ht...,ALEDO Independent School District A Past to Re...,0.329979,2019-02-01,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Row count after renaming.
plans_onehot.shape[0]

824

In [25]:
# Per-document sum over all indicator columns whose name starts with 'reg'.
reg_cols = [name for name in plans_onehot.columns
            if name.lower().startswith('reg')]
plans_onehot['total'] = plans_onehot.loc[:, reg_cols].sum(axis=1)
n_rows = len(plans_onehot)
print(n_rows)
# Filtering out documents with no matched regulations is currently disabled:
# plans_onehot = plans_onehot[plans_onehot.total > 0]
print(len(plans_onehot))

824
824


In [26]:
# Wherever reg25_081 is flagged, force reg25_0811 to 1 as well; all other
# rows keep their existing reg25_0811 value.
# NOTE(review): 'total' was computed before this override - confirm whether
# it should be refreshed afterwards.
has_25_081 = plans_onehot['reg25_081'] == 1
plans_onehot['reg25_0811'] = plans_onehot['reg25_0811'].mask(has_25_081, 1)

## Save

In [27]:
# Write two versions of the final table: one with the plan text, one without.
# The text-free copy is derived on the fly instead of rebinding plans_onehot,
# so re-running this cell can neither overwrite doi_final_wtext.csv with a
# frame whose 'text' column is already gone nor fail with a KeyError.
plans_onehot.to_csv(os.path.join(data_path, 'doi_final_wtext.csv'))
plans_onehot.drop(columns=['text']).to_csv(os.path.join(data_path, 'doi_final.csv'))

In [28]:
# Read the text-free file back in so the saved output can be checked.
final_csv = os.path.join(data_path, 'doi_final.csv')
df = pd.read_csv(final_csv)

In [29]:
# Share of rows with reg25_0811 set, in the in-memory frame.
plans_onehot['reg25_0811'].mean()

0.970873786407767

In [30]:
# Same share computed from the reloaded CSV; should match the in-memory value.
df['reg25_0811'].mean()

0.970873786407767

In [31]:
# Just certification (reg21_003), not the other two inputs
len(df.query('reg21_003 == 1 and reg25_112 == 0 and reg25_081 == 0'))

214

In [32]:
# Just class size (reg25_112), not the other two inputs
len(df.query('reg21_003 == 0 and reg25_112 == 1 and reg25_081 == 0'))

18

In [33]:
# Just minutes of operation (reg25_081), not the other two inputs
len(df.query('reg21_003 == 0 and reg25_112 == 0 and reg25_081 == 1'))

18

In [34]:
# Row count of the reloaded file.
df.shape[0]

824