# One-hot encode, clean, and save

In [1]:
import pandas as pd
import csv
import os
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from start import data_path
import extract_laws
import clean_documents
import numpy as np

  from collections import Sequence


In [2]:
# Load the exemptions list and the dates file, then attach the dates to the
# exemptions by matching on the 'title' column.
laws_df = pd.read_csv(os.path.join(data_path, 'doi_exemptions_list.csv'))
dates_df = pd.read_csv(os.path.join(data_path, 'doi_dates.csv'))
doi_df = laws_df.merge(dates_df, on='title', how='left')

print(len(laws_df))
print(len(dates_df))
print(len(doi_df))
# A left merge silently duplicates rows when a title occurs more than once in
# dates_df; fail loudly instead of relying on comparing the counts by eye.
assert len(doi_df) == len(laws_df), 'merge on title changed the number of rows'
# Fixed seed so the preview is identical on every Restart & Run All.
doi_df.sample(5, random_state=0)

824
824
824


Unnamed: 0,title,Unnamed: 0_x,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,level,type,link,text,p_innovation,possible_laws,Unnamed: 0_y,doi_year
443,Linden-Kildare CISD,380.0,1688,1688,1688,1860,Second,pdf,https://s3.amazonaws.com/scschoolfiles/931/lin...,Linden Kildare CISD Local Innovation Plan: 1 P...,0.999658,"[25.0811, 21.003, 21.102]",380,2017.0
211,Dilley ISD,612.0,935,935,935,1012,Second,pdf,https://core-docs.s3.amazonaws.com/documents/a...,DILLEY INDEPENDENT SCHOOL DISTRICT “DISTRICT O...,0.999955,"[25.0811, 11.253, 11.255, 45.206, 11.252, 45.2...",612,2018.0
547,Northwest ISD,276.0,96,96,96,96,First,pdf,https://www.nisdtx.org/UserFiles/Servers/Serve...,i District of Innovation ii NORTHWEST INDEPEND...,0.999394,"[37.0012, 37.105, 25.092, 21.053, 21.458, 21.0...",276,2019.0
709,Sterling City ISD,44.0,166,166,166,166,First,pdf,http://www.sterlingcityisd.net/cms/lib/TX01001...,Adopted by the Sterling City ISD Board of Trus...,4.5e-05,"[25.113, 25.0811, 21.401, 21.003, 21.057, 21.0...",114,2017.0
50,Beeville ISD,101.0,274,274,274,274,First,pdf,https://s3.amazonaws.com/scschoolfiles/380/bis...,Not a plan,4.5e-05,"[21.003, 21.044, 25.0811, 25.0812, 21.203, 21....",773,2017.0


## Create one-hot variables for laws

In [3]:
def make_list(string):
    """Split a stringified list such as '[25.0811, 21.003]' into string tokens.

    The input is cut on the two-character separator ', ' and every '[' and
    ']' character is removed from each resulting piece.

    Note that an empty list literal '[]' yields [''] (a single empty string),
    not an empty list.
    """
    strip_brackets = str.maketrans('', '', '[]')
    cleaned = []
    for token in string.split(', '):
        cleaned.append(token.translate(strip_brackets))
    return cleaned

# Turn each stringified law list into a real Python list of code strings.
doi_df['possible_laws'] = doi_df['possible_laws'].map(make_list)

# Keep only the columns used downstream; this also discards the leftover
# 'Unnamed: ...' index columns carried in from the CSV files.
keep_columns = ['title', 'link', 'text', 'p_innovation', 'possible_laws', 'doi_year']
doi_df = doi_df.loc[:, keep_columns]
doi_df.shape[0]

824

In [4]:
# One-hot encode the per-plan law lists: one 0/1 column per distinct law code,
# aligned to doi_df's index so the join below matches row for row.
mlb = MultiLabelBinarizer()
law_indicators = pd.DataFrame(
    mlb.fit_transform(doi_df['possible_laws']),
    columns=mlb.classes_,
    index=doi_df.index,
)
# Join onto a copy without the list column rather than pop()-ing it out of
# doi_df: the input frame stays intact, so re-running this cell no longer
# raises a KeyError on the missing 'possible_laws' column.
plans_onehot = doi_df.drop(['possible_laws'], axis=1).join(law_indicators)
len(plans_onehot)

824

In [5]:
# Remove any column whose name starts with 'unnamed' (case-insensitive), i.e.
# index columns that pandas auto-names when reading a CSV.
unnamed_columns = [
    name for name in plans_onehot.columns
    if name.lower().startswith('unnamed')
]
plans_onehot = plans_onehot.drop(columns=unnamed_columns)
# Not rendered: only the last expression of a cell is displayed.
plans_onehot.head()
plans_onehot.shape[0]

824

### Rename exemptions so they start with reg

In [6]:
# Prefix every column with 'reg' and swap '.' for '_' so each law code becomes
# an attribute-friendly name (e.g. '25.0811' -> 'reg25_0811').
plans_onehot.columns = ['reg' + str(i).replace('.', '_') for i in plans_onehot.columns]
# A bare 'reg' column comes from the empty-string label that make_list yields
# for an empty law list. errors='ignore' keeps this step from raising a
# KeyError when the data contains no such label.
plans_onehot = plans_onehot.drop(['reg'], axis=1, errors='ignore')
# The metadata columns were prefixed too; restore readable names for them.
# index=str additionally converts the row labels to strings.
plans_onehot = plans_onehot.rename(index=str, columns= {'regtitle':'district', 
                                                        'reglink': 'link',
                                                        'regtext': 'text',
                                                        'regp_innovation':'p_doi',
                                                        'regdoi_year' : 'doi_year'})
plans_onehot.head(10)

Unnamed: 0,district,link,text,p_doi,doi_year,reg11_1511,reg11_162,reg11_164,reg11_251,reg11_252,...,reg45_204,reg45_205,reg45_206,reg45_207,reg45_208,reg45_209,reg46_001,reg51_403,reg55_0811,reg97_1001
0,Abbott ISD,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,0.999955,2018.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,Abernathy ISD,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,0.999904,2017.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abilene ISD,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,0.999955,2018.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Academy ISD,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,0.999208,2018.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adrian ISD,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,0.999955,2018.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Agua Dulce ISD,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,0.999955,2017.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Alamo Heights ISD,http://www.ahisd.net/UserFiles/Servers/Server_...,1 | P a g e Mission Statement The Alamo Height...,0.999955,2016.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Albany ISD,http://www.albanyisd.net/uploads/4/4/4/1/44419...,Microsoft Word - innovation_plan.docx Albany I...,0.999955,2017.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Aldine ISD,http://insidealdine.com/wp-content/uploads/201...,Aldine District of Innovation Plan Approved by...,0.999955,2017.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Aledo ISD,https://docs.google.com/viewerng/viewer?url=ht...,ALEDO Independent School District A Past to Re...,0.329979,2019.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Row count after renaming.
plans_onehot.shape[0]

824

In [8]:
# Sum the indicator columns (names starting with 'reg') into a per-plan total.
reg_columns = [name for name in plans_onehot.columns if name.lower().startswith('reg')]
plans_onehot['total'] = plans_onehot.loc[:, reg_columns].sum(axis='columns')
print(plans_onehot.shape[0])
# Dropping documents with no regulations is currently disabled, so the two
# printed row counts are the same.
#plans_onehot = plans_onehot[plans_onehot.total > 0]
print(plans_onehot.shape[0])

824
824


In [9]:
# Wherever reg25_081 is flagged, force reg25_0811 to 1 as well; every other
# row keeps its existing reg25_0811 value.
flagged_081 = plans_onehot['reg25_081'] == 1
plans_onehot['reg25_0811'] = plans_onehot['reg25_0811'].mask(flagged_081, 1)

# Save

In [10]:
# Export twice: first with the full plan text, then a copy without the 'text'
# column. Both files include the row index as their first column (the pandas
# default), and plans_onehot itself no longer has 'text' afterwards.
with_text_path = os.path.join(data_path, 'doi_final_wtext.csv')
plans_onehot.to_csv(with_text_path)

plans_onehot = plans_onehot.drop(columns=['text'])
without_text_path = os.path.join(data_path, 'doi_final.csv')
plans_onehot.to_csv(without_text_path)

In [11]:
# Re-load the text-free export from disk.
final_csv_path = os.path.join(data_path, 'doi_final.csv')
df = pd.read_csv(final_csv_path)

In [12]:
# Mean of the reg25_0811 indicator in the in-memory frame.
plans_onehot['reg25_0811'].mean()

0.970873786407767

In [13]:
# Same mean computed from the frame re-read from doi_final.csv.
df['reg25_0811'].mean()

0.970873786407767

In [14]:
# Just certification (reg21_003), with neither of the other two inputs
# (reg25_112, reg25_081).
only_certification = df['reg21_003'].eq(1) & df['reg25_112'].eq(0) & df['reg25_081'].eq(0)
len(df.loc[only_certification])

214

In [15]:
# Just class size (reg25_112), with neither of the other two inputs
# (reg21_003, reg25_081).
only_class_size = df['reg21_003'].eq(0) & df['reg25_112'].eq(1) & df['reg25_081'].eq(0)
len(df.loc[only_class_size])

18

In [16]:
# Just minutes of operation (reg25_081), with neither of the other two inputs
# (reg21_003, reg25_112).
only_minutes = df['reg21_003'].eq(0) & df['reg25_112'].eq(0) & df['reg25_081'].eq(1)
len(df.loc[only_minutes])

18

In [17]:
# Row count of the re-loaded frame.
df.shape[0]

824