In [1]:
import sys

import pandas as pd
import os
import re
import spacy
import en_core_web_sm
from spacy.matcher import Matcher
from IPython.display import clear_output

from dofis.analysis.library import regulations
import breaktext

In [6]:
data_path = '/Users/kylieanglin/Library/CloudStorage/Dropbox/Active/Research/dofis/data/'
data = pd.read_csv(data_path + "plans/doi_final_wtext.csv",
                  sep=",")
data = data[['district', 'link', 'text',
      'reg25_0811', 'reg25_081', 'reg25_0812', 'reg25_082',
      'reg21_003', 'reg21_053', 'reg21_057',
      'reg21_102', 'reg21_401', 'reg21_352', 'reg21_354',
      'reg25_092', 'reg37_0012', 'reg25_036', 'reg25_112']]
data.head(1)

Unnamed: 0,district,link,text,reg25_0811,reg25_081,reg25_0812,reg25_082,reg21_003,reg21_053,reg21_057,reg21_102,reg21_401,reg21_352,reg21_354,reg25_092,reg37_0012,reg25_036,reg25_112
0,Abbott ISD,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,1,0,0,0,1,0,0,1,0,0,0,0,1,1,1


In [7]:
regulations.similar

{'25.0811': ['25.0811', '25_081', '25.0812'],
 '25.112': ['25.112', '25.113', '25.111'],
 '21.003': ['21.003', '21.053', '21.044', '21.057', '21.055'],
 '21.102': ['21.102'],
 '21.401': ['21.401'],
 '25.092': ['25.092'],
 '21.352': ['21.352'],
 '21.354': ['21.354'],
 '37.0012': ['37.0012'],
 '25.036': ['25.036']}

In [8]:
stubnames = sorted(
    set([match[0] for match in data.columns.str.findall(
    r'reg.*').values if match != [] ])
    )
long = pd.melt(data, id_vars=['district', 'link', 'text'], value_vars=stubnames)
long = long[long.value == 1]
print('length= ', len(long))
long.head(1)

length=  5301


Unnamed: 0,district,link,text,variable,value
0,Abbott ISD,https://www.abbottisd.org/ourpages/auto/2018/3...,1 Abbott Independent School District District ...,reg21_003,1


In [9]:
breaktext.get_phrase("21.003 45.211 21.003 21.053, 21.044 This is about teacher certification okay", regulation = '21.003')

'|21.003|21.003 21.053, 21.044 This is about teacher certification'

# Schedules

In [None]:
startdate = long[long.variable == 'reg25_0811']
phrases = []
for text in startdate.text:
    try:
        phrase = breaktext.get_phrase(text, '25.0811', include_similar = False)
        phrases.append(phrase)
    except:
        phrases.append("")
startdate['phrase'] = phrases
startdate.to_csv(os.path.join(data_path, 'clean', 'phrases_startdate.csv'),
                  sep=",")

In [7]:
startdate = long[long.variable == 'reg25_081']
phrases = []
for text in startdate.text:
    phrase = breaktext.get_phrase(text, '25.081', include_similar = False)
    phrases.append(phrase)
startdate['phrase'] = phrases
startdate.to_csv(os.path.join(data_path, 'clean', 'phrases_minutes.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Certification

In [8]:
certification = long[long.variable == 'reg21_003']
phrases = []
for text in certification.text:
    phrase = breaktext.get_phrase(text, '21.003')
    phrases.append(phrase)
certification['phrase'] = phrases
certification.to_csv(os.path.join(data_path, 'clean', 'phrases_certification.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Class Size

In [9]:
classsize = long[long.variable == 'reg25_112']
phrases = []
for text in classsize.text:
    phrase = breaktext.get_phrase(text, '25.112')
    phrases.append(phrase)
classsize['phrase'] = phrases
classsize.to_csv(os.path.join(data_path, 'clean', 'phrases_classsize.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Contracts

In [10]:
probation = long[long.variable == 'reg21_102']
phrases = []
for text in probation.text:
    phrase = breaktext.get_phrase(text, '21.102')
    phrases.append(phrase)
probation['phrase'] = phrases
probation.to_csv(os.path.join(data_path, 'clean', 'phrases_probation.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
service = long[long.variable == 'reg21_401']
phrases = []
for text in service.text:
    phrase = breaktext.get_phrase(text, '21.401')
    phrases.append(phrase)
service['phrase'] = phrases
service.to_csv(os.path.join(data_path, 'clean', 'phrases_service.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
teacherevals = long[long.variable == 'reg21_352']
phrases = []
for text in teacherevals.text:
    phrase = breaktext.get_phrase(text, '21.352')
    phrases.append(phrase)
teacherevals['phrase'] = phrases
teacherevals.to_csv(os.path.join(data_path, 'clean', 'phrases_teacherevals.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Behavior

In [13]:
attendance = long[long.variable == 'reg25_092']
phrases = []
for text in attendance.text:
    phrase = breaktext.get_phrase(text, '25.092')
    phrases.append(phrase)
attendance['phrase'] = phrases
attendance.to_csv(os.path.join(data_path, 'clean', 'phrases_attendance.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
coordinator = long[long.variable == 'reg37_0012']
phrases = []
for text in coordinator.text:
    phrase = breaktext.get_phrase(text, '37.0012')
    phrases.append(phrase)
coordinator['phrase'] = phrases
coordinator.to_csv(os.path.join(data_path, 'clean', 'phrases_coordinator.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
transfers = long[long.variable == 'reg25_036']
phrases = []
for text in transfers.text:
    phrase = breaktext.get_phrase(text, '25.036')
    phrases.append(phrase)
transfers['phrase'] = phrases
transfers.to_csv(os.path.join(data_path, 'clean', 'phrases_transfers.csv'),
                  sep=",")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
