In [26]:
import pandas as pd
import os
import re
import spacy
import en_core_web_sm
from spacy.matcher import Matcher

In [27]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora

In [28]:
data_path = '/Users/kylieleblancKylie/domino/dofis/data/'
data = pd.read_csv(os.path.join(data_path, 'plans', 'doi_final_wtext.csv'),
                  sep=",")
data = data[['district', 'link', 'text', 'reg25_0811', 'reg25_081', 'reg25_0812', 'reg25_082',
            'reg25_112', 'reg25_113', 'reg25_111', 
            'reg21_003', 'reg21_053', 'reg21_057',
            'reg21_102', 'reg21_401', 'reg21_352', 'reg21_354',
            'reg25_092', 'reg37_0012', 'reg25_036']]

In [29]:
similar_laws = {'25.0811': ['25.0811', '25_081', '25.0812', '25.082'],
               '25.112': ['25.112', '25.113', '25.111'], 
               '21.003': ['21.003', '21.053', '21.057'],
               '21.102': ['21.102','21.401', '21.352'],
                '25.092': ['25.092']}
law_name = {'reg25_0812': '25.0812', 
           'reg21_003': '21.003'}

In [30]:
stubnames = sorted(
    set([match[0] for match in data.columns.str.findall(
    r'reg.*').values if match != [] ])
    )

In [31]:
long = pd.melt(data, id_vars=['district', 'link', 'text'], value_vars=stubnames)
long = long[long.value == 1]
print('length= ', len(long))
long.head()

length=  4485


Unnamed: 0,district,link,text,variable,value
1,Abernathy ISD,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,reg21_003,1
2,Abilene ISD,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,reg21_003,1
3,Academy ISD,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,reg21_003,1
4,Adrian ISD,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,reg21_003,1
5,Agua Dulce ISD,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,reg21_003,1


In [32]:
nlp = spacy.load('en_core_web_sm')
law_shapes = [i*'d' + '.' + j*'d' + k*'x' for i in range(1, 4) for j in range(3,5) for k in range(3)]
law_shape_patterns = [[{'SHAPE':shape}, {'ORTH':'%', 'OP':'!'}] for shape in law_shapes] # could add {'SHAPE':'§', 'OP':'*'},

matcher = Matcher(nlp.vocab)
matcher.add("ExplicitLaw", None, *law_shape_patterns)

In [33]:
def get_phrase(text, regulation):
    phrase = ''
    # Get location of regulation matches
    doc = nlp(text)
    matches = matcher(doc)
    locs = []
    for match in matches:
        locs.append(match[1])
    locs.append(len(doc) - 1)
    # Get phrases between one match and the next
    i = 0
    for loc in locs[0:-1]: 
        token = doc[loc].text
        if token.startswith(regulation):
            start = locs[i]
            j = i + 1
            end = locs[j]
            while doc[end].text in similar_laws[regulation]:
                j = j + 1
                end = locs[j]
            phrase = phrase + "|" + str(doc[start:end])
        i = i + 1
    return phrase
get_phrase("21.003 45.211 21.003 21.053, 21.044 This is about teacher certification okay", regulation = '21.003')

'|21.003|21.003 21.053,'

# School Start Date

In [34]:
startdate = long[long.variable == 'reg25_0811']
startdate.head()

Unnamed: 0,district,link,text,variable,value
7030,Abernathy ISD,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,reg25_0811,1
7031,Abilene ISD,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,reg25_0811,1
7032,Academy ISD,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,reg25_0811,1
7033,Adrian ISD,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,reg25_0811,1
7034,Agua Dulce ISD,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,reg25_0811,1


In [35]:
phrases = []
for text in startdate.text:
    phrase = str(get_phrase(text, '25.0811'))
    phrases.append(phrase)

In [36]:
startdate['phrase'] = phrases

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [37]:
startdate.to_csv(os.path.join(data_path, 'clean', 'phrases_calendar.csv'),
                  sep=",")

In [46]:
data = pd.read_csv(os.path.join(data_path, 'clean', 'phrases_calendar.csv'),
                  sep=",")
doc_complete = list(data['phrase'])
doc_complete = [str(i) for i in doc_complete]

['|25.0811; TEC §25.0812 TEC §25.0811 states a school district may not begin student instruction before the 4th Monday of August. TEC §25.0812 states a school district may not schedule the last day of school for students for a school year before May 15 Innovation: Abernathy ISD believes that increased flexibility in determining the start and end of the instructional calendar will increase student achievement, improve attendance, and better allow the district to meet the social and emotional needs of the students. Setting the local limits for starting school no earlier than August 1 and ending no earlier than May 1 will have the following benefits: ● The instructional days of the fall semester (finishing before the Christmas holidays) could be increased, making the number of days closer to equal the days of the spring semester. This allows for a more balanced approach to the scope and sequence of all classes, but greatly benefits single semester courses. ● Allows for a more equal distri

In [52]:
stop = stopwords.words('english')
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
broad_stop_words = ['district', 'isd', 'innovation', 'tec']
calendar_stop_words = ['first', 'day', 'instruction', 'fourth', 'monday', 'august', 'student',
#               'calendar', 'school', 'code', 'state', 'schedule', 'tec', 'may']
#stop += newStopWords
stop = set(stop)

In [53]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete] 

In [54]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [55]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

In [51]:
print(ldamodel.print_topics(num_topics=5, num_words=8))

[(0, '0.023*"plan" + 0.016*"member" + 0.015*"committee" + 0.014*"teacher" + 0.014*"\uf0b7" + 0.012*"board" + 0.011*"student" + 0.009*"2017"'), (1, '0.030*"student" + 0.022*"semester" + 0.017*"start" + 0.017*"year" + 0.016*"local" + 0.016*"date" + 0.016*"•" + 0.015*"day"'), (2, '0.028*"start" + 0.022*"student" + 0.021*"begin" + 0.020*"community" + 0.019*"date" + 0.019*"august" + 0.018*"local" + 0.017*"flexibility"'), (3, '0.014*"student" + 0.011*"●" + 0.009*"time" + 0.009*"board" + 0.008*"1" + 0.008*"year" + 0.007*"staff" + 0.007*"class"'), (4, '0.021*"year" + 0.018*"student" + 0.017*"teacher" + 0.015*"day" + 0.015*"flexibility" + 0.012*"allow" + 0.012*"semester" + 0.011*"start"')]


In [44]:
for topic in ldamodel.print_topics(num_topics=10, num_words=5):
    print(topic)

(0, '0.027*"local" + 0.025*"instructional" + 0.021*"community" + 0.021*"state" + 0.016*"august"')
(1, '0.041*"teacher" + 0.030*"contract" + 0.022*"probationary" + 0.021*"education" + 0.019*"year"')
(2, '0.017*"date" + 0.016*"start" + 0.014*"day" + 0.014*"semester" + 0.013*"practice"')
(3, '0.029*"begin" + 0.027*"earlier" + 0.023*"august" + 0.022*"student" + 0.019*"start"')
(4, '0.031*"student" + 0.021*"year" + 0.019*"semester" + 0.018*"•" + 0.014*"day"')
(5, '0.027*"●" + 0.013*"board" + 0.011*"student" + 0.010*"sec" + 0.009*"plan"')
(6, '0.044*"start" + 0.023*"student" + 0.022*"college" + 0.021*"date" + 0.019*"year"')
(7, '0.038*"student" + 0.028*"start" + 0.027*"date" + 0.020*"flexibility" + 0.018*"day"')
(8, '0.034*"member" + 0.030*"plan" + 0.022*"teacher" + 0.016*"committee" + 0.015*"2017"')
(9, '0.021*"board" + 0.018*"staff" + 0.016*"trustee" + 0.016*"year" + 0.016*"plan"')
