In [25]:
import pandas as pd
import os
import re
import spacy
import en_core_web_sm
from spacy.matcher import Matcher

In [26]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora

In [27]:
data_path = '/Users/kylieleblancKylie/domino/dofis/data/'
data = pd.read_csv(os.path.join(data_path, 'plans', 'doi_final_wtext.csv'),
                  sep=",")
data = data[['district', 'link', 'text', 'reg25_0811', 'reg25_081', 'reg25_0812', 'reg25_082',
            'reg25_112', 'reg25_113', 'reg25_111', 
            'reg21_003', 'reg21_053', 'reg21_057',
            'reg21_102', 'reg21_401', 'reg21_352', 'reg21_354',
            'reg25_092', 'reg37_0012', 'reg25_036']]

In [28]:
similar_laws = {'25.0811': ['25.0811', '25_081', '25.0812', '25.082'],
               '25.112': ['25.112', '25.113', '25.111'], 
               '21.003': ['21.003', '21.053', '21.057'],
               '21.102': ['21.102','21.401', '21.352'],
                '25.092': ['25.092']}
law_name = {'reg25_0812': '25.0812', 
           'reg21_003': '21.003'}

In [29]:
stubnames = sorted(
    set([match[0] for match in data.columns.str.findall(
    r'reg.*').values if match != [] ])
    )

In [30]:
long = pd.melt(data, id_vars=['district', 'link', 'text'], value_vars=stubnames)
long = long[long.value == 1]
print('length= ', len(long))
long.head()

length=  4485


Unnamed: 0,district,link,text,variable,value
1,Abernathy ISD,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,reg21_003,1
2,Abilene ISD,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,reg21_003,1
3,Academy ISD,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,reg21_003,1
4,Adrian ISD,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,reg21_003,1
5,Agua Dulce ISD,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,reg21_003,1


In [31]:
nlp = spacy.load('en_core_web_sm')
law_shapes = [i*'d' + '.' + j*'d' + k*'x' for i in range(1, 4) for j in range(3,5) for k in range(3)]
law_shape_patterns = [[{'SHAPE':shape}, {'ORTH':'%', 'OP':'!'}] for shape in law_shapes] # could add {'SHAPE':'§', 'OP':'*'},

matcher = Matcher(nlp.vocab)
matcher.add("ExplicitLaw", None, *law_shape_patterns)

In [32]:
def get_phrase(text, regulation):
    phrase = ''
    # Get location of regulation matches
    doc = nlp(text)
    matches = matcher(doc)
    locs = []
    for match in matches:
        locs.append(match[1])
    locs.append(len(doc) - 1)
    # Get phrases between one match and the next
    i = 0
    for loc in locs[0:-1]: 
        token = doc[loc].text
        if token.startswith(regulation):
            start = locs[i]
            j = i + 1
            end = locs[j]
            while doc[end].text in similar_laws[regulation]:
                j = j + 1
                end = locs[j]
            phrase = phrase + "|" + str(doc[start:end])
        i = i + 1
    return phrase
get_phrase("21.003 45.211 21.003 21.053, 21.044 This is about teacher certification okay", regulation = '21.003')

'|21.003|21.003 21.053,'

# Certification

In [33]:
certification = long[long.variable == 'reg21_003']
certification.head()

Unnamed: 0,district,link,text,variable,value
1,Abernathy ISD,https://1.cdn.edl.io/epGD4mZXjWUcPl8yA7IhlcDol...,Abernathy ISD District of Innovation Plan – Dr...,reg21_003,1
2,Abilene ISD,https://www.abileneisd.org/wp-content/uploads/...,0 ABILENE INDEPENDENT SCHOOL DISTRICT Local In...,reg21_003,1
3,Academy ISD,https://4.files.edl.io/1a8f/06/29/18/204245-44...,District of Innovation Plan Developed in coope...,reg21_003,1
4,Adrian ISD,http://www.adrianisd.net/UserFiles/Servers/Ser...,Adrian ISD District of Innovation Plan House B...,reg21_003,1
5,Agua Dulce ISD,https://tx02206063.schoolwires.net/cms/lib/TX0...,Microsoft Word - DOI 2017-2022.docx Agua Dulce...,reg21_003,1


In [46]:
phrases = []
for text in certification.text:
    phrase = str(get_phrase(text, '21.003'))
    phrases.append(phrase)

In [47]:
certification['phrase'] = phrases

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [48]:
certification.to_csv(os.path.join(data_path, 'clean', 'certification_phrases.csv'),
                  sep=",")

In [65]:
data = pd.read_csv(os.path.join(data_path, 'clean', 'certification_phrases.csv'),
                  sep=",")
doc_complete = list(data['phrase'])
doc_complete = [str(i) for i in doc_complete]

In [71]:
stop = stopwords.words('english')
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
broad_stop_words = ['district', 'isd', 'innovation', 'tec']
certification_stop_words = ['certification','teach', 'teacher', 'staff', 'certificate', 'certified']
stop += broad_stop_words
stop += certification_stop_words
stop = set(stop)

In [72]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete] 

['21003',
 'tec§',
 '21057',
 'principal',
 'may',
 'submit',
 'superintendent',
 'andor',
 'superintendent’s',
 'designee',
 'request',
 'teaching',
 'permit',
 'local',
 'certification',
 'outlining',
 'individual’s',
 'credentialsqualifications',
 'qualification',
 'local',
 'could',
 'include',
 'limited',
 'to',
 '●',
 'professional',
 'work',
 'experience',
 '●',
 'formal',
 'trainingeducation',
 'content',
 'area',
 '●',
 'activerelevant',
 'professional',
 'industry',
 'certificationregistration',
 '●',
 'combination',
 'work',
 'experience',
 'training',
 'education',
 '●',
 'demonstration',
 'successful',
 'experience',
 'working',
 'student',
 'superintendent',
 'hisher',
 'designee',
 'approve',
 'request',
 'believe',
 'individual',
 'posse',
 'knowledge',
 'skill',
 'experience',
 'required',
 'position',
 'feel',
 'individual',
 'could',
 'asset',
 'student',
 'employment',
 'dependent',
 'upon',
 'board',
 'approval',
 'employee',
 'working',
 'teaching',
 'permit',
 'l

In [73]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [74]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

In [75]:
print(ldamodel.print_topics(num_topics=10, num_words=8))

[(0, '0.042*"course" + 0.022*"requirement" + 0.021*"professional" + 0.020*"cte" + 0.018*"science" + 0.018*"industry" + 0.015*"subject" + 0.014*"student"'), (1, '0.034*"course" + 0.031*"requirement" + 0.020*"student" + 0.018*"teacher" + 0.018*"hire" + 0.016*"professional" + 0.012*"educational" + 0.012*"exemption"'), (2, '0.019*"would" + 0.018*"education" + 0.017*"industry" + 0.015*"experience" + 0.015*"school" + 0.014*"individual" + 0.012*"local" + 0.012*"course"'), (3, '0.013*"local" + 0.013*"education" + 0.013*"student" + 0.012*"teacher" + 0.009*"board" + 0.009*"course" + 0.009*"exemption" + 0.008*"texas"'), (4, '0.043*"nan" + 0.016*"training" + 0.009*"employee" + 0.008*"legal" + 0.008*"area" + 0.007*"policy" + 0.006*"agreement" + 0.006*"local"'), (5, '0.035*"school" + 0.028*"21003" + 0.021*"person" + 0.017*"required" + 0.015*"b" + 0.014*"subchapter" + 0.012*"grade" + 0.011*"a"'), (6, '0.132*"•" + 0.014*"teacher" + 0.012*"student" + 0.011*"need" + 0.010*"–" + 0.009*"school" + 0.008*"c

In [None]:
for topic in ldamodel.print_topics(num_topics=10, num_words=5):
    print(topic)