In [1]:
# finds key words in state tech plans that are used to create that csv file

import spacy
import pytextrank
import os
import nltk
#nltk.download('stopwords')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

# path to folder with text files
text_file_location = 'C:/Users/tasbe/OneDrive/Desktop/job_search/career_change/portfolio_projects/covid_learning/research/state_info/text_versions'  

# words to remove from text processing, chosen because they are common and not useful for finding features
stop_list = ['technology', 'school', 'district', 'state', 'california', 'new york', 'missouri', 'ohio', 'washington', 
            'technologies', 'illinois', 'report', "indiana", "massachussets", "middle", "high", "elementary", "grade", 
             "districts",'digital', 'education', 'use', 'computer', 'schools', ".", ',', 'grade', 'students', 'student', 
             'superintendent', 'educational', " ", 'learning', 'public', 'blueprint', 'recommendations', 'recommendation',
             'association']

#set-up natural language processing pipeline for English
nlp = spacy.load('en_core_web_sm')

# read-in all text files and create one text string from them all
def text_from_files(text_location): 
    
    location = os.chdir(text_location) #moves directory location to this location to get files
    text = ""  # to add files to
    files = {} #list to store all csv files found at location

    for file in os.listdir(location):
        try:
            f = open(file, "r", encoding='utf-8',
                     errors='ignore')
            text += f.read().lower()
            f.close()
        except Exception as e:
            raise e
            print("No files found here!")
        
    return text

# removes stop words from text document
def remove_stop_words(stop_list, text_str):
    stop_words = stopwords.words('english')
    stop_words.extend(stop_list)
    text_list = text_str.split()
    word_list = [word for word in text_list if not word in stop_words]
    text_no_stop = " ".join(word_list)
    return text_no_stop

# removes entities from text document; avoids dates, cities, people from being added to phrases
def remove_entities(text_str):
    doc = nlp(text_str)
    entity_list = []
    for ent in doc.ents:
        entity_list.append(ent.text)
    word_list = text_str.split()    
    list_no_ents = [word for word in word_list if not word in entity_list]
    text_no_ents = " ".join(list_no_ents)
    return text_no_ents

# add phrases that are longer than 30 characters and ranked higher than .03 to a list
def get_phrases(text_str):
    nlp.add_pipe("textrank", last = True)
    doc_clean = nlp(text_str)
    
    phrases = []    
    long_phrases = []
    for phrase in doc_clean._.phrases:
            if len(phrase.text) < 30 and phrase.rank > .03:
                phrases.append(phrase.text)
            else:
                long_phrases.append(phrase.text)
                
    next_phrases = []
    for phrase in long_phrases:
        for item in doc_clean._.phrases:
            if len(item.text) < 30 and item.rank > .03:
                next_phrases.append(item.text)

    phrases += next_phrases
    
    return phrases

# processes each file
text = text_from_files(text_file_location)
text_no_stop = remove_stop_words(stop_list, text)
text_clean = remove_entities(text_no_stop)
phrases = get_phrases(text_clean)
phrase_list = [[phrase] for phrase in phrases]



['empowering', '2014–2017', 'instruction', 'tom', 'torlakson', 'initiative', 'april', '2014', 'empowering', 'learning:', 'blueprint,', '2014–2017,', 'product', 'superintendent’s', 'initiatives', 'office', 'department', 'direction', 'instruction', 'tom', 'torlakson', 'collaboration', 'department', 'education’s', 'data', 'management', 'division.', 'table', 'contents', 'message', 'instruction', '................', '1', 'executive', 'summary', 'context', '.............................................', '3', 'task', 'force...................................................................', '8', '...........................................................................', '10', 'learning...................................................................................................................14', 'teaching...................................................................................................................15', 'assessment................................................

In [2]:
# The code in the next two cells was taken from:
# https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

# Create Dictionary
id2word = corpora.Dictionary(phrase_list)
# Create Corpus
texts = phrase_list
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

from pprint import pprint
# number of topics
num_topics = 10
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.010*"achievement standards" + 0.010*"model age work learning" + '
  '0.010*"creation technologies" + 0.010*"simple problems" + 0.010*"science '
  'courses" + 0.010*"information broken packets" + 0.010*"integrate practices" '
  '+ 0.010*"webbased teaching learning" + 0.010*"interrelationship technology" '
  '+ 0.010*"technology fluency"'),
 (1,
  '0.010*"interest level design" + 0.010*"content skills" + 0.010*"new '
  'knowledge" + 0.010*"maintenance technical support" + 0.010*"simple hardware '
  'problems" + 0.010*"strong support strategies" + 0.010*"possible problems" + '
  '0.010*"various tools" + 0.010*"future transformed technology" + '
  '0.010*"tools guidance"'),
 (2,
  '0.010*"science social studies" + 0.010*"lms research tools" + '
  '0.010*"science programming" + 0.010*"information life cycle" + '
  '0.010*"course database systems" + 0.010*"revised standards grades" + '
  '0.010*"effective instruction" + 0.010*"school decisions" + 0.010*"standards '
  'organized sec

In [3]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join(text_file_location + str(num_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, text_file_location + str(num_topics) +'.html')
LDAvis_prepared

In [5]:
# search documents for key words. Create a boolean value to show if key word is present in document or not
import pandas as pd

key_words = ['technology literacy', 'digital literacy','digital literacy classrooms', 
'computer science classrooms', 'problem solving skills', 'digital age learning culture',
'equitable resources', 'technology investment', 'infrastructure investment',
'technology skills gap', 'technology-related goals', 'student-centered',
'classroom resources', 'teacher workload', 'digital privacy', 'data driven instruction', 
'computer science standards', 'learning outcomes', 'transform learning',
'flexible learning environments', 'authentic learning', 'rich digital resources',
'digital citizenship', 'classroom technology', 'school decisions', 'technology skills instruction',
'professional development', 'professional learning programs', 'technology gaps', 'digital learning tools',
'digital learning readiness', 'effective technology integration', '21st century schools'
'digital learning specialist', 'technology coordinators', 'chief technology officer', 'instructional technology specialist',
'student access', 'improved student time', 'student thinking', 'personalized learning',
'individual student needs', 'computer science education', 'professional learning communities',
'digital resources', 'district strategic plans', 'new instructional practices', 'educational technology department',
'data management', 'technical assistance', 'college', 'careers', 'consultant', 'teacher librarian',
'teacher collaboration', 'formative assessment', 'common core']

states = ['california', 'connecticut', 'illinois', 'indiana', 'massachussets', 'missouri', 'new_york', 'ohio']
state_plan_dict = {}

for file in os.listdir(text_file_location):
    try:
        f_text = ''
        f = open(file, "r", encoding='utf-8',
                 errors='ignore')
        f_text += f.read().lower()
        for word in key_words:
            if word in f_text:
                state_plan_dict.setdefault(file, {})[word] = 1
            else: 
                state_plan_dict.setdefault(file, {})[word] = 0
    except Exception as e:
        raise e
        print("Sorry, didn't work!")
        
state_plans = pd.DataFrame.from_dict(state_plan_dict, orient='index')
state_plans.to_csv("state_plans2.csv")

  and should_run_async(code)
