# Web Scraping for MDE data

In this note book we will explore downloading Minnesota Department of Education data and looking up the ACT code for schools missing that code. For this script you will need to have the packages below as well as a geckodriver with a path available to where this script is being executed. A web browser will be launched to select the actions needed to acquire the department of education data and Selenium will be used with a Python wrapper. 

In [1]:
## Read into memory the classes and functions needed

from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.common.keys import Keys
import time
import os
from socket import socket
import pandas as pd
import wget
import xlrd
import xlwt
from xlrd.sheet import ctype_text 
import datetime
import shutil
#import messytables

In [2]:
# Set the preferences for the firefox web browser
fp = webdriver.FirefoxProfile()
fp.set_preference('browser.download.folderList', 2)
fp.set_preference('browser.download.manager.showWhenStarting', False)
fp.set_preference('browser.download.dir', '/tmp')
fp.set_preference("http.response.timeout", 300)
fp.set_preference("dom.max_script_run_time", 300)
fp.set_preference('webdriver.load.strategy', 'unstable')

#fp.set_preference("browser.helperApps.neverAsk.openFile", "application/octet-stream");
#fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream");

fp.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/plain, application/vnd.ms-excel, text/csv, text/comma-separated-values, application/octet-stream, application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
fp.update_preferences()

In [3]:
# MDE web app for hs enrollment data
searchURL = 'http://w20.education.state.mn.us/MDEAnalytics/Data.jsp'

In [4]:
# Launch web browser and navigate to the searchURL
driver = webdriver.Firefox(fp)
driver.get(searchURL)

In [None]:
# switch into the frame titled 'floatframe'
driver.switch_to.frame(driver.find_element_by_name('floatframe'))

In [None]:
# find the element by xpath that has the text 'Student'
ids_a = driver.find_elements_by_xpath("//*[text()='Student']")

In [None]:
# extract the link into an object
for ii in ids_a:
    link = ii.get_attribute('href')
time.sleep(4)

In [None]:
print(link)

In [None]:
#open the link to Student level data reporting on the MDE site
driver.get(link)
time.sleep(3)

In [None]:
# Switch into the frame titled 'floatframe'
driver.switch_to.frame(driver.find_element_by_name('floatframe'))

In [None]:
# Click the button to display all data reporting around Students
driver.find_element_by_xpath("//input[@id='button1' and @name='button1']").click()

In [None]:
# Switch into the frame from the output of the button click
driver.switch_to.frame(driver.find_element_by_name('Report'))

## Create new folders if they don't already exist for MDE, or however many states. 

Can also keep everything in one folder as well.

In [None]:
newpath_main = r'C:\MDE_Data'
if not os.path.exists(newpath_main):
    os.makedirs(newpath_main)   
os.chdir(newpath_main)
os.getcwd()

In [None]:
# Can probably remove this sub-dir

In [6]:
EthnicityGender = r'C:\MDE_Data\Enrollment_EthnicityGender'
if not os.path.exists(EthnicityGender):
    os.makedirs(EthnicityGender)
    
os.chdir(EthnicityGender)
os.getcwd()

'C:\\MDE_Data\\Enrollment_EthnicityGender'

In [None]:
# Find all Enrollment by Ethnicity/Gender datasets

TReport_href_Enrollment_by_EthnicityGender = driver.find_elements_by_xpath('//a[@href and contains(@title,"Enrollment by Ethnicity/Gender")]')

In [9]:
# Create a rolling window object to select a subset of files to download.
CurrentTime = datetime.datetime.now()
ThisYearFile = str(CurrentTime.year - 1) + '-'
ThisYearFile
#for ii in TReport_href_Enrollment_by_EthnicityGender:
#    if ThisYearFile in 
LastYearFile = str(CurrentTime.year - 2) + '-'
LastYearFile
#TwoYearsAgo = print(str(CurrentTime.year - 3) + '-')
#TwoYearsAgo
#ThreeYearsAgo = print(str(CurrentTime.year - 4) + '-')
#ThreeYearsAgo
#FourYearsAgo = print(str(CurrentTime.year - 5) + '-')
#FourYearsAgo

'2016-'

In [10]:
ThisYearFile

'2017-'

In [None]:
TReport_href_Enrollment_by_EthnicityGender[0].get_attribute('title')

In [None]:
 # All Enrollment by Ethnicity/Gender datasets
downloadsTitle = []
downloadURL = []
for ii in TReport_href_Enrollment_by_EthnicityGender:
    if ThisYearFile in ii.get_attribute('title') or LastYearFile in ii.get_attribute('title'):
        wget.download(ii.get_attribute('href'), print(ii.get_attribute('title')))
        downloadsTitle.append(ii.get_attribute('title'))
        downloadURL.append(ii.get_attribute('href'))

        

In [11]:
ThisYearFileExtended = ThisYearFile + str(CurrentTime.year)

In [12]:
ThisYearFileExtended

'2017-2018'

In [13]:
CurrentSeniorsFile = ThisYearFileExtended + ' Enrollment by Ethnicity_Gender.xlsx'

In [14]:
CurrentSeniorsFile

'2017-2018 Enrollment by Ethnicity_Gender.xlsx'

In [15]:
os.chdir(EthnicityGender)
CurrentSeniorsPublic = pd.read_excel(CurrentSeniorsFile, sheetname = 'School')

In [16]:
LastYearFileExtended = LastYearFile + str(CurrentTime.year - 1)

In [17]:
LastYearFileExtended

'2016-2017'

In [None]:
# GET MDE Lookup Table

In [None]:
wget.download('http://w20.education.state.mn.us/MdeOrgView/tag/extractContacts/MDEORG_DISTRICT_SCHOOL?description=')

In [18]:
import datetime
filename =  'School_' + datetime.datetime.today().strftime('%m-%d-%Y') + '.csv'

In [23]:
filename = 'School_06-05-2018' + '.csv'

In [24]:
LookUp = pd.read_csv(filename)

In [25]:
CurrentSeniorsPublic.columns

Index(['DataYear', 'DistrictCountyNumber', 'DistrictCountyName',
       'districtNumber', 'districtType', 'DistrictName', 'schoolNumber',
       'SchoolName', 'ECSUNumber', 'EconomicDevelopmentRegion', 'Grade',
       'AMI_Male', 'AMI_Female', 'ASI_Male', 'ASI_Female', 'BLK_Male',
       'BLK_Female', 'HIS_Male', 'HIS_Female', 'HPI_Male', 'HPI_Female',
       'MLT_Male', 'MLT_Female', 'WHT_Male', 'WHT_Female', 'TotalMale',
       'TotalFemale', 'TotalMinority', 'TotalStudents', 'Entity',
       'MinMaxGrade', 'SchoolLocationCountyNumber', 'SchoolLocationCountyName',
       'SchoolClassification'],
      dtype='object')

In [26]:
LookUp.columns

Index(['District Number', 'District Type', 'School Number', 'Number',
       'Organization', 'Title', 'Name', 'First Name', 'Last Name', 'Phone',
       'Email', 'Mailing Line 1', 'Mailing Line 2', 'Mailing City',
       'Mailing State', 'Mailing Zip', 'Physical Line 1', 'Physical Line 2',
       'Physical City', 'Physical State', 'Physical Zip', 'County', 'Grades',
       'School Classification', 'Eco Dev Region', 'NCES ID',
       'StateOrganizationId', 'ACT ID', 'Web URL', 'Data Extracted'],
      dtype='object')

In [28]:
PreACT = pd.merge(CurrentSeniorsPublic, LookUp, how = "inner"
                  , left_on = ['districtNumber', 'districtType', 'schoolNumber']
                  , right_on = ['District Number', 'District Type', 'School Number'])

In [29]:
PreACT.columns

Index(['DataYear', 'DistrictCountyNumber', 'DistrictCountyName',
       'districtNumber', 'districtType', 'DistrictName', 'schoolNumber',
       'SchoolName', 'ECSUNumber', 'EconomicDevelopmentRegion', 'Grade',
       'AMI_Male', 'AMI_Female', 'ASI_Male', 'ASI_Female', 'BLK_Male',
       'BLK_Female', 'HIS_Male', 'HIS_Female', 'HPI_Male', 'HPI_Female',
       'MLT_Male', 'MLT_Female', 'WHT_Male', 'WHT_Female', 'TotalMale',
       'TotalFemale', 'TotalMinority', 'TotalStudents', 'Entity',
       'MinMaxGrade', 'SchoolLocationCountyNumber', 'SchoolLocationCountyName',
       'SchoolClassification', 'District Number', 'District Type',
       'School Number', 'Number', 'Organization', 'Title', 'Name',
       'First Name', 'Last Name', 'Phone', 'Email', 'Mailing Line 1',
       'Mailing Line 2', 'Mailing City', 'Mailing State', 'Mailing Zip',
       'Physical Line 1', 'Physical Line 2', 'Physical City', 'Physical State',
       'Physical Zip', 'County', 'Grades', 'School Classification',
  

In [30]:
missingACT = PreACT[(PreACT['ACT ID'].isnull() == True) & (PreACT.Grade == '11')]

In [31]:
ACT = PreACT[(PreACT['ACT ID'].isnull() == False) & (PreACT.Grade == '11')]

In [32]:
len(missingACT)

316

In [33]:
len(ACT)

555

In [34]:
# Pre-Lookup missing ACT

In [35]:
ActLookupRequest = missingACT[['SchoolName', 'Physical City']]

In [36]:
State = 'Minnesota'

In [37]:
ActLookupRequest['State'] = State

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
actURL = 'https://www.act.org/content/act/en/products-and-services/the-act/registration/high-school-codes-lookup.html'

In [45]:
ActLookupRequest = ActLookupRequest.reset_index(drop = True)

In [46]:
import numpy as np
from PIL import ImageGrab
import cv2
import time
import pyautogui
from random import randint
import re
#TODO navigate browser to actURL

In [58]:
driver.get(actURL)
time.sleep(2)

sn = []
act = []

for i in range(len(ActLookupRequest)):
#while time.time() < t_end:# True :
    #driver = webdriver.Firefox(fp)
    #driver.get(SearchURL)MinnesotaMinneapolis
    screen =  np.array(ImageGrab.grab(bbox=(0,40,800,800)))
    #print('Frame took {} seconds'.format(time.time()-last_time))
    pyautogui.click(button='left', x=500, y=500)
    time.sleep(1)
    pyautogui.typewrite('Minnesota')
    time.sleep(1)
    pyautogui.press('enter')
    time.sleep(3)
    pyautogui.click(x=500, y=570, clicks=1, interval = .075, button='left')
    pyautogui.click(x=500, y=570, clicks=1, interval = .075, button='left')
    pyautogui.click(x=500, y=570, clicks=1, interval = .075, button='left')  
    #pyautogui.mouseDown(button='left', x=500, y=550)
    #pyautogui.mouseUp(button='left', x=500, y=550)
    time.sleep(2)

    pyautogui.typewrite(ActLookupRequest['Physical City'][i])   
    pyautogui.click(x=500, y=610, clicks=1, interval = .075, button='left')
    pyautogui.click(x=500, y=610, clicks=1, interval = .075, button='left')
    pyautogui.click(x=500, y=610, clicks=1, interval = .075, button='left')                
    #pyautogui.mouseDown(button='left', x=500, y=620)
    #pyautogui.mouseUp(button='left', x=500, y=620)
    time.sleep(2)
    pyautogui.typewrite(ActLookupRequest.SchoolName[i])
    time.sleep(2)
    #pyautogui.press('enter')
    pyautogui.click(button='left', x=600, y=700, interval = 1)
        
    
    time.sleep(5)
    try:
        xpath = driver.find_element_by_xpath(("//ul[@class='center-code']"))
        actcode = xpath.text
        import re
        actcode = re.sub("[^0-9]", "",actcode)
        act.append(actcode)
        sn.append(ActLookupRequest.SchoolName[i])
        driver.get(actURL)
    except:
        driver.get(actURL)
        continue
        
    #pyautogui.click(x=600, y=500, clicks=1, interval = 5, button='left')
    #pyautogui.typewrite('Minnesota')
    #new_screen = process_img(screen)Minnesota
    #cv2.imshow('window', #new_screen)javascript:void(0)
    #process_img(process_img(np0array(ImageGrab.grab(bbox=(0,40,800,640)))))
    #else:
    #    pass#cv2.destroyAllWindows()javascript:void(0)
    
    #x = randint(10, 798)
    #y = randint(10, 560)
    #pyautogui.mouseDown(button='left', x=x, y=y)
    #pyautogui.click(x=x, y=y, clicks=1, interval=1, button='left')

    #pyautogui.typewrite(SchoolLookUpCodes["Lookup Code"].iloc[i])

    
    #pyautogui.click(x=680, y=400, clicks=1, interval = 1, button='left')
    #driver.find_element_by_xpath(("//table[@class='search-results']/tbody/tr[2]/td"))
    #xpath = driver.find_element_by_xpath(("//table[@class='search-results']/tbody/tr[2]/td"))
    #text = xpath.text
    #AddressPhone.append(text)
    
    #pyautogui.click(x=400, y=400, clicks=1, interval = .075, button='left')
    #pyautogui.click(x=400, y=400, clicks=1, interval = .075, button='left')
    #pyautogui.click(x=400, y=400, clicks=1, interval = .075, button='left')
    #pyautogui.press('delete')
    
    if cv2.waitKey(25) & 0xFF == ord('q'):
        cv2.destroyAllWindows()
        break

In [57]:
driver.find_element_by_xpath(("//ul[@class='center-code']"))

<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="d057e7f1-e183-4645-8d70-d4731d84d7e2", element="607af8ff-4a46-4706-860b-aece7603142f")>

In [None]:
ActLookupRequest['Physical City'].head()



In [64]:
pd.Series(sc)

NameError: name 'sc' is not defined

In [None]:
ACT_CODE_DF = pd.D

In [62]:
len(act)

75

In [None]:
newpath = r'C:\MDE_Data\Enrollment_Help'
if not os.path.exists(newpath):
    os.makedirs(newpath)
    
os.chdir(newpath)
os.getcwd()

In [None]:
# Find all Enrollment by Ethnicity/Gender datasets
#TReport_href_Enrollment_by_EthnicityGender_Help = driver.find_element_by_xpath('//a[@href and @title = "Help file: Enrollment Field Descriptions"]')

In [None]:
TReport_href_Enrollment_by_EthnicityGender_Help = driver.find_element_by_xpath('//a[@href and @title = "Help file: Enrollment Field Descriptions"]')

In [None]:
TReport_href_Enrollment_by_EthnicityGender_Help.get_attribute('href')

In [None]:
link = TReport_href_Enrollment_by_EthnicityGender_Help.get_attribute('href')
title = TReport_href_Enrollment_by_EthnicityGender_Help.get_attribute('title')
wget.download(link, 'Enrollment_Gender_Ethnicity_Help.xlsx')

In [None]:
NonPublic = r'C:\MDE_Data\Enrollment_NonPublic'
if not os.path.exists(NonPublic):
    os.makedirs(NonPublic)
os.chdir(NonPublic)
os.getcwd()

In [None]:
TReport_href_Enrollment_Nonpublic = driver.find_elements_by_xpath('//a[@href and contains(@title,"Enrollment-Nonpublic")]')

In [None]:
 # All Enrollment by Ethnicity/Gender datasets
for ii in TReport_href_Enrollment_Nonpublic:
    wget.download(ii.get_attribute('href'), print(ii.get_attribute('title')))

In [None]:
for filename in os.listdir(os.getcwd()):
    print(filename)

In [None]:
print(os.getcwd())

In [None]:
# TODO add second part to year

In [None]:
os.chdir(EthnicityGender)

In [None]:
ThisYr = pd.read_excel("2017-2018 Enrollment by Ethnicity_Gender.xlsx", sheetname = 'School')

In [None]:
len(ThisYr)

In [None]:
ThisYr.columns

In [None]:
ThisYr = ThisYr[ThisYr.Grade == '11']

In [None]:
LookUp = pd.read_csv("School_06-05-2018.csv")

In [None]:
LookUp.columns

In [None]:
PreACT = pd.merge(ThisYr, LookUp, how = "inner"
                  , left_on = ['districtNumber', 'districtType', 'schoolNumber']
                  , right_on = ['District Number', 'District Type', 'School Number'])

In [None]:
PreACT

In [None]:
PreACT.columns

In [None]:
len(PreACT["ACT ID"].isnull() == True)

In [None]:
len(PreACT[PreACT["ACT ID"].isnull() == False])

In [None]:
ACTLookUpRequest = PreACT["ACT_ID"].isnull()

In [None]:
len(ThisYr[ThisYr.Grade == '11' and pd.isnull(ThisYr["ACT_CODE"])])

In [None]:
LookUp = pd.read_csv("School_06-05-2018.csv")

In [None]:
xl_workbook = xlrd.open_workbook("2017-2018 Enrollment by Ethnicity_Gender.xlsx")

In [None]:
os.getcwd()