In [53]:
import importlib
import os
import sys
import webbrowser
from pathlib import Path

import pandas as pd

# Put the crawler helper directory at the front of the module search path so
# that `scraping_prep` can be imported. The relative path is turned into an
# absolute one based on the current working directory.
sys.path.insert(0, os.path.abspath('../../crawlers/search_engine_preprocess'))

import scraping_prep
# Re-execute the module so that edits made to scraping_prep.py are picked up
# without restarting the kernel.
importlib.reload(scraping_prep)

<module 'scraping_prep' from '/Users/jkim/Desktop/C4C/automated-schools-outreach-system/crawlers/search_engine_preprocess/scraping_prep.py'>

The `view_sample` and `view_all` functions write a pandas DataFrame to an HTML table and open it in a new browser tab or window, replicating the `View()` function in RStudio. Custom CSS styling is included to keep the table easy to read.

In [12]:
# Stylesheet shared by both viewers: light grid lines and small monospace text.
_TABLE_CSS = """<style>
    table { border-collapse: collapse; border: 3px solid #eee; }
    table tr th:first-child { background-color: #eeeeee; color: #333; font-weight: bold }
    table thead th { background-color: #eee; color: #000; }
    tr, th, td { border: 1px solid #ccc; border-width: 1px 0 0 1px; border-collapse: collapse;
    padding: 3px; font-family: monospace; font-size: 10px }</style>
    """


def _open_as_html(df, path):
    """Write ``df`` to ``path`` as a styled HTML table and open it in the browser."""
    # escape=False is kept from the original: cell values are emitted as raw
    # HTML, so any markup stored in the data is rendered (and executed) by the
    # browser. NOTE(review): switch to escape=True if the data is untrusted.
    table_html = df.to_html(escape=False)

    # Declare the charset so the browser decodes the UTF-8 file correctly.
    html = '<meta charset="utf-8">\n' + table_html + _TABLE_CSS

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # non-ASCII characters in the data.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)

    # as_uri() percent-encodes spaces and other special characters and builds a
    # valid file URI on every platform, unlike 'file://' + path concatenation.
    url = Path(path).resolve().as_uri()
    webbrowser.open(url)


def view_sample(df, sample_size, path='temp.html', random_state=None):
    """Open a random sample of ``df`` as an HTML table in the web browser.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from.
    sample_size : int or None
        Number of rows to show. Values larger than ``len(df)`` are clamped to
        ``len(df)`` instead of raising. ``None`` is passed through to
        ``DataFrame.sample`` unchanged.
    path : str, optional
        File the HTML page is written to (overwritten if it exists).
    random_state : optional
        Forwarded to ``DataFrame.sample``; set it to get a repeatable sample.

    Returns
    -------
    None
    """
    if sample_size is not None:
        # DataFrame.sample raises when asked for more rows than exist.
        sample_size = min(sample_size, len(df))
    sample = df.sample(n=sample_size, random_state=random_state)
    _open_as_html(sample, path)


def view_all(df, path='temp.html'):
    """Open every row of ``df`` as an HTML table in the web browser.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to display.
    path : str, optional
        File the HTML page is written to (overwritten if it exists).

    Returns
    -------
    None
    """
    _open_as_html(df, path)

We focus on the National Center for Education Statistics (NCES) Public Elementary/Secondary School Universe Survey for 2022-2023 and the public school CCD geocode dataset. Together, these datasets capture an extensive set of information about all public schools in the United States. The goal is to extract the contact information and geolocation of every elementary and secondary school in the country and make it readily accessible, so that users can ultimately reach out to schools near any US location using the corresponding email addresses.

In [13]:
# Raw Excel inputs, parsed with openpyxl: the CCD public school directory and
# the public school geocodes.
ccd = pd.read_excel('../../data/ccd/ccd_public_school_directory.xlsm', engine='openpyxl')
gd = pd.read_excel('../../data/geocodes/public_school_geocodes.xlsx', engine='openpyxl')

# Companion workbook for the CCD directory; only its 'File Layout' sheet is read here.
# NOTE(review): ccd_docs_file is not closed in the visible cells — confirm
# whether later cells still need the open workbook.
ccd_docs_file = pd.ExcelFile('../../data/ccd/ccd_public_school_directory_companion.xlsx', engine='openpyxl')
ccd_docs = pd.read_excel(ccd_docs_file, sheet_name='File Layout')

# CSV tables from data/clean_datasets — presumably the saved results of an
# earlier cleaning step; verify against the code that writes them.
clean_ccd = pd.read_csv('../../data/clean_datasets/clean_ccd.csv')
clean_ccd_docs = pd.read_csv('../../data/clean_datasets/clean_ccd_docs.csv')
clean_gd = pd.read_csv('../../data/clean_datasets/clean_gd.csv')
schools_information = pd.read_csv('../../data/clean_datasets/schools_information.csv')

# CSV tables from data/no_website_schools: schools without a website and the
# table prepared for search-engine lookups (going by the file names).
schools_without_website = pd.read_csv('../../data/no_website_schools/schools_without_websites.csv')
search_engine_prep = pd.read_csv('../../data/no_website_schools/search_engine_prep.csv')


We check whether `gd` and `ccd` contain the same number of schools. Both have 102,268 rows, which suggests that they cover the same set of schools (equal row counts alone do not prove this, so the join key should still be verified). This makes the two datasets well suited to being joined so that the most useful variables from each can be combined.

In [14]:
# Print the number of rows in the geocode table, then in the directory table.
for frame in (gd, ccd):
    print(len(frame))

102268
102268


Of the schools in `schools_information`, 91.2% are regular schools, while alternative schools (5.3%), special education schools (1.9%), and career and technical schools (1.6%) make up the rest of the school population.

In [15]:

# Tally the schools of each type, then express each tally as a share of the total.
school_type_counts = schools_information['SCH_TYPE_TEXT'].value_counts()
total_schools = school_type_counts.sum()
school_type_proportions = school_type_counts.div(total_schools)

print("Counts of each school type:\n", school_type_counts)
print("\nProportions of each school type:\n", school_type_proportions)

Counts of each school type:
 SCH_TYPE_TEXT
Regular School                 91716
Alternative School              5353
Special Education School        1897
Career and Technical School     1605
Name: count, dtype: int64

Proportions of each school type:
 SCH_TYPE_TEXT
Regular School                 0.911953
Alternative School             0.053226
Special Education School       0.018862
Career and Technical School    0.015959
Name: count, dtype: float64


The overwhelming majority of open schools are regular schools (90,869), which are likely the most easily reachable audience compared with the other school types. Alternative schools come second (5,233), followed by special education schools (1,848) and career and technical schools (1,578).

In [16]:
# Count schools for every (status, school type) pair, express each count as a
# share of all counted schools, and list the pairs from rarest to most common.
school_type_status = (
    schools_information
    .groupby(['UPDATED_STATUS_TEXT', 'SCH_TYPE_TEXT'])
    .size()
    .reset_index(name='counts')
    .assign(proportion=lambda table: table['counts'] / table['counts'].sum())
    .sort_values(by='proportion')
)

print(school_type_status)

        UPDATED_STATUS_TEXT                SCH_TYPE_TEXT  counts  proportion
5   Changed Boundary/Agency  Career and Technical School       3    0.000030
1                     Added  Career and Technical School       4    0.000040
4   Changed Boundary/Agency           Alternative School       4    0.000040
15                 Reopened           Alternative School       7    0.000070
3                     Added     Special Education School       8    0.000080
16                 Reopened               Regular School      15    0.000149
0                     Added           Alternative School      16    0.000159
8                       New  Career and Technical School      20    0.000199
10                      New     Special Education School      41    0.000408
2                     Added               Regular School      46    0.000457
6   Changed Boundary/Agency               Regular School      50    0.000497
7                       New           Alternative School      93    0.000925

In [49]:
# Open 50 randomly chosen rows of schools_information as an HTML table in the browser.
view_sample(schools_information, 50)

In [54]:
# Build search-engine query strings from the prep CSV.
# NOTE(review): judging by the output below, the helper prints the loaded table
# and returns one string per school in the form
# "<name> website <street> <city> <state>" — confirm against scraping_prep.py.
scraping_prep.csv_to_array_of_strings("../../data/no_website_schools/search_engine_prep.csv")

                                SCH_NAME                   LSTREET1  \
0              Albertville Middle School          600 E Alabama Ave   
1                Albertville High School           402 E McCord Ave   
2        Albertville Intermediate School         901 W McKinney Ave   
3          Albertville Elementary School         145 West End Drive   
4      Albertville Kindergarten and PreK        257 Country Club Rd   
...                                  ...                        ...   
66975         LOCKHART ELEMENTARY SCHOOL           41 ESTATE THOMAS   
66976    ULLA F MULLER ELEMENTARY SCHOOL          7B ESTATE CONTANT   
66977    YVONNE BOWSKY ELEMENTARY SCHOOL  15B and 16 ESTATE MANDAHL   
66978         CANCRYN JUNIOR HIGH SCHOOL                1 CROWN BAY   
66979       BERTHA BOSCHULTE JUNIOR HIGH         9 1 and 12A BOVONI   

              LCITY            STATENAME  
0       Albertville              ALABAMA  
1       Albertville              ALABAMA  
2       Albertvill

['Albertville Middle School website 600 E Alabama Ave Albertville ALABAMA',
 'Albertville High School website 402 E McCord Ave Albertville ALABAMA',
 'Albertville Intermediate School website 901 W McKinney Ave Albertville ALABAMA',
 'Albertville Elementary School website 145 West End Drive Albertville ALABAMA',
 'Albertville Kindergarten and PreK website 257 Country Club Rd Albertville ALABAMA',
 'Albertville Primary School website 1100 Horton Rd Albertville ALABAMA',
 'Asbury High School website 1990 Asbury Rd Albertville ALABAMA',
 'Claysville School website 140 Claysville School Rd Guntersville ALABAMA',
 'Douglas Elementary School website Highway 75 Douglas ALABAMA',
 'Douglas High School website 225 Eagle Drive Douglas ALABAMA',
 'Brindlee Mountain Elementary School website 2233 Shoal Creek Rd Arab ALABAMA',
 'Kate D Smith DAR High School website 6077 Main St Grant ALABAMA',
 'Brindlee Mountain Primary School website 1050 Scant City Road Guntersville ALABAMA',
 'Marshall Technical