In [15]:
# Dependencies:
import pandas as pd


In [16]:
## open the school demographic data

df_public = pd.read_csv('data\MHSAA\school_info\school_info_PUBLIC.csv')
df_private = pd.read_csv('data\MHSAA\school_info\school_info_PRIVATE.csv')

## Add a column to each dataframe that indicates the type of school
df_public['school_type'] = 'public'
df_private['school_type'] = 'private'

In [17]:
## Transform the dataframes to have the same column names
# df_public.info()

# df_private.info()

In [18]:
# Transform the Private school data to more readable format

# Create a dictionary mapping locale codes to descriptions
locale_dict = {
    11: 'City: Large',
    12: 'City: Midsize',
    13: 'City: Small',
    21: 'Suburb: Large',
    22: 'Suburb: Midsize',
    23: 'Suburb: Small',
    31: 'Town: Fringe',
    32: 'Town: Distant',
    33: 'Town: Remote',
    41: 'Rural: Fringe',
    42: 'Rural: Distant',
    43: 'Rural: Remote'
}

relig_dict = {
    1: 'Catholic',
    2: 'Other Religious',
    3: 'Nonsectarian'

}

ort_dict = {
    1: 'Roman Catholic',
    2: 'African Methodist Episcopal',
    3: 'Amish',
    4: 'Assembly of God',
    5: 'Baptist',
    6: 'Brethren',
    7: 'Calvinist',
    8: 'Christian (no denomination specified)',
    9: 'Church of Christ',
    10: 'Church of God',
    11: 'Church of God in Christ',
    12: 'Church of the Nazarene',
    13: 'Disciples of Christ',
    14: 'Episcopal',
    15: 'Friends',
    16: 'Greek Orthodox',
    17: 'Islamic',
    18: 'Jewish',
    19: 'Latter Day Saints',
    20: 'Lutheran Church – Missouri Synod',
    21: 'Evangelical Lutheran Church in America',
    22: 'Wisconsin Evangelical Lutheran Synod',
    23: 'Other Lutheran',
    24: 'Mennonite',
    25: 'Methodist',
    26: 'Penecostal',
    27: 'Presbyterian',
    28: 'Seventh Day Adventists',
    29: 'Other',
    30: 'Nonsectarians'
}

## Change PSS_COMM_TYPE to a more readable format
# This is an higher level classification of Locale

# Create a dictionary mapping locale codes to descriptions
comm_dict = {
    1: 'City',
    2: 'Suburban',
    3: 'Town',
    4: 'Rural'
}

# Dictionary for COOED values
coed_dict = {
    1: 'Coed',
    2: 'Female_Only',
    3: 'Male_Only'
}


# Apply the mapping to the columns
df_private['PSS_COED'] = df_private['PSS_COED'].map(coed_dict)
df_private['PSS_LOCALE'] = df_private['PSS_LOCALE'].map(locale_dict)
df_private['PSS_COMM_TYPE'] = df_private['PSS_COMM_TYPE'].map(comm_dict)
df_private['PSS_ORIENT'] = df_private['PSS_ORIENT'].map(ort_dict)
df_private['PSS_RELIG'] = df_private['PSS_RELIG'].map(relig_dict)


# Calculate the High School Enrollment (Grades 9-12)
cols_to_sum = ['PSS_ENROLL_9', 'PSS_ENROLL_10', 'PSS_ENROLL_11', 'PSS_ENROLL_12', 'PSS_ENROLL_UG']
df_private['HS_ENROLL'] = df_private[cols_to_sum].fillna(0).sum(axis=1)

In [19]:
# Renaming columns in df_private to match df_public
df_private = df_private.rename(columns={
    'PSS_INST': 'School Name',
    'PSS_ADDRESS': 'Street Address',
    'PSS_CITY': 'City',
    'PSS_PHONE': 'Phone',
    'PSS_ZIP5': 'ZIP',
    'PSS_ZIP4': 'ZIP 4-digit',
    'PSS_COUNTY_NAME': 'County Name',
    'PSS_COUNTY_FIPS': 'County FIPS',
    'PSSCOUNTY_NO': 'County Number',
    'PSS_STABB': 'State',
    'PSS_ORIENT': 'Relig_subtype',
    'PSS_RELIG': 'Relig',
    'PSS_LOCALE': 'Locale',
    'PSS_COMM_TYPE': 'Community_Type',
    'PSS_INDIAN_PCT': 'demo_pct_American Indian',
    'PSS_ASIAN_PCT': 'demo_pct_Asian',
    'PSS_HISP_PCT': 'demo_pct_Hispanic',
    'PSS_BLACK_PCT': 'demo_pct_Black',
    'PSS_WHITE_PCT': 'demo_pct_White',
    'PSS_PACISL_PCT': 'demo_pct_Pacific Islander',
    'PSS_TWOMORE_PCT': 'demo_pct_Two_or_More',

    'PSS_COED': 'Coed',

    'PSS_STDTCH_RT': 'Student Teacher Ratio',
    'PSS_SCHOOL_ID' : 'School ID(private)',
   
})

# Drop the columns that are not needed
drop_col = ['LoGrade','HiGrade','PSS_RACE_AI','PSS_RACE_AS','PSS_RACE_H',
            'PSS_RACE_B','PSS_RACE_W','PSS_RACE_P','PSS_RACE_2','PSS_TYPE',
            'PSS_LEVEL','PSS_ASSOC_2','PSS_ASSOC_3','PSS_FIPS','PSS_SCH_DAYS',
            'PSS_STU_DAY_HRS','PSS_LIBRARY','PSS_ENROLL_UG','PSS_ENROLL_PK',
            'PSS_ENROLL_K','PSS_ENROLL_1','PSS_ENROLL_2','PSS_ENROLL_3','PSS_ENROLL_4',
            'PSS_ENROLL_5','PSS_ENROLL_6','PSS_ENROLL_7','PSS_ENROLL_8','PSS_ENROLL_9',
            'PSS_ENROLL_10','PSS_ENROLL_11','PSS_ENROLL_12','PSS_ENROLL_T','PSS_ENROLL_TK12',
            'PSS_FTE_TEACH'

            ]
# print(df_private.columns)


# Creat new columns from the list and Fill the following columns with 'No'


fill_col = ['Charter','Magnet','Title I School','Title 1 School Wide']

for col in fill_col:
    df_private[col] = 'No'

# Drop the columns that are not needed
df_private = df_private.drop(columns=drop_col)

In [20]:
## Make some changes to the public school data

# Rename columns to match private school data
# Rename Students to HS_ENROLL

df_public = df_public.rename(columns={'Students': 'HS_ENROLL'})

# Create and fill some columns that are from the private school data
df_public = df_public.assign(
    **{'Coed': 'Yes',
    'Relig': 'None',
    'Community_Type': df_public['Locale'].str.split(':').str[0],
    'Relig_subtype': 'None'}
)


In [21]:
## Lookup The demographic data for the public schools before merging

### PARSE PUBLIC SCHOOL DEMO DATA FROM NCES WEBSITE

# import pandas as pd
# import requests
# from bs4 import BeautifulSoup
# from tqdm import tqdm
# import os
# from IPython.display import clear_output

# import time

# def parse_school_data(school_ids):
#     # Start with an empty DataFrame to store the results
#     df = pd.DataFrame()

#     # We use tqdm to create a progress bar
#     for school_id in tqdm(school_ids):

        
#         # Build the URL for this school
#         url = f"https://nces.ed.gov/ccd/schoolsearch/school_detail.asp?Search=1&SchoolID={school_id}&ID={school_id}&SchoolType=1&SchoolType=2&SchoolType=3&SchoolType=4&SpecificSchlTypes=all&IncGrade=-1&LoGrade=-1&HiGrade=-1&ID2={school_id}"

#         print(f"Processing school: {school_id}")

#         ## Parse with pandas
#         # Read the tables on the page into a list of DataFrames
#         df_test = pd.read_html(url)

#         # Process and print statements for debugging
#         # print("Parsed html, extracting information...")

#         # Get the value in the first column of the first row and drop everything in from of the \t
#         school_name = df_test[2].iloc[0,0].split('\t')[1]
#         school_id = df_test[2].iloc[0,2].split('ID:')[1].strip()
#         school_district = df_test[2].iloc[2,0].split('\t')[1].split('district information')[0]
#         school_district_id = df_test[2].iloc[2,2].split('ID:')[1].strip()

#         #Get Total Students
#         total_students = df_test[6].iloc[0,2]
#         # Get Teachers at the school
#         teachers = df_test[6].iloc[1,2]
#         # Get the student/teacher ratio
#         student_teacher_ratio = df_test[6].iloc[2,2]

#         # Get all the grade levels and counts
#         grade_levels = df_test[10].iloc[0,1:7]
#         grade_counts = df_test[10].iloc[1,1:7]
#         grade_dict = dict(zip(grade_levels, grade_counts))

#         # Get free and reduced lunch counts
#         lunch_free_count = df_test[16].iloc[0,0].split(':')[1].strip()
#         lunch_reduced_count = df_test[16].iloc[0,1].split(':')[1].strip()
#         lunch_total_count = df_test[16].iloc[0,2].split(':')[1].strip()

#         # Get the demographic categories and counts
#         demo_cat = df_test[12].iloc[0,1:8]
#         demo_counts = df_test[12].iloc[1,1:8]
#         demo_dict = dict(zip(demo_cat, demo_counts))

#         # Get the gender categories and counts
#         gender_cat = df_test[14].iloc[0,1:3]
#         gender_counts = df_test[14].iloc[1,1:3]
#         gender_dict = dict(zip(gender_cat, gender_counts))

#         print("Information extracted, saving to dataframe...")

#         # Save the parsed data to a row in a dataframe
#         df_test_row = pd.DataFrame({'school_name': school_name,
#         'school_id': school_id,
#         'school_district': school_district,
#         'school_district_id': school_district_id,
#         'total_students': total_students,
#         'teachers': teachers,
#         'student_teacher_ratio': student_teacher_ratio,
#         'lunch_free_count': lunch_free_count,
#         'lunch_reduced_count': lunch_reduced_count,
#         'lunch_total_count': lunch_total_count,
#         **grade_dict,
#         **demo_dict,
#         **gender_dict
#         }, index=[0])

#         # Append the new row to our DataFrame
#         df = pd.concat([df, df_test_row], ignore_index=True)

#         # add a delay to prevent getting blocked
#         time.sleep(1)

#         # print(f"Finished processing school: {school_id}")
        
#         # Clear the output
#         # clear_output(wait=True)

#     return df



In [22]:
## Get the list of NCES id numbers and parse the data

## Get a list of the school Ids for all the public schools
nces_ids_list = df_public['NCES School ID'].tolist()




test = nces_ids_list[0:5]

In [9]:
df_public.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314 entries, 0 to 1313
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   School Name            1314 non-null   object 
 1   NCES School ID         1314 non-null   int64  
 2   County Name            1314 non-null   object 
 3   Street Address         1314 non-null   object 
 4   City                   1314 non-null   object 
 5   State                  1314 non-null   object 
 6   ZIP                    1314 non-null   int64  
 7   ZIP 4-digit            1171 non-null   float64
 8   Phone                  1314 non-null   object 
 9   Locale Code            1314 non-null   int64  
 10  Locale                 1314 non-null   object 
 11  Charter                1314 non-null   object 
 12  Magnet                 1314 non-null   object 
 13  Title I School         1314 non-null   object 
 14  Title 1 School Wide    1314 non-null   object 
 15  HS_E

In [12]:
### Feed the list of school ids into the function
# df = df_public 


# df = parse_school_data(nces_ids_list)

# print(df.head())

In [13]:
df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314 entries, 0 to 1313
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   School Name            1314 non-null   object 
 1   NCES School ID         1314 non-null   int64  
 2   County Name            1314 non-null   object 
 3   Street Address         1314 non-null   object 
 4   City                   1314 non-null   object 
 5   State                  1314 non-null   object 
 6   ZIP                    1314 non-null   int64  
 7   ZIP 4-digit            1171 non-null   float64
 8   Phone                  1314 non-null   object 
 9   Locale Code            1314 non-null   int64  
 10  Locale                 1314 non-null   object 
 11  Charter                1314 non-null   object 
 12  Magnet                 1314 non-null   object 
 13  Title I School         1314 non-null   object 
 14  Title 1 School Wide    1314 non-null   object 
 15  HS_E

In [23]:
## Output csv to check the data
# df.to_csv('test_nces_parse.csv', index=False)
# df.dtypes

df_public.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1314 entries, 0 to 1313
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   School Name            1314 non-null   object 
 1   NCES School ID         1314 non-null   int64  
 2   County Name            1314 non-null   object 
 3   Street Address         1314 non-null   object 
 4   City                   1314 non-null   object 
 5   State                  1314 non-null   object 
 6   ZIP                    1314 non-null   int64  
 7   ZIP 4-digit            1171 non-null   float64
 8   Phone                  1314 non-null   object 
 9   Locale Code            1314 non-null   int64  
 10  Locale                 1314 non-null   object 
 11  Charter                1314 non-null   object 
 12  Magnet                 1314 non-null   object 
 13  Title I School         1314 non-null   object 
 14  Title 1 School Wide    1314 non-null   object 
 15  HS_E

In [14]:
## Claculate the percentages for the demographic data so it matches with the private school data

# Create a list of the columns to calculate percentages for
demo_list = ['American Indian/Alaska Native', 'Asian', 'Black', 
             'Hispanic', 'White', 'Native Hawaiian/Pacific Islander', 
             'Two or MoreRaces']

# calculate and store the percentages
for col in demo_list:
    df[col] = df[col].astype(float)
    df[col] = df[col] / df['total_students'].astype(float)


KeyError: 'American Indian/Alaska Native'

In [None]:
# df.head()

In [None]:
## Calc male and femal percentages

df['Male'] = df['Male'] / df['total_students']
df['Female'] = df['Female'] / df['total_students']

In [None]:
### clean the data to only the high school level (grades 9-12)

## Add the values in the 9 10 11 12 columns to get HS_ENROLL by column position
df['HS_ENROLL'] = df.iloc[:, 9:13].sum(axis=1)

# drop any rows that have a 0 in the HS_ENROLL column
df = df[df['HS_ENROLL'] != 0]

df.head()
df.info()




In [None]:
## Save the data to a csv file
df.to_csv('public_school_demo_data.csv', index=False)

In [None]:
## df_public is the dataframe with the public school data

## df is the dataframe with the parsed demographic data

## Merge the two dataframes on the NCES School ID

df_merged = pd.merge(df_public, df, how='left', left_on='NCES School ID', right_on='school_id')

df_merged.head()

df_public = df_merged


In [None]:
### get a list of the school names from the public school data and the private school data

# public_school_names = df_public['School Name'].tolist()
# private_school_names = df_private['School Name'].tolist()