In [1]:
import pandas as pd

In [5]:
# Load the Excel file to inspect its contents
file_path = 'data/raw/2023SeniorSecondaryCompletionAndAchievementInformation.xlsx'
excel_data = pd.ExcelFile(file_path)
excel_data

<pandas.io.excel._base.ExcelFile at 0x7f368e448490>

In [8]:
# Load the relevant sheet into a dataframe, skipping the metadata rows
school_df = pd.read_excel(file_path, sheet_name='postcomp_for_publication', skiprows=8)

# Set the second row (index 1) as the actual column headers
school_df.columns = school_df.iloc[1]  # Set the second row as the headers
school_df = school_df.drop([0, 1])  # Drop the first two rows which contain redundant information

# Dropping any fully empty rows
school_df = school_df.dropna(how='all')

# Resetting the index to clean the dataframe
school_df = school_df.reset_index(drop=True)
school_df

1,School,Small School,Locality,Number of VCE and VCE Vocational Major (VM) studies at Units 3 and 4 level with enrolments,Number of Vocational Education and Training (VET) certificates with enrolments,Number of Higher Education studies (HES) with enrolments,Enrolment(s) in VCE Vocational Major (VM) unit(s)​,Enrolment(s) in the Victorian Pathways Certificate (VPC),Enrolment(s) in the International Baccalaureate (IB) Diploma,Enrolment(s) in Northern Hemisphere Timetable (NHT) delivery at Units 3 and 4 level ​,...,Number of students enrolled in the Victorian Certificate of Applied Learning (VCAL) at Intermediate level (2023 only),Percentage of satisfactory VCE completions,Number of students awarded the VCE Vocational Major (VM),Number of students awarded the VCE (Baccalaureate),Percentage of Vocational Education and Training (VET) units of competency completed,Percentage of Higher Education studies (HES) completed,Median VCE study score,Percentage of study scores of 40 and over,Percentage of Victorian Certificate of Applied Learning (VCAL) units completed (2023 only),Number of students awarded the Victorian Pathways Certificate (VPC)
0,Academy of Mary Immaculate,,FITZROY,33,10,-,,,,,...,-,99,-,8,98,-,31,10.3,-,-
1,Adass Israel School,,ELSTERNWICK,9,3,-,Y,,,,...,< 4,100,18,-,82,-,-,-,I/D,-
2,Adass Israel School,*,EAST ST KILDA,2,2,-,,Y,,,...,< 4,-,-,-,76,-,-,-,I/D,-
3,Advance College of Education,*,ROSEBUD WEST,13,19,-,Y,Y,,,...,-,75,< 4,-,59,-,-,-,-,15
4,Aitken College,,GREENVALE,41,11,2,Y,,,,...,-,99,5,< 4,100,I/D,28,5.6,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,Yarrawonga College P-12,,YARRAWONGA,30,21,6,Y,,,,...,< 4,98,15,-,92,I/D,26,3.7,I/D,-
594,Yea High School,,YEA,33,12,-,Y,Y,,,...,-,85,4,-,73,-,25,-,-,-
595,Yeshivah College,,ST KILDA EAST,14,2,-,,,,,...,-,100,-,-,91,-,34,17.9,-,-
596,Yesodei HaTorah College,,ORMOND,11,4,-,,,,,...,-,100,-,-,95,-,37,20,-,-


In [9]:
school_df.to_csv("data/curated/vcaa_results.csv")

# VCAA

In [10]:
'''
file_path = 'data/curated/vcaa_results.csv'
school_df = pd.read_csv(file_path)
school_df.head()
'''

"\nfile_path = 'data/curated/vcaa_results.csv'\nschool_df = pd.read_csv(file_path)\nschool_df.head()\n"

In [11]:
# Extract relevant columns: School, Locality (suburb), and Percentage of study scores of 40 and over
extracted_data = school_df[['School', 'Locality', 'Percentage of study scores of 40 and over']]

# Renaming the columns to make them shorter and more readable
school_df = extracted_data.rename(columns={
    'School': 'school',
    'Locality': 'suburb',
    'Percentage of study scores of 40 and over': 'pct_scores_40_over'
})

# Handling missing or invalid data in 'Pct_Scores_40_Over' by replacing '-' with NaN
school_df['pct_scores_40_over'] = pd.to_numeric(school_df['pct_scores_40_over'], errors='coerce')


In [12]:
school_df

1,school,suburb,pct_scores_40_over
0,Academy of Mary Immaculate,FITZROY,10.3
1,Adass Israel School,ELSTERNWICK,
2,Adass Israel School,EAST ST KILDA,
3,Advance College of Education,ROSEBUD WEST,
4,Aitken College,GREENVALE,5.6
...,...,...,...
593,Yarrawonga College P-12,YARRAWONGA,3.7
594,Yea High School,YEA,
595,Yeshivah College,ST KILDA EAST,17.9
596,Yesodei HaTorah College,ORMOND,20.0


In [16]:
school_df['school'] = school_df['school'].str.lower()
school_df['suburb'] = school_df['suburb'].str.lower()

# Sorting the data by 'Pct_Scores_40_Over' in descending order and creating a rank column
school_df = school_df.sort_values(by='pct_scores_40_over', ascending=False).reset_index(drop=True)
school_df['rank'] = school_df['pct_scores_40_over'].rank(method='min', ascending=False)
school_df

1,school,suburb,pct_scores_40_over,rank
0,ballarat clarendon college,ballarat,45.8,1.0
1,bialik college,hawthorn,34.1,2.0
2,huntingtower school,mount waverley,33.8,3.0
3,mount scopus memorial college,burwood,31.5,4.0
4,ruyton girls' school,kew,31.4,5.0
...,...,...,...,...
593,wyndham comm & educ centre,werribee,,
594,yarra hills sec coll mt evelyn,mount evelyn,,
595,yarra hills sec coll mooroolbark,mooroolbark,,
596,yea high school,yea,,


In [29]:
# Define thresholds for "Good", "Mid", and "Bad" schools based on the pct_scores_40_over
def categorize_school(pct_scores_40_over):
    if pct_scores_40_over > 15:
        return 'high'
    elif pct_scores_40_over >= 10:
        return 'good'
    elif pct_scores_40_over >= 4.5:
        return 'mid'
    else:
        return 'low'

# Apply the categorization function to create a new column 'school_quality'
school_df['school_quality'] = school_df['pct_scores_40_over'].apply(categorize_school)


In [30]:
school_df

1,school,suburb,pct_scores_40_over,rank,school_quality
0,ballarat clarendon college,ballarat,45.8,1.0,high
1,bialik college,hawthorn,34.1,2.0,high
2,huntingtower school,mount waverley,33.8,3.0,high
3,mount scopus memorial college,burwood,31.5,4.0,high
4,ruyton girls' school,kew,31.4,5.0,high
...,...,...,...,...,...
593,wyndham comm & educ centre,werribee,,,low
594,yarra hills sec coll mt evelyn,mount evelyn,,,low
595,yarra hills sec coll mooroolbark,mooroolbark,,,low
596,yea high school,yea,,,low


In [31]:
school_df.to_csv("data/curated/vcaa_ranking.csv")

In [2]:
file_path = 'data/curated/vcaa_ranking.csv'
school_df = pd.read_csv(file_path)
school_df.head()

Unnamed: 0.1,Unnamed: 0,school,suburb,pct_scores_40_over,rank,school_quality
0,0,ballarat clarendon college,ballarat,45.8,1.0,high
1,1,bialik college,hawthorn,34.1,2.0,high
2,2,huntingtower school,mount waverley,33.8,3.0,high
3,3,mount scopus memorial college,burwood,31.5,4.0,high
4,4,ruyton girls' school,kew,31.4,5.0,high


In [3]:
aggregated_data = school_df.groupby(['suburb', 'school_quality']).size().unstack(fill_value=0)
aggregated_data

school_quality,good,high,low,mid
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aberfeldie,0,0,0,1
albert park,0,0,0,1
alexandra,0,0,1,0
alphington,0,1,0,0
altona,0,0,1,1
...,...,...,...,...
wyndham vale,0,0,1,0
yarra junction,0,0,2,1
yarram,0,0,1,0
yarrawonga,0,0,2,0


In [7]:
aggregated_data = aggregated_data[['high', 'good', 'mid']]
aggregated_data

school_quality,high,good,mid
suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aberfeldie,0,0,1
albert park,0,0,1
alexandra,0,0,0
alphington,1,0,0
altona,0,0,1
...,...,...,...
wyndham vale,0,0,0
yarra junction,0,0,1
yarram,0,0,0
yarrawonga,0,0,0


In [8]:
aggregated_data.to_csv("data/curated/num_good_schools.csv")

# Merge with schools by suburb

In [25]:
file_path = 'data/curated/num_good_schools.csv'
school_quality_df = pd.read_csv(file_path)
school_quality_df.head()

Unnamed: 0,suburb,high,good,mid
0,aberfeldie,0,0,1
1,albert park,0,0,1
2,alexandra,0,0,0
3,alphington,1,0,0
4,altona,0,0,1


In [26]:
file_path = 'data/curated/schools_by_suburb.csv'
school_type_df = pd.read_csv(file_path)
school_type_df.head()

Unnamed: 0.1,Unnamed: 0,Address_Town,num_primary,num_secondary_public,num_secondary_private,num_secondary_catholic,num_special
0,0,Abbotsford,2,0,1,0,0
1,1,Aberfeldie,1,0,0,1,0
2,2,Aintree,1,0,0,0,0
3,3,Aireys Inlet,1,0,0,0,0
4,4,Airly,1,0,0,0,0


In [27]:
school_type_df['Address_Town'] = school_type_df['Address_Town'].str.lower()
merged_df = school_quality_df.merge(school_type_df, left_on='suburb', right_on='Address_Town', how='inner')
merged_df.head()

Unnamed: 0.1,suburb,high,good,mid,Unnamed: 0,Address_Town,num_primary,num_secondary_public,num_secondary_private,num_secondary_catholic,num_special
0,aberfeldie,0,0,1,1,aberfeldie,1,0,0,1,0
1,albert park,0,0,1,7,albert park,2,1,0,0,0
2,alexandra,0,0,0,10,alexandra,2,1,0,0,0
3,alphington,1,0,0,13,alphington,3,0,1,0,0
4,altona,0,0,1,14,altona,3,1,0,1,0


In [28]:
# Dropping the 'Unnamed: 0' and 'Address_Town' columns
merged_df = merged_df.drop(columns=['Unnamed: 0', 'Address_Town'])

# Renaming the columns 'high', 'good', and 'mid'
merged_df = merged_df.rename(columns={
    'high': 'v_high_vcaa',
    'good': 'high_vcaa',
    'mid': 'good_vcaa',
    'num_special': 'num_edu_centre'
})

# Displaying the first few rows to confirm changes
merged_df.head()

Unnamed: 0,suburb,v_high_vcaa,high_vcaa,good_vcaa,num_primary,num_secondary_public,num_secondary_private,num_secondary_catholic,num_edu_centre
0,aberfeldie,0,0,1,1,0,0,1,0
1,albert park,0,0,1,2,1,0,0,0
2,alexandra,0,0,0,2,1,0,0,0
3,alphington,1,0,0,3,0,1,0,0
4,altona,0,0,1,3,1,0,1,0


VCAA stands for Victorian Curriculum and Assessment Authority
v_high_vcaa means number of schools that get very high study score (VCE exams)

In [29]:
merged_df.to_csv("data/curated/schools_preprocessed.csv")