In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Folder that holds the raw CSV inputs. Keeping it in one place means the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = Path("C:/Users/lhs70/OneDrive/Desktop/wk4_resource")

school_complete = DATA_DIR / "schools_complete.csv"
student_complete = DATA_DIR / "students_complete.csv"

# Load the school-level and student-level tables.
school_df = pd.read_csv(school_complete)
student_df = pd.read_csv(student_complete)

school_df.head(2)

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,Government,2917,1910635
1,1,Figueroa High School,Government,2949,1884411


In [4]:
# Bin edges for the school `size` column and the display label of each bracket.
size_bins = [0, 1000, 2000, 5000]
labels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

# right=False makes every bracket closed on the left and open on the right:
# [0, 1000), [1000, 2000), [2000, 5000). pd.cut's default (right=True) would
# instead place a school of exactly 1000 students in "Small (<1000)", which
# contradicts the label. Sizes outside the outer edges become NaN.
school_df["Size Ranges"] = pd.cut(
    school_df["size"],
    bins=size_bins,
    labels=labels,
    right=False,
)
school_df.head(2)

Unnamed: 0,School ID,school_name,type,size,budget,Size Ranges
0,0,Huang High School,Government,2917,1910635,Large (2000-5000)
1,1,Figueroa High School,Government,2949,1884411,Large (2000-5000)


In [5]:
# Slim lookup table: school name -> size bracket, used to tag other tables
# through their school_name column.
size_df = school_df.loc[:, ["school_name", "Size Ranges"]]
size_df.head(n=2)

Unnamed: 0,school_name,Size Ranges
0,Huang High School,Large (2000-5000)
1,Figueroa High School,Large (2000-5000)


In [6]:
# Preview the student table; school_name is the column shared with size_df.
student_df.head(n=2)

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score
0,0,Paul Bradley,M,9,Huang High School,96,94
1,1,Victor Smith,M,12,Huang High School,90,43


In [7]:
# Tag every student row with the size bracket of their school.
# validate="many_to_one" makes pandas raise a MergeError if size_df ever holds
# the same school_name more than once; without it such a duplicate would
# silently duplicate student rows and skew any average taken from this table.
school_size_df = pd.merge(
    student_df,
    size_df,
    on="school_name",
    how="left",
    validate="many_to_one",
)
school_size_df.head(2)

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score,Size Ranges
0,0,Paul Bradley,M,9,Huang High School,96,94,Large (2000-5000)
1,1,Victor Smith,M,12,Huang High School,90,43,Large (2000-5000)


In [8]:
# Group the student-level rows by size bracket once and reuse the grouping.
# The means are taken over individual student rows, so within a bracket a
# larger school contributes more rows than a smaller one.
students_by_size = school_size_df.groupby(["Size Ranges"], observed=False)
size_maths_scores = students_by_size["maths_score"].mean()
size_reading_scores = students_by_size["reading_score"].mean()

# Place the two series side by side; `keys` supplies the column headings.
size_1 = pd.concat(
    [size_maths_scores, size_reading_scores],
    axis=1,
    keys=["Average Maths Score", "Average Reading Score"],
)

size_1

Unnamed: 0_level_0,Average Maths Score,Average Reading Score
Size Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1
Small (<1000),72.240461,71.62779
Medium (1000-2000),71.441798,70.724595
Large (2000-5000),69.92645,69.685088


In [9]:
# Per-school summary table, read from a CSV in the to_submit folder.
schools_summary = Path("C:/Users/lhs70/OneDrive/Desktop/wk4_resource/to_submit/schools_summary.csv")
summary_df = pd.read_csv(filepath_or_buffer=schools_summary)

summary_df.head(n=2)

Unnamed: 0,School ID,School name,School type,Total student,Total school budget,Per student budget,Average maths score,Average reading score,Passing Maths (%),Passing Reading (%),Overall Passing (%)
0,0,Huang High School,Government,2917,1910635,655,68.935207,68.910525,81.693521,81.453548,66.712376
1,1,Figueroa High School,Government,2949,1884411,639,68.698542,69.077993,81.654798,82.807731,67.650051


In [10]:
# Keep only the school name and the three pass-rate columns.
passing_df = summary_df.loc[
    :,
    [
        "School name",
        "Passing Maths (%)",
        "Passing Reading (%)",
        "Overall Passing (%)",
    ],
]
passing_df.head(n=2)

Unnamed: 0,School name,Passing Maths (%),Passing Reading (%),Overall Passing (%)
0,Huang High School,81.693521,81.453548,66.712376
1,Figueroa High School,81.654798,82.807731,67.650051


In [11]:
# Give the key column the same name the school tables use ("school_name") so
# the two can be merged on it. rename() returns a new DataFrame, so neither an
# explicit copy() nor inplace=True is needed, and re-running the cell is safe.
passing_df = passing_df.rename(columns={"School name": "school_name"})
passing_df.head(2)

Unnamed: 0,school_name,Passing Maths (%),Passing Reading (%),Overall Passing (%)
0,Huang High School,81.693521,81.453548,66.712376
1,Figueroa High School,81.654798,82.807731,67.650051


In [12]:
# Attach each school's size bracket to its pass-rate row.
# validate="many_to_one" makes pandas raise a MergeError if size_df contains
# the same school_name more than once; without it such a duplicate would
# silently repeat that school's row and distort the per-bracket means.
school_size2_df = pd.merge(
    passing_df,
    size_df,
    on="school_name",
    how="left",
    validate="many_to_one",
)
school_size2_df.head(2)

Unnamed: 0,school_name,Passing Maths (%),Passing Reading (%),Overall Passing (%),Size Ranges
0,Huang High School,81.693521,81.453548,66.712376,Large (2000-5000)
1,Figueroa High School,81.654798,82.807731,67.650051,Large (2000-5000)


In [14]:
# Group the per-school rows by size bracket once; every metric below is the
# plain mean of the school-level percentages inside a bracket.
schools_by_size = school_size2_df.groupby(["Size Ranges"], observed=False)
size_passing_maths = schools_by_size["Passing Maths (%)"].mean()
size_passing_reading = schools_by_size["Passing Reading (%)"].mean()
size_passing_overall = schools_by_size["Overall Passing (%)"].mean()

# Place the three series side by side; `keys` supplies the column headings.
size_2 = pd.concat(
    [size_passing_maths, size_passing_reading, size_passing_overall],
    axis=1,
    keys=["% passing maths", "% passing reading", "% overall passing"],
)

size_2

Unnamed: 0_level_0,% passing maths,% passing reading,% overall passing
Size Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Small (<1000),90.806867,87.557513,79.066348
Medium (1000-2000),89.84656,86.714149,78.039785
Large (2000-5000),84.252804,83.301185,70.293507


In [15]:
# Both tables are indexed by "Size Ranges", so merging on that index-level name
# lines up the score averages with the pass rates bracket by bracket.
size_summary = size_1.merge(size_2, how="outer", on="Size Ranges")
size_summary

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% passing maths,% passing reading,% overall passing
Size Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),72.240461,71.62779,90.806867,87.557513,79.066348
Medium (1000-2000),71.441798,70.724595,89.84656,86.714149,78.039785
Large (2000-5000),69.92645,69.685088,84.252804,83.301185,70.293507


In [16]:
# Write the combined table to disk. The index is written as well because it
# holds the size-bracket labels.
size_summary.to_csv(
    path_or_buf="C:/Users/lhs70/OneDrive/Desktop/wk4_resource/to_submit/size_summary.csv",
    header=True,
    index=True,
    encoding="utf-8",
)