In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Locations of the two raw CSV files (absolute paths on the author's machine).
school_complete = Path("C:/Users/lhs70/OneDrive/Desktop/wk4_resource/schools_complete.csv")
student_complete = Path("C:/Users/lhs70/OneDrive/Desktop/wk4_resource/students_complete.csv")

# Read both files with the same reader; each one becomes its own DataFrame.
school_df, student_df = map(pd.read_csv, (school_complete, student_complete))

# Preview the first two rows of the school data.
school_df.head(n=2)

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,Government,2917,1910635
1,1,Figueroa High School,Government,2949,1884411


In [3]:
# Add a derived column: the "budget" column divided element-wise by the "size" column.
school_df["Budget per student"] = school_df["budget"].div(school_df["size"])
school_df.head(n=2)

Unnamed: 0,School ID,school_name,type,size,budget,Budget per student
0,0,Huang High School,Government,2917,1910635,655.0
1,1,Figueroa High School,Government,2949,1884411,639.0


In [4]:
# Bin edges and the display label for each gap between consecutive edges.
spending_bins = [0, 585, 630, 645, 680]
labels = ["<$585", "$585-630", "$630-645", "$645-680"]

# pd.cut uses right-closed intervals by default, so a value equal to an edge
# (e.g. exactly 585) lands in the lower bucket, and any value outside
# (0, 680] becomes NaN.
spending_ranges = pd.cut(school_df["Budget per student"], labels=labels, bins=spending_bins)

# Add the bucket column to school_df itself (written through .loc).
school_df.loc[:, "Spending Ranges"] = spending_ranges
school_df.head(n=2)

Unnamed: 0,School ID,school_name,type,size,budget,Budget per student,Spending Ranges
0,0,Huang High School,Government,2917,1910635,655.0,$645-680
1,1,Figueroa High School,Government,2949,1884411,639.0,$630-645


In [5]:
# Keep only the merge key and the spending bucket for each school row.
spending_columns = ["school_name", "Spending Ranges"]
spending_df = school_df.loc[:, spending_columns]
spending_df.head(n=2)

Unnamed: 0,school_name,Spending Ranges
0,Huang High School,$645-680
1,Figueroa High School,$630-645


In [6]:
# Preview the first two rows of the student data before merging.
student_df.head(n=2)

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score
0,0,Paul Bradley,M,9,Huang High School,96,94
1,1,Victor Smith,M,12,Huang High School,90,43


In [7]:
# Attach the school's spending bucket to every student row.
#   how="left"              keeps every student row, even when its school_name
#                           has no match in spending_df (the bucket is then NaN).
#   validate="many_to_one"  makes pandas raise MergeError if a school_name
#                           occurs more than once in spending_df. Without this
#                           check a duplicated school would silently duplicate
#                           its student rows and skew any mean computed from
#                           this frame.
school_spending_df = pd.merge(
    student_df,
    spending_df,
    on="school_name",
    how="left",
    validate="many_to_one",
)
school_spending_df.head(2)

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score,Spending Ranges
0,0,Paul Bradley,M,9,Huang High School,96,94,$645-680
1,1,Victor Smith,M,12,Huang High School,90,43,$645-680


In [8]:
# Group the student-level rows once by spending bucket. observed=False keeps
# every category of the grouping column in the result, even an empty one.
students_by_spending = school_spending_df.groupby(["Spending Ranges"], observed=False)

# Mean score per bucket. Because the grouped rows come from the student
# merge, every student row counts once towards its bucket's mean.
spending_maths_scores = students_by_spending["maths_score"].mean()
spending_reading_scores = students_by_spending["reading_score"].mean()

# Put both series side by side under their report column names.
spending_1 = pd.DataFrame(
    {
        'Average Maths Score': spending_maths_scores,
        'Average Reading Score': spending_reading_scores,
    }
)

spending_1

Unnamed: 0_level_0,Average Maths Score,Average Reading Score
Spending Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1
<$585,70.938128,70.379397
$585-630,72.173448,70.965856
$630-645,70.104045,69.95363
$645-680,68.876878,69.06416


In [13]:
# Path to the schools_summary.csv file inside the to_submit folder.
schools_summary = Path("C:/Users/lhs70/OneDrive/Desktop/wk4_resource/to_submit/schools_summary.csv")

# Load the summary table and preview its first two rows.
summary_df = pd.read_csv(filepath_or_buffer=schools_summary)
summary_df.head(n=2)

Unnamed: 0,School ID,School name,School type,Total student,Total school budget,Per student budget,Average maths score,Average reading score,Passing Maths (%),Passing Reading (%),Overall Passing (%)
0,0,Huang High School,Government,2917,1910635,655,68.935207,68.910525,81.693521,81.453548,66.712376
1,1,Figueroa High School,Government,2949,1884411,639,68.698542,69.077993,81.654798,82.807731,67.650051


In [14]:
# Keep the school name plus the three passing-rate columns from the summary.
passing_columns = ["School name", "Passing Maths (%)", "Passing Reading (%)", "Overall Passing (%)"]
passing_df = summary_df.loc[:, passing_columns]
passing_df.head(n=2)

Unnamed: 0,School name,Passing Maths (%),Passing Reading (%),Overall Passing (%)
0,Huang High School,81.693521,81.453548,66.712376
1,Figueroa High School,81.654798,82.807731,67.650051


In [22]:
# Rename the key column so it matches "school_name" in spending_df.
# rename() returns a new frame, so the summary_df selection is left untouched.
passing_df = passing_df.rename(columns={'School name': 'school_name'})
passing_df.head(n=2)

Unnamed: 0,school_name,Passing Maths (%),Passing Reading (%),Overall Passing (%)
0,Huang High School,81.693521,81.453548,66.712376
1,Figueroa High School,81.654798,82.807731,67.650051


In [23]:
# Attach the spending bucket to each row of the passing-rate table.
#   how="left"              keeps every passing_df row, even when its
#                           school_name has no match in spending_df (NaN bucket).
#   validate="many_to_one"  makes pandas raise MergeError if a school_name
#                           occurs more than once in spending_df. Without this
#                           check a duplicated school would be counted more
#                           than once in the per-bucket means taken from this
#                           frame.
school_spending2_df = pd.merge(
    passing_df,
    spending_df,
    on="school_name",
    how="left",
    validate="many_to_one",
)
school_spending2_df.head(2)

Unnamed: 0,school_name,Passing Maths (%),Passing Reading (%),Overall Passing (%),Spending Ranges
0,Huang High School,81.693521,81.453548,66.712376,$645-680
1,Figueroa High School,81.654798,82.807731,67.650051,$630-645


In [24]:
# Group the passing-rate rows once by spending bucket. observed=False keeps
# every category of the grouping column in the result, even an empty one.
passing_by_spending = school_spending2_df.groupby(["Spending Ranges"], observed=False)

# Mean of each passing-rate column per bucket. Every row of
# school_spending2_df contributes equally to its bucket's mean.
spending_passing_maths = passing_by_spending["Passing Maths (%)"].mean()
spending_passing_reading = passing_by_spending["Passing Reading (%)"].mean()
spending_passing_overall = passing_by_spending["Overall Passing (%)"].mean()

# Put the three series side by side under their report column names.
spending_2 = pd.DataFrame(
    {
        '% passing maths': spending_passing_maths,
        '% passing reading': spending_passing_reading,
        '% overall passing': spending_passing_overall,
    }
)

spending_2

Unnamed: 0_level_0,% passing maths,% passing reading,% overall passing
Spending Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<$585,88.835926,86.390517,76.721458
$585-630,91.518824,87.292423,79.876293
$630-645,84.686139,83.763585,71.004977
$645-680,81.56847,81.769716,66.756253


In [26]:
# Combine the score averages and the passing rates into one table.
# "Spending Ranges" is the index name of both frames, so the merge key
# refers to that index level; how="outer" keeps buckets present in either frame.
spending_summary = spending_1.merge(spending_2, how="outer", on="Spending Ranges")
spending_summary

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% passing maths,% passing reading,% overall passing
Spending Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,70.938128,70.379397,88.835926,86.390517,76.721458
$585-630,72.173448,70.965856,91.518824,87.292423,79.876293
$630-645,70.104045,69.95363,84.686139,83.763585,71.004977
$645-680,68.876878,69.06416,81.56847,81.769716,66.756253


In [31]:
# Write the combined table to disk, including the index column
# and the header row.
spending_summary.to_csv(
    "C:/Users/lhs70/OneDrive/Desktop/wk4_resource/to_submit/spending_summary.csv",
    header=True,
    index=True,
    encoding="utf-8",
)
