### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [404]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [405]:
# Check df for columns and data 
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [406]:
# Calcuate the number total number of schools
total_schools = len(school_data_complete['School ID'].unique())
# total_schools

# calcuate the number of students
student_count = len(school_data_complete["Student ID"].unique())
# student_count

# Calculate total Budget
total_budget = school_data_complete['budget'].sum()
# total_budget

# Calculate average math score
avg_math_score = school_data_complete['math_score'].mean()
# avg_math_score
# Calcuate average reading score
avg_read_score = school_data_complete['reading_score'].mean()
# avg_read_score



In [407]:
# count math score greater than or equal to 70
math_pass_count = len(school_data_complete[school_data_complete.math_score >=70])
# math_pass_count

# percent of students with math pass score
math_pass_percent = (math_pass_count/student_count)*100

# math_pass_percent


In [408]:
# count math score greater than or equal to 70
read_pass_count = len(school_data_complete[school_data_complete.reading_score >=70])
# read_pass_count

# percent of students with math pass score
read_pass_percent = (read_pass_count/student_count)*100
# read_pass_percent

# Overall pass rate in both reading and math percentage
overall_pass_rate = (math_pass_percent + read_pass_percent)/2

# overall_pass_rate


In [409]:
#create dataframe summary
district_sum = pd.DataFrame({
    'Number of Schools':[total_schools],
    'Number of Students':[student_count],
    'Total Budget $':[total_budget],
    'Avg Math Score':[avg_math_score],
    'Avg read Score':[avg_read_score],
    'Math Pass %':[math_pass_percent],
    "Read Pass %":[read_pass_percent],
    "Overall pass %":[overall_pass_rate],    
})
district_sum

Unnamed: 0,Number of Schools,Number of Students,Total Budget $,Avg Math Score,Avg read Score,Math Pass %,Read Pass %,Overall pass %
0,15,39170,82932329558,78.985371,81.87784,74.980853,85.805463,80.393158


## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [410]:
# Create new df for school summary
school_df = school_data_complete.copy()

# add per student budget to df
school_df['Per Student Budget'] = school_df['budget']/school_df['size']

# work out the average pass marks for each school and assign avg column names
avg_pass_marks = school_df.groupby(['school_name'])['math_score', 'reading_score'].mean().reset_index()
avg_pass_marks.rename({'reading_score': 'Avg Reading Score',
                      'math_score':'Avg Math Score' }, axis=1, inplace=True)
# avg_pass_marks

# math pass marks
math_pass_sum = school_df[school_df['math_score']>=70]
# math_pass_sum

# reading pass marks
read_pass_sum = school_df[school_df['reading_score']>=70]
# read_pass_sum

school_df

  


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,Per Student Budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,655.0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,655.0
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,655.0
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,655.0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,655.0
...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130,638.0
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130,638.0
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130,638.0
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130,638.0


In [411]:
# count of math pass greater than 70
math_count_sum = math_pass_sum.groupby(['school_name'])['math_score'].count().reset_index()
math_count_sum.rename({'math_score': 'Math Pass Count'}, axis=1, inplace=True)
# math_count_sum

# count of reading pass greater than 70
read_count_sum =  read_pass_sum.groupby(['school_name'])['reading_score'].count().reset_index()
read_count_sum.rename({'reading_score': 'Reading Pass Count'}, axis=1, inplace=True)
# read_count_sum

# Merge count tables together
count_sum = math_count_sum.merge(read_count_sum, on='school_name', how='outer')
count_sum


Unnamed: 0,school_name,Math Pass Count,Reading Pass Count
0,Bailey High School,3318,4077
1,Cabrera High School,1749,1803
2,Figueroa High School,1946,2381
3,Ford High School,1871,2172
4,Griffin High School,1371,1426
5,Hernandez High School,3094,3748
6,Holden High School,395,411
7,Huang High School,1916,2372
8,Johnson High School,3145,3867
9,Pena High School,910,923


In [412]:
# Merge with school summary table to school_df
school_df = school_df.merge(avg_pass_marks, on='school_name',how='outer')
school_df = school_df.merge(count_sum, on='school_name', how='outer')
school_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Pass Count,Reading Pass Count
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,655.0,76.629414,81.182722,1916,2372
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,655.0,76.629414,81.182722,1916,2372
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,655.0,76.629414,81.182722,1916,2372
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,655.0,76.629414,81.182722,1916,2372
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,655.0,76.629414,81.182722,1916,2372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,83.418349,83.848930,1525,1591
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130,638.0,83.418349,83.848930,1525,1591
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130,638.0,83.418349,83.848930,1525,1591
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,83.418349,83.848930,1525,1591


In [413]:
# calculate pass rates for math and reading
school_df['Math Pass %'] = (school_df['Math Pass Count']/school_df['size'])*100
school_df['Reading Pass %'] = (school_df['Reading Pass Count']/school_df['size'])*100

# Remove count columns as they aren't requested in summary
del school_df['Math Pass Count']
del school_df['Reading Pass Count']

# calculate and add overall pass rate
school_df['Overall Pass %'] = (school_df['Math Pass %']+school_df['Reading Pass %'])/2
school_df



Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Pass %,Reading Pass %,Overall Pass %
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,83.418349,83.848930,93.272171,97.308869,95.290520
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130,638.0,83.418349,83.848930,93.272171,97.308869,95.290520
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130,638.0,83.418349,83.848930,93.272171,97.308869,95.290520
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,83.418349,83.848930,93.272171,97.308869,95.290520


In [414]:
# aggregate to by school
school_tbl= school_df.groupby('school_name').min().reset_index()

# remove columns that are not required
school_tbl.drop(school_tbl.columns[[1,2,3,4,5,6,7]], axis=1, inplace=True)

# Tidy up column names
school_tbl.rename({'school_name': 'School Name',
                    'type':'School Type',
                    'size': 'No. of Students',
                    'budget': 'Budget'}, axis=1, inplace=True)
school_tbl.head()


Unnamed: 0,School Name,School Type,No. of Students,Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Pass %,Reading Pass %,Overall Pass %
0,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,74.306672
1,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652
2,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852
3,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,73.804308
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668


## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

In [415]:
best_performers = school_tbl.sort_values(by=['Overall Pass %'], ascending= False)
best_performers.head()

Unnamed: 0,School Name,School Type,No. of Students,Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Pass %,Reading Pass %,Overall Pass %
1,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,95.586652
12,Thomas High School,Charter,1635,1043130,638.0,83.418349,83.84893,93.272171,97.308869,95.29052
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,95.27027
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,95.265668
13,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,95.203679


## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

In [416]:
worst_performers = school_tbl.sort_values(by=['Overall Pass %'], ascending= True)
worst_performers.head()

Unnamed: 0,School Name,School Type,No. of Students,Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Pass %,Reading Pass %,Overall Pass %
10,Rodriguez High School,District,3999,2547363,637.0,76.842711,80.744686,66.366592,80.220055,73.293323
2,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852
7,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
8,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,73.639992
3,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,73.804308


## Math Scores by Grade

* Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

In [417]:
# create averages for each grade for summary table by grade
grades = ['9th', '10th', '11th', '12th']
math_grade = {}
for grade in grades:
    read_grade[grade] = school_data_complete[school_data_complete['grade']==grade].groupby('school_name')['math_score'].mean()
math_grade = pd.DataFrame(math_grade)
math_grade = math_grade.round(2)

math_grade


## Reading Score by Grade 

* Perform the same operations as above for math scores

In [418]:
# create averages for each grade for summary table by grade
grades = ['9th', '10th', '11th', '12th']
read_grade = {}
for grade in grades:
    read_grade[grade] = school_data_complete[school_data_complete['grade']==grade].groupby('school_name')['math_score'].mean()
read_grade = pd.DataFrame(read_grade)
read_grade = read_grade.round(2)

read_grade

Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86
Pena High School,83.63,83.37,84.33,84.12


## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [419]:
# Put budget var into df, assign column name and put in decending order
per_student_spend= school_tbl.sort_values(by=["Per Student Budget"], ascending=False)
per_student_spend.head()


Unnamed: 0,School Name,School Type,No. of Students,Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Pass %,Reading Pass %,Overall Pass %
7,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171
5,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,73.807983
8,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,73.639992
3,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,73.804308
2,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852


In [420]:
# Create spending bins
spending_bins = [0, 585, 630, 645, 680]
group_names = ["<$585", "$585-630", "$630-645", "$645-680"]
per_student_spend["Spend Range"] = pd.cut(per_student_spend["Per Student Budget"], spending_bins, labels=group_names)

per_student_spend.head()

Unnamed: 0,School Name,School Type,No. of Students,Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Pass %,Reading Pass %,Overall Pass %,Spend Range
7,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,73.500171,$645-680
5,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,73.807983,$645-680
8,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,73.639992,$645-680
3,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,73.804308,$630-645
2,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,73.363852,$630-645


In [427]:
# Group data by spending ranges
by_spend = per_student_spend.groupby("Spend Range")
by_spend = pd.DataFrame(by_spend)
by_spend


Unnamed: 0,0,1
0,<$585,School Name School Type No. of St...
1,$585-630,School Name School Type No. of St...
2,$630-645,School Name School Type No. of ...
3,$645-680,School Name School Type No. of S...


## Scores by School Size

* Perform the same operations as above, based on school size.

## Scores by School Type

* Perform the same operations as above, based on school type