### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [430]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [431]:
# Check df for columns and data 
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [432]:
# Calcuate the number total number of schools
total_schools = len(school_data_complete['School ID'].unique())
# total_schools

# calcuate the number of students
student_count = len(school_data_complete["Student ID"].unique())
# student_count

# Calculate total Budget
total_budget = school_data_complete['budget'].sum()
# total_budget

# Calculate average math score
avg_math_score = school_data_complete['math_score'].mean()
# avg_math_score
# Calcuate average reading score
avg_read_score = school_data_complete['reading_score'].mean()
# avg_read_score



In [433]:
# count math score greater than or equal to 70
math_pass_count = len(school_data_complete[school_data_complete.math_score >=70])
# math_pass_count

# percent of students with math pass score
math_pass_percent = (math_pass_count/student_count)*100

# math_pass_percent


In [434]:
# count math score greater than or equal to 70
read_pass_count = len(school_data_complete[school_data_complete.reading_score >=70])
# read_pass_count

# percent of students with math pass score
read_pass_percent = (read_pass_count/student_count)*100
# read_pass_percent

# Overall pass rate in both reading and math percentage
overall_pass_rate = (math_pass_percent + read_pass_percent)/2

# overall_pass_rate


In [435]:
#create dataframe summary
district_sum = pd.DataFrame({
    'number of schools':[total_schools],
    'number of students':[student_count],
    'total budget':[total_budget],
    'avg math score':[avg_math_score],
    'avg read score':[avg_read_score],
    'math pass percent':[math_pass_percent],
    "read pass percent":[read_pass_percent],
    "overall pass rate":[overall_pass_rate],    
})
district_sum

Unnamed: 0,number of schools,number of students,total budget,avg math score,avg read score,math pass percent,read pass percent,overall pass rate
0,15,39170,82932329558,78.985371,81.87784,74.980853,85.805463,80.393158


## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [436]:
# Create new df for school summary
school_df = school_data_complete.copy()

# # create table for type and budget
# school_tbl= school_df.groupby('school_name').min().reset_index()

# # # remove columns that are not required
# school_tbl.drop(school_tbl.columns[[1,2,3,4,5,6,7]], axis=1, inplace=True)
# school_tbl

# add per student budget to df
school_df['Per Student Budget'] = school_df['budget']/school_df['size']

# work out the average pass marks for each school
avg_pass_marks = school_df.groupby(['school_name'])['math_score', 'reading_score'].mean().reset_index()
# avg_pass_marks

# math pass marks
math_pass_sum = school_df[school_df['math_score']>=70]
# math_pass_sum

# reading pass marks
read_pass_sum = school_df[school_df['reading_score']>=70]
# read_pass_sum


school_df

  from ipykernel import kernelapp as app


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,Per Student Budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,655.0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,655.0
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,655.0
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,655.0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,655.0
...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130,638.0
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130,638.0
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130,638.0
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130,638.0


In [437]:
# count of math pass greater than 70
math_count_sum = math_pass_sum.groupby(['school_name'])['math_score'].count().reset_index()
math_count_sum.rename({'math_score': 'Math Pass Count'}, axis=1, inplace=True)
# math_count_sum

# count of reading pass greater than 70
read_count_sum =  read_pass_sum.groupby(['school_name'])['reading_score'].count().reset_index()
read_count_sum.rename({'reading_score': 'Reading Pass Count'}, axis=1, inplace=True)
# read_count_sum

# Merge count tables together
count_sum = math_count_sum.merge(read_count_sum, on='school_name', how='outer')
count_sum


Unnamed: 0,school_name,Math Pass Count,Reading Pass Count
0,Bailey High School,3318,4077
1,Cabrera High School,1749,1803
2,Figueroa High School,1946,2381
3,Ford High School,1871,2172
4,Griffin High School,1371,1426
5,Hernandez High School,3094,3748
6,Holden High School,395,411
7,Huang High School,1916,2372
8,Johnson High School,3145,3867
9,Pena High School,910,923


In [438]:

# Merge with school summary table to school_df
school_df = school_df.merge(count_sum, on='school_name', how='outer')
school_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,Per Student Budget,Math Pass Count,Reading Pass Count
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,655.0,1916,2372
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,655.0,1916,2372
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,655.0,1916,2372
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,655.0,1916,2372
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,655.0,1916,2372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,1525,1591
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130,638.0,1525,1591
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130,638.0,1525,1591
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,1525,1591


In [439]:
# # Merge with school summary table to school_df
# school_df = pd.concat([school_df, school_tbl], axis=1, join='inner')
# school_df

In [440]:
# calculate pass rates for math and reading
school_df['% Math Pass Rate'] = (school_df['Math Pass Count']/school_df['size'])*100
school_df['% Read Pass Rate'] = (school_df['Reading Pass Count']/school_df['size'])*100

# Remove count columns as they aren't requested in summary
del school_df['Math Pass Count']
del school_df['Reading Pass Count']

# calculate and add overall pass rate
school_df['Overall Pass Rate'] = (school_df['% Math Pass Rate']+school_df['% Read Pass Rate'])/2
school_df



Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,Per Student Budget,% Math Pass Rate,% Read Pass Rate,Overall Pass Rate
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,655.0,65.683922,81.316421,73.500171
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,655.0,65.683922,81.316421,73.500171
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,655.0,65.683922,81.316421,73.500171
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,655.0,65.683922,81.316421,73.500171
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,655.0,65.683922,81.316421,73.500171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,93.272171,97.308869,95.290520
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130,638.0,93.272171,97.308869,95.290520
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130,638.0,93.272171,97.308869,95.290520
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130,638.0,93.272171,97.308869,95.290520


In [441]:
# aggregate to by school
school_tbl= school_df.groupby('school_name').min().reset_index()

# remove columns that are not required
school_tbl.drop(school_tbl.columns[[1,2,3,4,5,6,7]], axis=1, inplace=True)
school_tbl

Unnamed: 0,school_name,type,size,budget,Per Student Budget,% Math Pass Rate,% Read Pass Rate,Overall Pass Rate
0,Bailey High School,District,4976,3124928,628.0,66.680064,81.93328,74.306672
1,Cabrera High School,Charter,1858,1081356,582.0,94.133477,97.039828,95.586652
2,Figueroa High School,District,2949,1884411,639.0,65.988471,80.739234,73.363852
3,Ford High School,District,2739,1763916,644.0,68.309602,79.299014,73.804308
4,Griffin High School,Charter,1468,917500,625.0,93.392371,97.138965,95.265668
5,Hernandez High School,District,4635,3022020,652.0,66.752967,80.862999,73.807983
6,Holden High School,Charter,427,248087,581.0,92.505855,96.252927,94.379391
7,Huang High School,District,2917,1910635,655.0,65.683922,81.316421,73.500171
8,Johnson High School,District,4761,3094650,650.0,66.057551,81.222432,73.639992
9,Pena High School,Charter,962,585858,609.0,94.594595,95.945946,95.27027


## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

## Math Scores by Grade

* Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

In [442]:
# revist original data for easy reference for grade splitting
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [443]:
# create reading aveage for grades 9 through 12
grade_9 = school_data_complete[school_data_complete['grade']=='9th'].groupby('school_name')['reading_score'].mean()
grade_10 =school_data_complete[school_data_complete['grade']=='10th'].groupby('school_name')['reading_score'].mean()
grade_11 =school_data_complete[school_data_complete['grade']=='11th'].groupby('school_name')['reading_score'].mean()
grade_12 =school_data_complete[school_data_complete['grade']=='12th'].groupby('school_name')['reading_score'].mean()

# combine grades to single df
read_grade_avg = pd.concat([grade_9, grade_10, grade_11, grade_12], axis=1)
read_grade_avg.columns = ['9th Grade Reading Scores', '10th Grade Reading Scores', '11th Grade Reading Scores', '12th Grade Reading Scores']

read_grade_avg = read_grade_avg.round(2)
read_grade_avg

Unnamed: 0_level_0,9th Grade Reading Scores,10th Grade Reading Scores,11th Grade Reading Scores,12th Grade Reading Scores
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.3,80.91,80.95,80.91
Cabrera High School,83.68,84.25,83.79,84.29
Figueroa High School,81.2,81.41,80.64,81.38
Ford High School,80.63,81.26,80.4,80.66
Griffin High School,83.37,83.71,84.29,84.01
Hernandez High School,80.87,80.66,81.4,80.86
Holden High School,83.68,83.32,83.82,84.7
Huang High School,81.29,81.51,81.42,80.31
Johnson High School,81.26,80.77,80.62,81.23
Pena High School,83.81,83.61,84.34,84.59


## Reading Score by Grade 

* Perform the same operations as above for math scores

In [444]:
m_grade_9 = school_data_complete[school_data_complete['grade']=='9th'].groupby('school_name')['math_score'].mean()
m_grade_10 =school_data_complete[school_data_complete['grade']=='10th'].groupby('school_name')['math_score'].mean()
m_grade_11 =school_data_complete[school_data_complete['grade']=='11th'].groupby('school_name')['math_score'].mean()
m_grade_12 =school_data_complete[school_data_complete['grade']=='12th'].groupby('school_name')['math_score'].mean()

# combine grades to single df
math_grade_avg = pd.concat([m_grade_9, m_grade_10, m_grade_11, m_grade_12], axis=1)
math_grade_avg.columns = ['9th Grade Math Scores', '10th Grade Math Scores', '11th Grade Math Scores', '12th Grade Math Scores']

math_grade_avg = math_grade_avg.round(2)
math_grade_avg

Unnamed: 0_level_0,9th Grade Math Scores,10th Grade Math Scores,11th Grade Math Scores,12th Grade Math Scores
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86
Pena High School,83.63,83.37,84.33,84.12


## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [445]:
# Put budget var into df, assign column name and put in decending order
budget_per_student = budget_per_student.to_frame('$ Per Student Budget').round(2)
budget_per_student= budget_per_student.sort_values(by=["$ Per Student Budget"], ascending=False)
budget_per_student.head()

NameError: name 'budget_per_student' is not defined

In [None]:
# Create spending bins
spending_bins = [0, 585, 630, 645, 680]
group_names = ["<$585", "$585-630", "$630-645", "$645-680"]
budget_per_student["Spending Ranges (Per Student)"] = pd.cut(budget_per_student["$ Per Student Budget"], spending_bins, labels=group_names)

budget_per_student.head()

In [None]:
# Group data by spending ranges
by_spending = budget_per_student.groupby("Spending Ranges (Per Student)")
by_spending

# Calculate average math score
school_math_score = school_data_complete["math_score"].mean()

# Calculate average reading score
school_reading_score = school_data_complete["reading_score"].mean()


## Scores by School Size

* Perform the same operations as above, based on school size.

## Scores by School Type

* Perform the same operations as above, based on school type