# PyCity Schools Analysis

* As a whole, schools with higher budgets, did not yield better test results. By contrast, schools with higher spending per student actually (\$645-675) underperformed compared to schools with smaller budgets (<\$585 per student).

* As a whole, smaller and medium sized schools dramatically out-performed large sized schools on passing math performances (89-91% passing vs 67%).

* As a whole, charter schools out-performed the public district schools across all metrics. However, more analysis will be required to glean if the effect is due to school practices or the fact that charter schools tend to serve smaller student populations per school. 
---

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load 
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

merge_df=pd.merge(school_data,student_data, on="school_name", how="left")
merge_df

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...,...,...,...,...,...
39165,14,Thomas High School,Charter,1635,1043130,39165,Donna Howard,F,12th,99,90
39166,14,Thomas High School,Charter,1635,1043130,39166,Dawn Bell,F,10th,95,70
39167,14,Thomas High School,Charter,1635,1043130,39167,Rebecca Tanner,F,9th,73,84
39168,14,Thomas High School,Charter,1635,1043130,39168,Desiree Kidd,F,10th,99,90


## District Summary

In [2]:
# Calculate the Totals (Schools and Students)
total_school = merge_df['school_name'].nunique()
total_students = merge_df['student_name'].count()
total_budget = school_data['budget'].sum()
print(total_school)
print(total_students)
# Calculate the Total Budget

# Calculate the Average Scores
passed_reading = merge_df.loc[merge_df["reading_score"] >= 70]
passed_math = merge_df.loc[merge_df["math_score"] >= 70]
passed_both = merge_df.loc[passed_reading.index & passed_math.index]
    # use df.index to get only the indexes of the records
    # these indices will be used to filter out records in the loc function
pct_passed_reading = (len(passed_reading)/total_students)*100
pct_passed_math = (len(passed_math)/total_students)*100
pct_passed_both = (len(passed_both)/total_students)*100
    # this is more appropriate for analytical purposes
avg_reading_math_passing_rate = (pct_passed_reading + pct_passed_math) / 2
    # this is what the original assignment calls for (the avg of both reading and math passing rates)
print(f'% Passed Reading: {pct_passed_reading}')
print(f'% Passed Math: {pct_passed_math}')
print(f'% Passed Both: {pct_passed_both}')
    # displays the % of students who passed both reading and math (more precise for analytical purposes)
print(f'% Average of Reading and Math Passing Rates: {avg_reading_math_passing_rate}')
#Average Scores
avg_math = merge_df['math_score'].mean()
avg_reading = merge_df['reading_score'].mean()
district_results = [{"Total Schools": total_school, "Total Students": total_students, "Total Budget": total_budget, "Average Math Score":  round(avg_math,2), 
"Average Reading Score":  round(avg_reading,2), "% Passing Math": round(pct_passed_math,2),"% Passing Reading": round(pct_passed_reading,2),
"% Overall Passing Rate": round(pct_passed_both,2)}]

district_summary = pd.DataFrame(district_results)

district_summary["% Passing Math"] = district_summary["% Passing Math"].map("{:,.2f}%".format)
district_summary["% Passing Reading"] = district_summary["% Passing Reading"].map("{:,.2f}%".format)
district_summary["% Overall Passing Rate"] = district_summary["% Overall Passing Rate"].map("{:,.2f}%".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,.2f}".format)
district_summary["Total Students"] = district_summary["Total Students"].map("{:,}".format)

district_summary

15
39170
% Passed Reading: 85.80546336482001
% Passed Math: 74.9808526933878
% Passed Both: 65.17232575950983
% Average of Reading and Math Passing Rates: 80.39315802910392


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,"$24,649,428.00",78.99,81.88,74.98%,85.81%,65.17%


## School Summary

In [3]:
#school_type = merge_df['type']
school_type = school_data.groupby(['school_name'])['type']
#school_type = school_type["type"]
#school_type = merge_df[["school_name","type"]].drop_duplicates()
#school_type

#typeish = merge_df.iloc[:,2]

#school_type1 = ["type"]
total_students = merge_df['school_name'].value_counts()
#print(school_type)
school_group = merge_df.groupby(['school_name'])
schools = school_group.size()
schools
# Calculate the total school budget and per capita spending
school_budget = merge_df.groupby(['school_name']).mean()['budget']
school_budget
per_capita = school_budget / total_students
per_capita
# Calculate the average test scores
math_per = merge_df.groupby(['school_name']).mean()['math_score']
math_per
reading_per = merge_df.groupby(['school_name']).mean()['reading_score']
reading_per
# Calculate the passing scores by creating a filtered data frame
perschool_math = merge_df[(merge_df["math_score"] > 70)]
perschool_reading = merge_df[(merge_df["reading_score"] > 70)]
# Convert to data frame
perschool_reading = perschool_reading.groupby(['school_name']).count()['student_name']/(total_students)*100
perschool_math = perschool_math.groupby(['school_name']).count()['student_name']/(total_students)*100
perschool_both = (perschool_math + perschool_reading)/2
#per_passed_both = ((perschool_both) / 2)*100
# Minor data munging
#"School Type": school_type,


capita_results = {"School Type": school_type,
                   "Total Students": total_students, 
                   "Total School Budget": school_budget, 
                   "Per Student Budget": per_capita,
                   "Average Math Score":  round(math_per,2), 
                    "Average Reading Score":  round(reading_per,2), 
                   "% Passing Math": round(perschool_math,2),
                   "% Passing Reading": round(perschool_reading,2),
                    "% Overall Passing Rate": round(perschool_both,2)}
# Display the data frame
capita_results["Total School Budget"] = capita_results["Total School Budget"].map("${:,.0f}".format)
capita_results["Per Student Budget"] = capita_results["Per Student Budget"].map("${:,.0f}".format)
capita_results["% Passing Math"] = capita_results["% Passing Math"].map("{:,.2f}%".format)      
capita_results["% Passing Reading"] = capita_results["% Passing Reading"].map("{:,.2f}%".format)
capita_results["% Overall Passing Rate"] = capita_results["% Overall Passing Rate"].map("{:,.2f}%".format)

school_summary = pd.DataFrame(capita_results)
school_summary

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Bailey High School,"(Bailey High School, [District])",4976,"$3,124,928",$628,77.05,81.03,64.63%,79.30%,71.97%
Cabrera High School,"(Cabrera High School, [Charter])",1858,"$1,081,356",$582,83.06,83.98,89.56%,93.86%,91.71%
Figueroa High School,"(Figueroa High School, [District])",2949,"$1,884,411",$639,76.71,81.16,63.75%,78.43%,71.09%
Ford High School,"(Ford High School, [District])",2739,"$1,763,916",$644,77.1,80.75,65.75%,77.51%,71.63%
Griffin High School,"(Griffin High School, [Charter])",1468,"$917,500",$625,83.35,83.82,89.71%,93.39%,91.55%
Hernandez High School,"(Hernandez High School, [District])",4635,"$3,022,020",$652,77.29,80.93,64.75%,78.19%,71.47%
Holden High School,"(Holden High School, [Charter])",427,"$248,087",$581,83.8,83.81,90.63%,92.74%,91.69%
Huang High School,"(Huang High School, [District])",2917,"$1,910,635",$655,76.63,81.18,63.32%,78.81%,71.07%
Johnson High School,"(Johnson High School, [District])",4761,"$3,094,650",$650,77.07,80.97,63.85%,78.28%,71.07%
Pena High School,"(Pena High School, [Charter])",962,"$585,858",$609,83.84,84.04,91.68%,92.20%,91.94%


## Top Performing Schools (By Passing Rate)

In [4]:
# Sort and show top five schools
best_five = school_summary.sort_values("% Overall Passing Rate", ascending=False)
best_five.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Wilson High School,"(Wilson High School, [Charter])",2283,"$1,319,574",$578,83.27,83.99,90.93%,93.25%,92.09%
Pena High School,"(Pena High School, [Charter])",962,"$585,858",$609,83.84,84.04,91.68%,92.20%,91.94%
Wright High School,"(Wright High School, [Charter])",1800,"$1,049,400",$583,83.68,83.96,90.28%,93.44%,91.86%
Cabrera High School,"(Cabrera High School, [Charter])",1858,"$1,081,356",$582,83.06,83.98,89.56%,93.86%,91.71%
Holden High School,"(Holden High School, [Charter])",427,"$248,087",$581,83.8,83.81,90.63%,92.74%,91.69%


## Bottom Performing Schools (By Passing Rate)

In [5]:
# Sort and show bottom five schools
worst_five = school_summary.sort_values("% Overall Passing Rate", ascending=True)
worst_five.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Rodriguez High School,"(Rodriguez High School, [District])",3999,"$2,547,363",$637,76.84,80.74,64.07%,77.74%,70.91%
Huang High School,"(Huang High School, [District])",2917,"$1,910,635",$655,76.63,81.18,63.32%,78.81%,71.07%
Johnson High School,"(Johnson High School, [District])",4761,"$3,094,650",$650,77.07,80.97,63.85%,78.28%,71.07%
Figueroa High School,"(Figueroa High School, [District])",2949,"$1,884,411",$639,76.71,81.16,63.75%,78.43%,71.09%
Hernandez High School,"(Hernandez High School, [District])",4635,"$3,022,020",$652,77.29,80.93,64.75%,78.19%,71.47%


## Math Scores by Grade

In [6]:
# Create data series of scores by grade levels using conditionals
ninth_grade = merge_df['grade'] == '9th'
tenth_grade = merge_df['grade'] == '10th'
eleventh_grade = merge_df['grade'] == '11th'
twelfth_grade = merge_df['grade'] == '12th'

score_ninth = merge_df.loc[ninth_grade]
score_tenth = merge_df.loc[tenth_grade]
score_eleventh = merge_df.loc[eleventh_grade]
score_twelfth = merge_df.loc[twelfth_grade]

group_ninth = score_ninth.groupby(["school_name"])
group_tenth = score_tenth.groupby(["school_name"])
group_eleventh = score_eleventh.groupby(["school_name"])
group_twelfth = score_twelfth.groupby(["school_name"])

grading9th = group_ninth["math_score"].mean()
grading10th = group_tenth["math_score"].mean()
grading11th = group_eleventh["math_score"].mean()
grading12th = group_twelfth["math_score"].mean()

grade_value = {"9th":grading9th, "10th":grading10th,"11th":grading11th,"12th":grading12th}

score = pd.DataFrame(grade_value)
score


# Group each by school name

# Combine series into single data frame

# Minor data munging

# Display the data frame


Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


## Reading Score by Grade 

In [7]:
# Create data series of scores by grade levels using conditionals
# Create data series of scores by grade levels using conditionals
ninth_grade = merge_df['grade'] == '9th'
tenth_grade = merge_df['grade'] == '10th'
eleventh_grade = merge_df['grade'] == '11th'
twelfth_grade = merge_df['grade'] == '12th'

score_ninth = merge_df.loc[ninth_grade]
score_tenth = merge_df.loc[tenth_grade]
score_eleventh = merge_df.loc[eleventh_grade]
score_twelfth = merge_df.loc[twelfth_grade]

group_ninth = score_ninth.groupby(["school_name"])
group_tenth = score_tenth.groupby(["school_name"])
group_eleventh = score_eleventh.groupby(["school_name"])
group_twelfth = score_twelfth.groupby(["school_name"])

grading9th = group_ninth["reading_score"].mean()
grading10th = group_tenth["reading_score"].mean()
grading11th = group_eleventh["reading_score"].mean()
grading12th = group_twelfth["reading_score"].mean()

grade_value = {"9th":grading9th, "10th":grading10th,"11th":grading11th,"12th":grading12th}
score = pd.DataFrame(grade_value)
score
# Group each by school name

# Combine series into single data frame

# Minor data munging

# Display the data frame


Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564
Pena High School,83.807273,83.612,84.335938,84.59116


## Scores by School Spending

In [4]:
# Establish the bins -- choose any set of bins you would like, but see below for testing bins
# to test, set your bins as follows: [0, 585, 615, 645, 675]
# ALSO -- Note that the values for `% Passing Math`, `% Passing Reading` and `% Overall Passing Rate`
# were computed using averages of averages -- your results may vary if you use weighted averages 
budget_bin = [0, 585, 615, 645, 675]
budget_labels = ["Less than $585","$585 - 615","$615 - 645", "$645 - 675"]
school_summary["Per Student Budget"] = pd.cut(school_summary["Per Student Budget"],budget_bin,labels = budget_labels)
spend_group = school_summary.groupby(["Per Student Budget"]).mean()

average_math = school_summary.groupby(["Per Student Budget"]).mean()["Average Math Score"]
average_reading = school_summary.groupby(["Per Student Budget"]).mean()["Average Reading Score"]
passing_math = school_summary.groupby(["Per Student Budget"]).mean()["% Passing Math"]
passing_reading = school_summary.groupby(["Per Student Budget"]).mean()["% Passing Reading"]
overall = (passing_math + passing_reading) / 2

spend_group = pd.DataFrame({"Average Math Score" : average_math,
      "Average Reading Score": average_reading,
      "% Passing Reading": passing_reading,
      "% Passing Math": passing_math,
      "Overall_Rate":overall})


spend_group
# were computed using averages of averages -- your results may vary if you use weighted averages 


TypeError: '<' not supported between instances of 'int' and 'str'

## Scores by School Size

In [5]:
# Establish the bins 
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]
# Categorize the spending based on the bins
merge_df['size_bins'] = pd.cut(merge_df['size'], size_bins, labels = group_names)
by_size = merge_df.groupby('size_bins')
# Calculate the scores based on bins
avg_math = by_size['math_score'].mean()
avg_read = by_size['math_score'].mean()
pass_math = merge_df[merge_df['math_score'] >= 70].groupby('size_bins')['Student ID'].count()/by_size['Student ID'].count()
pass_read = merge_df[merge_df['reading_score'] >= 70].groupby('size_bins')['Student ID'].count()/by_size['Student ID'].count()
overall = merge_df[(merge_df['reading_score'] >= 70) & (merge_df['math_score'] >= 70)].groupby('size_bins')['Student ID'].count()/by_size['Student ID'].count()
# Assemble into data frame
scores_by_size = pd.DataFrame({
    "Average Math Score": avg_math,
    "Average Reading Score": avg_read,
    '% Passing Math': pass_math,
    '% Passing Reading': pass_read,
    "Overall Passing Rate": overall
})
# Minor data munging
scores_by_size = scores_by_size[[
    "Average Math Score",
    "Average Reading Score",
    '% Passing Math',
    '% Passing Reading',
    "Overall Passing Rate"
]]    
scores_by_size.index.name = "Total Students"
scores_by_size = scores_by_size.reindex(group_names)
# Display results
scores_by_size

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Total Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.828654,83.828654,0.939525,0.960403,0.901368
Medium (1000-2000),83.372682,83.372682,0.936165,0.967731,0.906243
Large (2000-5000),77.477597,77.477597,0.686524,0.821252,0.56574


## Scores by School Type

In [8]:
# Type | Average Math Score | Average Reading Score | % Passing Math | % Passing Reading | % Overall Passing Rate

# Type | Average Math Score | Average Reading Score | % Passing Math | % Passing Reading | % Overall Passing Rate

average_math = school_summary.groupby(["School Type"]).mean()["Average Math Score"]
average_reading = school_summary.groupby(["School Type"]).mean()["Average Reading Score"]
passing_math = school_summary.groupby(["School Type"]).mean()["% Passing Math"]
passing_reading = school_summary.groupby(["School Type"]).mean()["% Passing Reading"]
overall = (passing_math + passing_reading) / 2

type_summary = pd.DataFrame({"Average Math Score":average_math,
      "Average Reading Score": average_reading,
      "% Passing Reading": passing_reading,
      "% Passing Math": passing_math,
      "Overall_Rate": overall})

type_summary = type_summary[["Average Math Score", 
                             "Average Reading Score",
                             "% Passing Reading",
                             "% Passing Math",
                             "% Overall Rate"]]
type_summary




TypeError: 'Series' objects are mutable, thus they cannot be hashed