In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# File to Load (Remember to Change These)
school_data_to_load = Path("Resources/schools_complete.csv")
student_data_to_load = Path("Resources/students_complete.csv")

In [3]:
# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load, header=0)
student_data = pd.read_csv(student_data_to_load, header=0)
school_data.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [4]:
# Combine the data into a single dataset.
school_data_complete = pd.merge(student_data, school_data, how="left", on="school_name")
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [5]:
# Part 1: District Summary Calculations
# Calculate the total number of unique schools
school_count = school_data_complete["school_name"].nunique()
print(f"There are {school_count} schools in this district.")

There are 15 schools in this district.


In [6]:
# Calculate the total number of students
student_count = school_data_complete["Student ID"].nunique()
print(f"The district has {student_count} total students enrolled.")

The district has 39170 total students enrolled.


In [7]:
# Calculate the total budget
total_budget = school_data["budget"].sum()
print (f"The total district budget is ${total_budget:,.2f}")

The total district budget is $24,649,428.00


In [8]:
# Average math score
avg_math = school_data_complete["math_score"].mean()
print (f"Students in this district have an average math score of {avg_math:.2f}")

Students in this district have an average math score of 78.99


In [9]:
# Average reading score
avg_reading = school_data_complete["reading_score"].mean()
print (f"Students in this district have an average reading score of {avg_reading:.2f}")

Students in this district have an average reading score of 81.88


In [10]:
# Percentage of students passing math
passing_math = school_data_complete["math_score"] >= 60
passing_math_percent = (passing_math.value_counts()/student_count)*100

passing_math_true = passing_math_percent[True]
print (f"{passing_math_true:.2f}% of students are passing math.")

92.45% of students are passing math.


In [11]:
# Percentage of students passing reading
passing_reading = school_data_complete["reading_score"] >= 60
passing_reading_percent = (passing_reading.value_counts()/student_count)*100

passing_reading_true = passing_reading_percent[True]
print (f"{passing_reading_true:.2f}% of students are passing reading.")

100.00% of students are passing reading.


In [12]:
# Percentage of students passing BOTH math and reading 
passing_both = school_data_complete[(school_data_complete["math_score"] >= 60) & 
                                    (school_data_complete["reading_score"] >= 60)]

percentate_passing_both = (len(passing_both)/len(school_data_complete))*100
print(f"{percentate_passing_both:.2f}% of students are passing both reading and math.")

92.45% of students are passing both reading and math.


In [13]:
# Create a data frame that summarizes the school district values calculated above
# First, create a dictionary of lists
district_summary = {"Total Schools": [school_count],
                 "Total Students": [student_count],
                 "Total Budget": [total_budget],
                 "Average Math Score": [avg_math],
                 "Average Reading Score": [avg_reading],
                 "Percent Passing Math": [passing_math_true],
                 "Percent Passing Reading": [passing_reading_true],
                 "Passing Grade Overall": [percentate_passing_both]}
# Then, call the dictionary as a data frame
district_summary_df = pd.DataFrame(district_summary)
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,Percent Passing Math,Percent Passing Reading,Passing Grade Overall
0,15,39170,24649428,78.985371,81.87784,92.445749,100.0,92.445749


In [14]:
# Part 2: School Summary Calculations

# Create a dataframe that shows the school name and type 
school_name = school_data_complete["school_name"].unique().tolist()
school_type = school_data_complete["type"]

school_summary_df = school_data_complete[["school_name", "type"]]

school_summary_df = school_data_complete.groupby("school_name")["type"].unique().reset_index()
school_summary_df.set_index("school_name", inplace=True)
school_summary_df

Unnamed: 0_level_0,type
school_name,Unnamed: 1_level_1
Bailey High School,[District]
Cabrera High School,[Charter]
Figueroa High School,[District]
Ford High School,[District]
Griffin High School,[Charter]
Hernandez High School,[District]
Holden High School,[Charter]
Huang High School,[District]
Johnson High School,[District]
Pena High School,[Charter]


In [15]:
# Calculate the number of students per school
# Group the data in school data complete by school name and add it to a new variable school_groupby
school_groupby = school_data_complete.groupby(["school_name"])

#Calculate the total number of students per school and add that to a new variable school_counts
school_counts = school_groupby["school_name"].size().tolist()

# add the student student count to the school summary dataframe
school_summary_df["Student Count"] = school_counts
school_summary_df


Unnamed: 0_level_0,type,Student Count
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bailey High School,[District],4976
Cabrera High School,[Charter],1858
Figueroa High School,[District],2949
Ford High School,[District],2739
Griffin High School,[Charter],1468
Hernandez High School,[District],4635
Holden High School,[Charter],427
Huang High School,[District],2917
Johnson High School,[District],4761
Pena High School,[Charter],962


In [16]:
# Caluculate the budget per school
# Use the .mean function to prevent the column from adding together. Print to a list. 
school_budget = school_groupby["budget"].mean().tolist()

# add to the school summary dataframe with the school name as the index
school_summary_df["School Budget"] = school_budget
school_summary_df["School Budget"] = school_summary_df["School Budget"]  
school_summary_df

Unnamed: 0_level_0,type,Student Count,School Budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,[District],4976,3124928.0
Cabrera High School,[Charter],1858,1081356.0
Figueroa High School,[District],2949,1884411.0
Ford High School,[District],2739,1763916.0
Griffin High School,[Charter],1468,917500.0
Hernandez High School,[District],4635,3022020.0
Holden High School,[Charter],427,248087.0
Huang High School,[District],2917,1910635.0
Johnson High School,[District],4761,3094650.0
Pena High School,[Charter],962,585858.0


In [17]:
# Caluclate the budget per student by dividing the school budget by the number of students
student_budget = school_summary_df["School Budget"]/school_summary_df["Student Count"]

# Add the per student budget into the school summary dataframe
school_summary_df["Per Student Budget"] = student_budget
school_summary_df

Unnamed: 0_level_0,type,Student Count,School Budget,Per Student Budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,[District],4976,3124928.0,628.0
Cabrera High School,[Charter],1858,1081356.0,582.0
Figueroa High School,[District],2949,1884411.0,639.0
Ford High School,[District],2739,1763916.0,644.0
Griffin High School,[Charter],1468,917500.0,625.0
Hernandez High School,[District],4635,3022020.0,652.0
Holden High School,[Charter],427,248087.0,581.0
Huang High School,[District],2917,1910635.0,655.0
Johnson High School,[District],4761,3094650.0,650.0
Pena High School,[Charter],962,585858.0,609.0


In [18]:
# Calculate the average math score for each school
# Sum all the math scores for each school
school_math_scores_df = school_data_complete[["school_name", "math_score", "size"]] 
school_math_group = school_math_scores_df.groupby("school_name")["math_score"].mean()

#Add the avg math score to the school summary dataframe
school_summary_df["Avg Math Score"] = school_math_group
school_summary_df
                                                     

Unnamed: 0_level_0,type,Student Count,School Budget,Per Student Budget,Avg Math Score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bailey High School,[District],4976,3124928.0,628.0,77.048432
Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895
Figueroa High School,[District],2949,1884411.0,639.0,76.711767
Ford High School,[District],2739,1763916.0,644.0,77.102592
Griffin High School,[Charter],1468,917500.0,625.0,83.351499
Hernandez High School,[District],4635,3022020.0,652.0,77.289752
Holden High School,[Charter],427,248087.0,581.0,83.803279
Huang High School,[District],2917,1910635.0,655.0,76.629414
Johnson High School,[District],4761,3094650.0,650.0,77.072464
Pena High School,[Charter],962,585858.0,609.0,83.839917


In [19]:
# Calculate the average reading score per school
# Sum all reading scores for each school
school_reading_scores_df = school_data_complete[["school_name", "reading_score"]] 
school_reading_group = school_reading_scores_df.groupby("school_name")["reading_score"].mean()

# Add the avg reading score to the school summary dataframe
school_summary_df["Avg Reading Score"] = school_reading_group
school_summary_df

Unnamed: 0_level_0,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963
Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578
Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802
Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258
Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757
Hernandez High School,[District],4635,3022020.0,652.0,77.289752,80.934412
Holden High School,[Charter],427,248087.0,581.0,83.803279,83.814988
Huang High School,[District],2917,1910635.0,655.0,76.629414,81.182722
Johnson High School,[District],4761,3094650.0,650.0,77.072464,80.966394
Pena High School,[Charter],962,585858.0,609.0,83.839917,84.044699


In [20]:
# Create a boolean value to track pass/fail for each student and add that value back into the dataframe. 
school_math_scores_df.loc[school_math_scores_df['math_score'] >= 70, 'Passing Math'] = 'Pass'
school_math_scores_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  school_math_scores_df.loc[school_math_scores_df['math_score'] >= 70, 'Passing Math'] = 'Pass'


Unnamed: 0,school_name,math_score,size,Passing Math
0,Huang High School,79,2917,Pass
1,Huang High School,61,2917,
2,Huang High School,60,2917,
3,Huang High School,58,2917,
4,Huang High School,84,2917,Pass


In [21]:
# Group by the School Name and "Passing Math" variable created in the last step to get the sum of students passing at each school.  
school_math_group = school_math_scores_df.groupby("school_name")["Passing Math"].value_counts().reset_index(name='Math Count')
school_math_group = school_math_group[['school_name', 'Math Count']]

# Merge the two dataframes together
school_summary_df = pd.merge(school_summary_df, school_math_group, on='school_name', how='left')

# Calculate the percentage of students passing math per school
school_summary_df['Percent Passing Math'] = (school_summary_df['Math Count'] / school_summary_df['Student Count']) * 100 
school_summary_df

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Count,Percent Passing Math
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,3318,66.680064
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,1749,94.133477
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,1946,65.988471
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,1871,68.309602
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,1371,93.392371
5,Hernandez High School,[District],4635,3022020.0,652.0,77.289752,80.934412,3094,66.752967
6,Holden High School,[Charter],427,248087.0,581.0,83.803279,83.814988,395,92.505855
7,Huang High School,[District],2917,1910635.0,655.0,76.629414,81.182722,1916,65.683922
8,Johnson High School,[District],4761,3094650.0,650.0,77.072464,80.966394,3145,66.057551
9,Pena High School,[Charter],962,585858.0,609.0,83.839917,84.044699,910,94.594595


In [22]:
#Caluclate the percentage of students per school passing reading

# Create a boolean value to track pass/fail for each student and add that value back into the dataframe. 
school_reading_scores_df.loc[school_reading_scores_df['reading_score'] >= 70, 'Passing Reading'] = 'Pass'
school_reading_scores_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  school_reading_scores_df.loc[school_reading_scores_df['reading_score'] >= 70, 'Passing Reading'] = 'Pass'


Unnamed: 0,school_name,reading_score,Passing Reading
0,Huang High School,66,
1,Huang High School,94,Pass
2,Huang High School,90,Pass
3,Huang High School,67,
4,Huang High School,97,Pass


In [23]:
#Group by the school name and create a new dataframe
school_reading_group = school_reading_scores_df.groupby("school_name")["Passing Reading"].value_counts().reset_index(name='Reading Count')
school_reading_group = school_reading_group[['school_name', 'Reading Count']]

# Merge the two dataframes together
school_summary_df = pd.merge(school_summary_df, school_reading_group, on='school_name', how='left')

# Calculate the percentage of students passing reading per school
school_summary_df['Percent Passing Reading'] = (school_summary_df['Reading Count'] / school_summary_df['Student Count']) * 100 
school_summary_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Count,Percent Passing Math,Reading Count,Percent Passing Reading
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,3318,66.680064,4077,81.93328
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,1749,94.133477,1803,97.039828
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,1946,65.988471,2381,80.739234
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,1871,68.309602,2172,79.299014
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,1371,93.392371,1426,97.138965


In [24]:
# Calculate the percent of students passing both reading and math at each school. 
school_name = school_data_complete["school_name"]

# Count how many studets are passing both reading AND math
passing_both_count = (school_data_complete['math_score'] >= 70) & (school_data_complete['reading_score'] >= 70)
passing_both_group = {"school_name": school_data_complete["school_name"],
                      "Passing Both": passing_both_count
                       }
passing_both_df = pd.DataFrame(passing_both_group)

# Group by school name to get the total count per school
passing_both_df = passing_both_df.groupby("school_name")["Passing Both"].sum().reset_index(name="Passing Overall")

# Merge the two dataframes together
school_summary_df = pd.merge(school_summary_df, passing_both_df, on='school_name', how='left')

# Calculate the percentage of students passing reading per school
school_summary_df['Percent Passing Overall'] = (school_summary_df['Passing Overall'] / school_summary_df['Student Count']) * 100 
school_summary_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Math Count,Percent Passing Math,Reading Count,Percent Passing Reading,Passing Overall,Percent Passing Overall
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,3318,66.680064,4077,81.93328,2719,54.642283
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,1749,94.133477,1803,97.039828,1697,91.334769
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,1946,65.988471,2381,80.739234,1569,53.204476
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,1871,68.309602,2172,79.299014,1487,54.289887
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,1371,93.392371,1426,97.138965,1330,90.599455


In [25]:
# Create a dataframe that summarizes key metrics about each school including: School name, School type
# Total students, Total school budget, Per student budget, Average math score, Average reading score,
# % passing math (the percentage of students who passed math), % passing reading (the percentage of students who passed reading)
# % overall passing (the percentage of students who passed math AND reading)

key_metrics_df = school_summary_df.drop(columns=["Math Count", "Reading Count"])
key_metrics_df

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Percent Passing Math,Percent Passing Reading,Passing Overall,Percent Passing Overall
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,2719,54.642283
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,1697,91.334769
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,1569,53.204476
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,68.309602,79.299014,1487,54.289887
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,1330,90.599455
5,Hernandez High School,[District],4635,3022020.0,652.0,77.289752,80.934412,66.752967,80.862999,2481,53.527508
6,Holden High School,[Charter],427,248087.0,581.0,83.803279,83.814988,92.505855,96.252927,381,89.227166
7,Huang High School,[District],2917,1910635.0,655.0,76.629414,81.182722,65.683922,81.316421,1561,53.513884
8,Johnson High School,[District],4761,3094650.0,650.0,77.072464,80.966394,66.057551,81.222432,2549,53.539172
9,Pena High School,[Charter],962,585858.0,609.0,83.839917,84.044699,94.594595,95.945946,871,90.540541


In [26]:
# Sort the schools by Percent Passing Overall, highest to lowest and show top 5.

top_schools_df = key_metrics_df.sort_values("Percent Passing Overall", ascending=False)
top_schools_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Percent Passing Math,Percent Passing Reading,Passing Overall,Percent Passing Overall
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,1697,91.334769
12,Thomas High School,[Charter],1635,1043130.0,638.0,83.418349,83.84893,93.272171,97.308869,1487,90.948012
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,1330,90.599455
13,Wilson High School,[Charter],2283,1319574.0,578.0,83.274201,83.989488,93.867718,96.539641,2068,90.582567
9,Pena High School,[Charter],962,585858.0,609.0,83.839917,84.044699,94.594595,95.945946,871,90.540541


In [27]:
# Sort the schools by Percent Passing Overall, lowest to highest
bottom_schools_df = key_metrics_df.sort_values("Percent Passing Overall", ascending=True)
bottom_schools_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Percent Passing Math,Percent Passing Reading,Passing Overall,Percent Passing Overall
10,Rodriguez High School,[District],3999,2547363.0,637.0,76.842711,80.744686,66.366592,80.220055,2119,52.988247
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,1569,53.204476
7,Huang High School,[District],2917,1910635.0,655.0,76.629414,81.182722,65.683922,81.316421,1561,53.513884
5,Hernandez High School,[District],4635,3022020.0,652.0,77.289752,80.934412,66.752967,80.862999,2481,53.527508
8,Johnson High School,[District],4761,3094650.0,650.0,77.072464,80.966394,66.057551,81.222432,2549,53.539172


In [28]:
# Create a DataFrame that lists the average math score grouped by school and grade level.
score_by_grade = {"School Name": school_data_complete["school_name"],
                 "Student ID": school_data_complete["Student ID"],
                 "Grade": school_data_complete["grade"],
                 "Math Score": school_data_complete["math_score"],
                 "Reading Score": school_data_complete["reading_score"]}
score_by_grade_df = pd.DataFrame(score_by_grade)

students_count_group = score_by_grade_df.groupby(["School Name", "Grade"])["Math Score"].mean().reset_index(name="Student Average")
students_count_group


Unnamed: 0,School Name,Grade,Student Average
0,Bailey High School,10th,76.996772
1,Bailey High School,11th,77.515588
2,Bailey High School,12th,76.492218
3,Bailey High School,9th,77.083676
4,Cabrera High School,10th,83.154506
5,Cabrera High School,11th,82.76556
6,Cabrera High School,12th,83.277487
7,Cabrera High School,9th,83.094697
8,Figueroa High School,10th,76.539974
9,Figueroa High School,11th,76.884344


In [29]:
# Create a DataFrame that lists the average reading score grouped by school and grade level
score_by_grade = {"School Name": school_data_complete["school_name"],
                 "Student ID": school_data_complete["Student ID"],
                 "Grade": school_data_complete["grade"],
                 "Math Score": school_data_complete["math_score"],
                 "Reading Score": school_data_complete["reading_score"]}
score_by_grade_df = pd.DataFrame(score_by_grade)

students_count_group = score_by_grade_df.groupby(["School Name", "Grade"])["Reading Score"].mean().reset_index(name="Student Average")
students_count_group

Unnamed: 0,School Name,Grade,Student Average
0,Bailey High School,10th,80.907183
1,Bailey High School,11th,80.945643
2,Bailey High School,12th,80.912451
3,Bailey High School,9th,81.303155
4,Cabrera High School,10th,84.253219
5,Cabrera High School,11th,83.788382
6,Cabrera High School,12th,84.287958
7,Cabrera High School,9th,83.676136
8,Figueroa High School,10th,81.408912
9,Figueroa High School,11th,80.640339


In [30]:
key_metrics_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Percent Passing Math,Percent Passing Reading,Passing Overall,Percent Passing Overall
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,2719,54.642283
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,1697,91.334769
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,1569,53.204476
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,68.309602,79.299014,1487,54.289887
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,1330,90.599455


In [31]:
# Create a table that breaks down school performance based on average spending ranges (per student)

spending_bins = [0, 585, 630, 645, 680]
labels = ["<$585", "$585-630", "$630-645", "$645-680"]

key_metrics_df["Spending Range Per Student"] = pd.cut(key_metrics_df["Per Student Budget"],
                                        spending_bins, labels=labels,
                                        include_lowest=True)
key_metrics_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Percent Passing Math,Percent Passing Reading,Passing Overall,Percent Passing Overall,Spending Range Per Student
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,2719,54.642283,$585-630
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,1697,91.334769,<$585
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,1569,53.204476,$630-645
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,68.309602,79.299014,1487,54.289887,$630-645
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,1330,90.599455,$585-630


In [32]:
# Caluclate the mean scores per spending range

spending_math_scores = key_metrics_df.groupby(["Spending Range Per Student"])["Avg Math Score"].mean()
spending_reading_scores = key_metrics_df.groupby(["Spending Range Per Student"])["Avg Reading Score"].mean()
spending_passing_math = key_metrics_df.groupby(["Spending Range Per Student"])["Percent Passing Math"].mean()
spending_passing_reading = key_metrics_df.groupby(["Spending Range Per Student"])["Percent Passing Reading"].mean()
overall_passing_spending = key_metrics_df.groupby(["Spending Range Per Student"])["Percent Passing Overall"].mean()


# Create a dataframe called spending_summary and include Avg math score, avg reading score, percent passing math, percent passing reading, 
# and percent passing overall. 
spending_summary_df = pd.DataFrame({"Spending Math Scores": spending_math_scores,
                                "Spending Reading Scores": spending_reading_scores,
                                "Spending Passing Math": spending_passing_math,
                                "Spending Passing Reading": spending_passing_reading,
                                "Overall Passing Spending": overall_passing_spending})
spending_summary_df

  spending_math_scores = key_metrics_df.groupby(["Spending Range Per Student"])["Avg Math Score"].mean()
  spending_reading_scores = key_metrics_df.groupby(["Spending Range Per Student"])["Avg Reading Score"].mean()
  spending_passing_math = key_metrics_df.groupby(["Spending Range Per Student"])["Percent Passing Math"].mean()
  spending_passing_reading = key_metrics_df.groupby(["Spending Range Per Student"])["Percent Passing Reading"].mean()
  overall_passing_spending = key_metrics_df.groupby(["Spending Range Per Student"])["Percent Passing Overall"].mean()


Unnamed: 0_level_0,Spending Math Scores,Spending Reading Scores,Spending Passing Math,Spending Passing Reading,Overall Passing Spending
Spending Range Per Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.455399,83.933814,93.460096,96.610877,90.369459
$585-630,81.899826,83.155286,87.133538,92.718205,81.418596
$630-645,78.518855,81.624473,73.484209,84.391793,62.857656
$645-680,76.99721,81.027843,66.164813,81.133951,53.526855


In [33]:
# Create bins with values to group school size

size_bins = [0, 1000, 2000, 5000]
labels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

key_metrics_df["School Size"] = pd.cut(key_metrics_df["Student Count"],
                                        size_bins, labels=labels,
                                        include_lowest=True)
key_metrics_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Percent Passing Math,Percent Passing Reading,Passing Overall,Percent Passing Overall,Spending Range Per Student,School Size
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,2719,54.642283,$585-630,Large (2000-5000)
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,1697,91.334769,<$585,Medium (1000-2000)
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,1569,53.204476,$630-645,Large (2000-5000)
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,68.309602,79.299014,1487,54.289887,$630-645,Large (2000-5000)
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,1330,90.599455,$585-630,Medium (1000-2000)


In [34]:
# Caluclate mean scores per size range

size_math_scores = key_metrics_df.groupby(["School Size"])["Avg Math Score"].mean()
size_reading_scores = key_metrics_df.groupby(["School Size"])["Avg Reading Score"].mean()
size_passing_math = key_metrics_df.groupby(["School Size"])["Percent Passing Math"].mean()
size_passing_reading = key_metrics_df.groupby(["School Size"])["Percent Passing Reading"].mean()
size_overall_passing = key_metrics_df.groupby(["School Size"])["Percent Passing Overall"].mean()

size_summary_df = pd.DataFrame({"Size Math Scores": size_math_scores,
                                "Size Reading Scores": size_reading_scores,
                                "Size Passing Math": size_passing_math,
                                "Size Passing Reading": size_passing_reading,
                                "Size Overall Passing": size_overall_passing})
size_summary_df

  size_math_scores = key_metrics_df.groupby(["School Size"])["Avg Math Score"].mean()
  size_reading_scores = key_metrics_df.groupby(["School Size"])["Avg Reading Score"].mean()
  size_passing_math = key_metrics_df.groupby(["School Size"])["Percent Passing Math"].mean()
  size_passing_reading = key_metrics_df.groupby(["School Size"])["Percent Passing Reading"].mean()
  size_overall_passing = key_metrics_df.groupby(["School Size"])["Percent Passing Overall"].mean()


Unnamed: 0_level_0,Size Math Scores,Size Reading Scores,Size Passing Math,Size Passing Reading,Size Overall Passing
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,93.550225,96.099437,89.883853
Medium (1000-2000),83.374684,83.864438,93.599695,96.79068,90.621535
Large (2000-5000),77.746417,81.344493,69.963361,82.766634,58.286003


In [35]:
key_metrics_df.head()

Unnamed: 0,school_name,type,Student Count,School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,Percent Passing Math,Percent Passing Reading,Passing Overall,Percent Passing Overall,Spending Range Per Student,School Size
0,Bailey High School,[District],4976,3124928.0,628.0,77.048432,81.033963,66.680064,81.93328,2719,54.642283,$585-630,Large (2000-5000)
1,Cabrera High School,[Charter],1858,1081356.0,582.0,83.061895,83.97578,94.133477,97.039828,1697,91.334769,<$585,Medium (1000-2000)
2,Figueroa High School,[District],2949,1884411.0,639.0,76.711767,81.15802,65.988471,80.739234,1569,53.204476,$630-645,Large (2000-5000)
3,Ford High School,[District],2739,1763916.0,644.0,77.102592,80.746258,68.309602,79.299014,1487,54.289887,$630-645,Large (2000-5000)
4,Griffin High School,[Charter],1468,917500.0,625.0,83.351499,83.816757,93.392371,97.138965,1330,90.599455,$585-630,Medium (1000-2000)


In [62]:
# Create a dataframe that shows school performance based on School Type (District or Charter)
data = {"type": key_metrics_df["type"],
        "Avg Math Score": key_metrics_df["Avg Math Score"],
        "Avg Reading Score": key_metrics_df["Avg Reading Score"],
        "Pct Passing Math": key_metrics_df["Percent Passing Math"],
        "Percent Passing Reading": key_metrics_df["Percent Passing Reading"],
        "Percent Passing Overall": key_metrics_df["Percent Passing Overall"]}
data_df = pd.DataFrame(data)

# Change the type column to a string
data_df["type"] = data_df["type"].astype(str)

# Group the data by type and average the results. 
avg_math = data_df.groupby("type")["Avg Math Score"].mean().reset_index()
avg_reading = data_df.groupby("type")["Avg Reading Score"].mean().reset_index()
pct_math = data_df.groupby("type")["Pct Passing Math"].mean().reset_index()
pct_reading = data_df.groupby("type")["Percent Passing Reading"].mean().reset_index()
pct_overall = data_df.groupby("type") ["Percent Passing Overall"].mean().reset_index()

# create dataframes from the groupby columns
df1 = pd.DataFrame(avg_math)
df2 = pd.DataFrame(avg_reading)
df3 = pd.DataFrame(pct_math)
df4 = pd.DataFrame(pct_reading)
df5 = pd.DataFrame(pct_overall)

merge_1_df = pd.merge(df1, df2, on="type", how="left")
merge_2_df = pd.merge(merge_1_df, df3, on="type", how="left")
merge_3_df = pd.merge(merge_2_df, df4, on="type", how="left")
type_summary_df = pd.merge(merge_3_df, df5, on="type", how="left")
type_summary_df.set_index("type")


Unnamed: 0_level_0,Avg Math Score,Avg Reading Score,Pct Passing Math,Percent Passing Reading,Percent Passing Overall
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
['Charter'],83.473852,83.896421,93.62083,96.586489,90.432244
['District'],76.956733,80.966636,66.548453,80.799062,53.672208
