In [1]:
#Import Dependencies
import os
import pandas as pd

In [2]:
#Define pathways for datafiles
path_schools = os.path.join("Resources", "schools_complete.csv")
path_students = os.path.join("Resources", "students_complete.csv")

#Load in datafiles
schools_df = pd.read_csv(path_schools)
students_df = pd.read_csv(path_students)

#Merge data files bases on school
school_data_complete = pd.merge(students_df, schools_df, how="left", on=["school_name", "school_name"])
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


## Store Basic info for District

In [10]:
#Create snapshot of the district's key metrics
district_num_schools = len(school_data_complete["school_name"].unique())    #Store number of schools
district_num_students = len(school_data_complete["Student ID"].unique())  #Store number of students

district_by_school = school_data_complete.drop_duplicates(subset = ["school_name"]) #Made a DataFrame where there are not duplicates of schools (one student shown for each school)
district_total_budget = district_by_school["budget"].sum()  #Store total budget for district

# Distict Summary Stats

In [15]:
#Average scores
district_math_avg = school_data_complete["math_score"].mean()
district_reading_avg = school_data_complete["reading_score"].mean()

bins = [0, 70, 100] #Bins for failing (below 70) versus passing with max of 100
labels = ["fail", "pass"]   #Two bins: fail or pass
school_data_complete["pass/fail_reading"] = pd.cut(school_data_complete["reading_score"], bins, labels = labels, include_lowest=True) #Make new column for pass or failing reading
school_data_complete["pass/fail_math"] = pd.cut(school_data_complete["math_score"], bins, labels = labels, include_lowest=True)     #Make new column for pass or failing math

district_math_count = school_data_complete.groupby(["pass/fail_math"]).count()["Student ID"]    #Make series of students who fail and pass math
district_math_perc = district_math_count[1]/(district_math_count.sum())*100     #Calculate passing rate for math

district_reading_count = school_data_complete.groupby(["pass/fail_reading"]).count()["Student ID"]    #Make series of students who fail and pass reading
district_reading_perc = district_reading_count[1]/(district_reading_count.sum())*100    #Calculate passing rate for reading

#Calculate how mamy students passed reading AND math
district_pass_both = school_data_complete.loc[(school_data_complete["pass/fail_math"] == "pass") & (school_data_complete["pass/fail_reading"] == "pass"), ["student_name", "pass/fail_math", "pass/fail_reading"]]
district_pass_both_count = district_pass_both["student_name"].count()
district_pass_both_perc = district_pass_both_count/(school_data_complete["student_name"].count())*100


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,pass/fail_reading,pass/fail_math
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,fail,pass
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,pass,fail
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,pass,fail
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,fail,fail
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,pass,pass


In [16]:

#Create a Data Frame to Hold District Information
district = {
    "Total Schools": [district_num_schools],
    "Total Students": [district_num_students],
    "Total Budget": [district_total_budget],
    "Average Math Score": [district_math_avg],
    "Average Reading Score": [district_reading_avg],
    "% Passing Math": [district_math_perc],
    "% Passing Reading": [district_reading_perc],
    "% Overall Passing": [district_pass_both_perc]
}

pd.DataFrame(district)

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,72.392137,82.971662,60.801634


# School's Summaries

In [5]:
#School Basic info

gby_schools = school_data_complete.groupby(["school_name"]) #Create Groupby object organized by the school

district_by_school_sorted = district_by_school.sort_values("school_name")

school_names = (district_by_school_sorted["school_name"]).values.tolist() #Make a list of school names
school_types = (district_by_school_sorted["type"]).values.tolist()   #make a list of school types that have the same indicies as the lit of school names
school_num_students = (gby_schools["student_name"].count()).values.tolist() #Make a list of the total the number of students for each school
school_budgets = (district_by_school_sorted["budget"]).values.tolist()     #make a list of budgets that have the same indicies as the list of schoosl


In [6]:
#Calculations for per student budget
school_budget_per_student = []
for i in range(district_num_schools):  #Loop through the number of schools
    school_budget_per_student.append((school_budgets[i])/(school_num_students[i]))  #Calculate the budget per student and add to list 


In [19]:
#Calculations for scores 
avg_math_scores = (gby_schools["math_score"].mean()).values.tolist()
avg_read_scores = (gby_schools["reading_score"].mean()).values.tolist()

#############################################################################

#Create df for kids who only passed math. Include columns for school_name and Student ID
passing_math_df = school_data_complete.loc[school_data_complete["pass/fail_math"] == "pass", ["school_name", "Student ID"]]
schools_passing_math_counts = (passing_math_df.groupby("school_name")["Student ID"].count()).values.tolist()    #Groupby school

#Loop through list of school and make a list with the percet of students who passed math
schools_pass_math_perc = []
for i in range(district_num_schools):
    schools_pass_math_perc.append((schools_passing_math_counts[i]/school_num_students[i])*100)

#############################################################################

#Create df for kids who only passed reading. Include columns for school_name and Student ID
passing_reading_df = school_data_complete.loc[school_data_complete["pass/fail_reading"] == "pass", ["school_name", "Student ID"]]
schools_passing_reading_counts = (passing_reading_df.groupby("school_name")["Student ID"].count()).values.tolist()

#Loop through list of school and make a list with the percet of students who passed reading
schools_pass_reading_perc = []
for i in range(district_num_schools):
    schools_pass_reading_perc.append((schools_passing_reading_counts[i]/school_num_students[i])*100)

In [24]:
#Percent of kids who passed both reading AND math at each school
#Create df of kids who passed both reading AND math
schools_pass_df = school_data_complete.loc[(school_data_complete["pass/fail_math"] == "pass") & (school_data_complete["pass/fail_reading"] == "pass"), ["school_name", "Student ID"]]
schools_pass_counts = schools_pass_df.groupby("school_name")["Student ID"].count()  #Groupby school and count number of ids

#Loop through schools and calculate percent at each school
schools_pass_both = []
for i in range(district_num_schools):
    schools_pass_both.append((schools_pass_counts[i])/(school_num_students[i])*100)



In [55]:
#Create Data Frame with Summaries for each school.

school_summary_df = pd.DataFrame({
    "School Name": school_names,
    "School Type": school_types,
    "Total Students": school_num_students,
    "Total budget": school_budgets,
    "Per Student Budget": school_budget_per_student,
    "Average Math Score": avg_math_scores,
    "Average Reading Score": avg_read_scores,
    "% passing math": schools_pass_math_perc,
    "% passing reading": schools_pass_reading_perc,
    "% overall passing": schools_pass_both
})

school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total budget,Per Student Budget,Average Math Score,Average Reading Score,% passing math,% passing reading,% overall passing
0,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,64.630225,79.300643,51.145498
1,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,89.558665,93.86437,84.01507
2,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,63.750424,78.433367,49.915226
3,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,65.753925,77.51004,51.296093
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,89.713896,93.392371,83.651226
5,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,64.746494,78.187702,50.161812
6,Holden High School,Charter,427,248087,581.0,83.803279,83.814988,90.632319,92.740047,84.074941
7,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,63.318478,78.81385,49.914296
8,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,63.852132,78.281874,49.800462
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,91.683992,92.203742,84.823285


# Highest Performing Schools by % Overall Passing

In [None]:
#Sort school summary df by % overall passing
highest_performing_df = school_summary_df.sort_values("% overall passing", ascending= False)

#Cut the highest performing df to the top 5
highest_performing_df = highest_performing_df.iloc[:5, :]

highest_performing_df

# Lowest Performing Schools by % Overall Passing

In [None]:
#Sort school summary df by % overall passing
lowest_performing_df = school_summary_df.sort_values("% overall passing")

#Cut the lowest performing df to the top 5
lowest_performing_df = lowest_performing_df.iloc[:5, :]

lowest_performing_df

# Math Scores by Grade

In [None]:
math_df = school_data_complete.groupby(["school_name", "grade"])[["math_score"]].mean()

math_df


# Reading Scores by Grade

In [None]:
reading_df = school_data_complete.groupby(["school_name", "grade"])[["reading_score"]].mean()

reading_df


# Scores by Schools spending    

In [70]:
bins_spending = [0, 585, 630, 645, 680 ] #Bins with $20 range
labels_spending = ["<585", "585-630", "630-645", "645-680"]   #Four bins 
spending_classifications = pd.cut(school_summary_df["Per Student Budget"], bins_spending, labels = labels_spending, include_lowest=True) #Make list

school_summary_spending_df = school_summary_df
school_summary_spending_df["Spending Classifications"] = spending_classifications
spending_df = school_summary_df.groupby("Spending Classifications")[["Average Math Score", "Average Reading Score", "% passing math", "% passing reading", "% overall passing"]].mean()
spending_df


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% passing math,% passing reading,% overall passing
Spending Classifications,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<585,83.455399,83.933814,90.350436,93.325838,84.35569
585-630,81.899826,83.155286,83.980055,89.378647,75.702844
630-645,78.518855,81.624473,70.946108,81.648261,58.732506
645-680,76.99721,81.027843,63.972368,78.427809,49.958857


# Scores by School Size

In [72]:
bins_size = [0, 1000, 2000, 5000] #Bins for school size (pop)
labels_size = ["Small (<1000)", "Medium (1000-2000", "Large (2000-3000"]   #Three bins 
school_size_classifications = pd.cut(school_summary_df["Total Students"], bins_size, labels = labels_size, include_lowest=True) #Make list

school_summary_size_df = school_summary_df
school_summary_size_df["Scores by School Size"] = school_size_classifications
scorebysize_df = school_summary_size_df.groupby("Scores by School Size")[["Average Math Score", "Average Reading Score", "% passing math", "% passing reading", "% overall passing"]].mean()
scorebysize_df

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% passing math,% passing reading,% overall passing
Scores by School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.821598,83.929843,91.158155,92.471895,84.449113
Medium (1000-2000,83.374684,83.864438,89.931303,93.244843,83.916691
Large (2000-3000,77.746417,81.344493,67.631335,80.1908,54.569881


# Scores by School Type 

In [73]:
scoresbytype_df = school_summary_df.groupby("School Type")[["Average Math Score", "Average Reading Score", "% passing math", "% passing reading", "% overall passing"]].mean()
scoresbytype_df

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% passing math,% passing reading,% overall passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,90.363226,93.052812,84.171248
District,76.956733,80.966636,64.302528,78.324559,50.238678
