In [1]:
# Dependencies
import pandas as pd
import os

In [2]:
# define the path to each file
students_file = os.path.join('Resources','students_complete.csv')
schools_file = os.path.join('Resources','schools_complete.csv')

In [3]:
# load each file into pandas DataFrames
students_df = pd.read_csv(students_file)
schools_df = pd.read_csv(schools_file)

In [4]:
# merge the two DataFrames
complete_df = pd.merge(students_df, schools_df,how = "left", on=["school_name", "school_name"])

In [5]:
# calculate the number of unique school names in the DataFrame
total_schools = len(complete_df['school_name'].unique())

In [6]:
# calculate the total number of students in the DataFrame
total_students = len(complete_df["student_name"])

In [7]:
# calculate the total budget for all schools in the district
total_budget = (complete_df["budget"].unique()).sum()

In [8]:
# calculate the average math score
avg_math_score = complete_df["math_score"].mean()

In [9]:
# calculate the average reading score
avg_reading_score = complete_df["reading_score"].mean()

In [10]:
# list all the students passing math, with a grade of 60 or higher
passing_math = complete_df['math_score'] >= 60
# convert the boolean series to type float and add it to the DataFrame
complete_df['passing math'] = passing_math.astype('float')
# calculate the percentage of students passing math
percent_passing_math = complete_df['passing math'].sum()*100/total_students

In [11]:
# list all the students passing reading, with a grade of 60 or higher
passing_reading = complete_df['reading_score'] >= 60
# convert the boolean series to type float and add it to the DataFrame
complete_df['passing reading'] = passing_reading.astype('float')
# calculate the percentage of students passing reading
percent_passing_reading = complete_df['passing reading'].sum()*100/total_students

In [12]:
# calculate the percentage of students passing both math and reading
passing_math_and_reading = passing_math & passing_reading
complete_df['passing math and reading'] = passing_math_and_reading.astype('float')
percent_overall_passing = passing_math_and_reading.sum()*100/total_students

In [13]:
# create an overview table summarizing key metrics for the entire district
district_summary = pd.DataFrame({"Total Schools": [total_schools],
                           "Total Students": [total_students],
                           "Total Budget": [total_budget],
                           "Average Math Score": [avg_math_score],
                           "Average Reading Score": [avg_reading_score],
                            "% Passing Math":[percent_passing_math],
                            "% Passing Reading":[percent_passing_reading],
                            "% Overall Passing":[percent_overall_passing]
                          })
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,92.445749,100.0,92.445749


In [14]:
# create a grouped DataFrame by school name
grouped_df = complete_df.groupby(['school_name'])

In [15]:
# create a list with all the school names
school_names = complete_df['school_name'].unique()

In [16]:
# create a list of school type by school name
school_type = grouped_df['type'].unique()

In [17]:
# calculate total students in each school
total_students_per_school = grouped_df['student_name'].count()

In [47]:
# calculate total school budget
school_budget = grouped_df['budget'].unique()

In [19]:
# calculate the budget per student
budget_per_student = school_budget/total_students_per_school

In [20]:
# calculate the average math score per school
avg_math_score_per_school = grouped_df['math_score'].mean()

In [21]:
# calculate the average reading score per school
avg_reading_score_per_school = grouped_df['reading_score'].mean()

In [22]:
# calculate the percentage of students that passed math per school
passing_math_per_school = grouped_df['passing math'].sum()
pct_passing_math_per_school = passing_math_per_school*100/total_students_per_school

In [23]:
# calculate the percentage of students that passed reading per school
passing_reading_per_school = grouped_df['passing reading'].sum()
pct_passing_reading_per_school = passing_reading_per_school*100/total_students_per_school

In [24]:
# calculate the overall percentage of students that passed both math and reading per school
passing_math_and_reading_per_school = grouped_df['passing math and reading'].sum()
pct_passing_math_and_reading_per_school = passing_math_and_reading_per_school*100/total_students_per_school

In [25]:
# create an overview table summarizing key metrics for each school
school_summary = pd.DataFrame(list(zip(school_names,school_type,total_students_per_school,
                                       school_budget, budget_per_student,
                                       avg_math_score_per_school,avg_reading_score_per_school,
                                       pct_passing_math_per_school, pct_passing_reading_per_school,
                                       pct_passing_math_and_reading_per_school
                                      )),columns = ['School Name','School Type','Total Students',
                                                   'Total School Budget','Per Student Budget',
                                                   'Average Math Score','Average Reading Score',
                                                   '% Passing Math','% Passing Reading',
                                                   '% Overall Passing'])
school_summary

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,Huang High School,[District],4976,[3124928],[628.0],77.048432,81.033963,89.529743,100.0,89.529743
1,Figueroa High School,[Charter],1858,[1081356],[582.0],83.061895,83.97578,100.0,100.0,100.0
2,Shelton High School,[District],2949,[1884411],[639.0],76.711767,81.15802,88.436758,100.0,88.436758
3,Hernandez High School,[District],2739,[1763916],[644.0],77.102592,80.746258,89.302665,100.0,89.302665
4,Griffin High School,[Charter],1468,[917500],[625.0],83.351499,83.816757,100.0,100.0,100.0
5,Wilson High School,[District],4635,[3022020],[652.0],77.289752,80.934412,89.083064,100.0,89.083064
6,Cabrera High School,[Charter],427,[248087],[581.0],83.803279,83.814988,100.0,100.0,100.0
7,Bailey High School,[District],2917,[1910635],[655.0],76.629414,81.182722,88.858416,100.0,88.858416
8,Holden High School,[District],4761,[3094650],[650.0],77.072464,80.966394,89.182945,100.0,89.182945
9,Pena High School,[Charter],962,[585858],[609.0],83.839917,84.044699,100.0,100.0,100.0


In [26]:
# sort the school summary table in descending order by % overall passing
# include avg math and reading scores in the sorting to ensure the top five have the best scores 
sorted_school_summary = school_summary.sort_values(['% Overall Passing','Average Math Score',
                                                   'Average Reading Score'],ascending = False)

In [27]:
# slice the top five rows to highlight the top 5 performing schools
top_performing_schools = sorted_school_summary.head(5)
top_performing_schools

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
9,Pena High School,[Charter],962,[585858],[609.0],83.839917,84.044699,100.0,100.0,100.0
6,Cabrera High School,[Charter],427,[248087],[581.0],83.803279,83.814988,100.0,100.0,100.0
14,Thomas High School,[Charter],1800,[1049400],[583.0],83.682222,83.955,100.0,100.0,100.0
12,Johnson High School,[Charter],1635,[1043130],[638.0],83.418349,83.84893,100.0,100.0,100.0
11,Rodriguez High School,[Charter],1761,[1056600],[600.0],83.359455,83.725724,100.0,100.0,100.0


In [28]:
# create a table of the bottom five performing 
bottom_performing_schools = sorted_school_summary.tail(5)
bottom_performing_schools

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
8,Holden High School,[District],4761,[3094650],[650.0],77.072464,80.966394,89.182945,100.0,89.182945
5,Wilson High School,[District],4635,[3022020],[652.0],77.289752,80.934412,89.083064,100.0,89.083064
7,Bailey High School,[District],2917,[1910635],[655.0],76.629414,81.182722,88.858416,100.0,88.858416
10,Wright High School,[District],3999,[2547363],[637.0],76.842711,80.744686,88.547137,100.0,88.547137
2,Shelton High School,[District],2949,[1884411],[639.0],76.711767,81.15802,88.436758,100.0,88.436758


In [43]:
# create a new DataFrame grouped by school and by grade
schools_by_grade_df = complete_df.groupby(['school_name','grade'])

In [48]:
# calculate the average math and reading scores per grade in each school
schools_by_grade_df[['math_score','reading_score']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,math_score,reading_score
school_name,grade,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,10th,76.996772,80.907183
Bailey High School,11th,77.515588,80.945643
Bailey High School,12th,76.492218,80.912451
Bailey High School,9th,77.083676,81.303155
Cabrera High School,10th,83.154506,84.253219
Cabrera High School,11th,82.76556,83.788382
Cabrera High School,12th,83.277487,84.287958
Cabrera High School,9th,83.094697,83.676136
Figueroa High School,10th,76.539974,81.408912
Figueroa High School,11th,76.884344,80.640339
