In [None]:
import pandas as pd
import numpy as np

In [None]:
# Input CSV locations, relative to the notebook's working directory.
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Left join: keep every student row and attach the columns of its school.
# The join key is a single column, so it is passed once instead of as a list
# repeating the same name. validate="many_to_one" makes the merge raise if a
# school name appears more than once in school_data, which would otherwise
# silently duplicate student rows and inflate every count below.
school_data_complete = pd.merge(
    student_data,
    school_data,
    how="left",
    on="school_name",
    validate="many_to_one",
)

In [None]:
# Display the merged student/school table as the cell output.
school_data_complete

In [None]:
# Descriptive statistics of the merged table, as produced by DataFrame.describe().
school_data_stats = school_data_complete.describe()

In [None]:
def get_passing_stats(df, pass_mark=50):
  """Return the fraction of rows in ``df`` passing maths, reading, and both.

  A row passes a subject when its score is greater than or equal to
  ``pass_mark``. Missing scores compare as not passing but still count
  towards the total number of rows.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain ``maths_score`` and ``reading_score`` columns.
  pass_mark : int or float, default 50
    Minimum score that counts as a pass.

  Returns
  -------
  list of float
    ``[maths, reading, overall]`` as fractions between 0 and 1 (not
    percentages). All three are NaN when ``df`` has no rows.
  """
  total = df.shape[0]
  if total == 0:
    # With no rows the pass rate is undefined; report NaN instead of
    # failing with ZeroDivisionError.
    return [float('nan')] * 3

  # Build each boolean mask once and reuse it for the combined rate.
  passed_maths = df['maths_score'] >= pass_mark
  passed_reading = df['reading_score'] >= pass_mark

  return [
    int(passed_maths.sum()) / total,
    int(passed_reading.sum()) / total,
    int((passed_maths & passed_reading).sum()) / total
  ]

In [None]:
# Area-wide headline figures.
nStudents = school_data_complete["Student ID"].nunique()
nSchools = school_data_complete["School ID"].nunique()
totalBudget = school_data["budget"].sum()

# Take the means from the score columns by name. Reading them by position
# out of a describe() table silently returns the wrong statistic as soon as
# a numeric column is added, removed or reordered.
avgReading = school_data_complete["reading_score"].mean()
avgMaths = school_data_complete["maths_score"].mean()

# get_passing_stats returns [maths, reading, overall] as fractions of students.
stats = get_passing_stats(school_data_complete)
pctMathPass, pctReadingPass, pctBothPass = stats

In [None]:
# Headline figures for the whole area, keyed by their display label.
lga_summary = {
  'Total Schools': nSchools,
  'Total Students': nStudents,
  'Total Budget': totalBudget,
  'Average Maths Score': avgMaths,
  'Average Reading Score': avgReading,
  '% Passing Maths': pctMathPass,
  '% Passing Reading': pctReadingPass,
  '% Overall Passing': pctBothPass
}

# All values are scalars, so an explicit one-element index yields a one-row frame.
area_summary = pd.DataFrame(lga_summary, index=[0])

# The pass rates are stored as fractions; show them as percentages with two decimals.
for pct_column in ('% Passing Maths', '% Passing Reading', '% Overall Passing'):
  area_summary[pct_column] = area_summary[pct_column].mul(100).round(2)

area_summary.round(2)

In [None]:
# One group of student rows per school, keyed by its School ID.
grouped_school_complete = school_data_complete.groupby('School ID')

In [None]:
school_summary_data = []
columns = [
  'School Type',
  'Total Students',
  'Total School Budget',
  'Per Student Budget',
  'Average Maths Score',
  'Average Reading Score',
  '% Passing Maths',
  '% Passing Reading',
  '% Overall Passing'
]

# Row labels are collected from the data while iterating, so every summary
# row carries the name of the school it was actually computed from,
# regardless of the order the School ID groups arrive in or how many schools
# the input files contain.
index = []

for name, group in grouped_school_complete:
  # School-level attributes are repeated on every student row of the group,
  # so they are read from the first row.
  first_row = group.iloc[0]
  school_type = first_row['type']
  total_students = group.shape[0]
  total_budget = first_row['budget']
  per_student_budget = total_budget / total_students
  avg_math_score = group['maths_score'].mean()
  avg_reading_score = group['reading_score'].mean()
  # [maths, reading, overall] pass rates as fractions of the school's students.
  passing_stats = get_passing_stats(group)

  index.append(first_row['school_name'])
  school_summary_data.append([
    school_type,
    total_students,
    total_budget,
    per_student_budget,
    avg_math_score,
    avg_reading_score,
    passing_stats[0],
    passing_stats[1],
    passing_stats[2]
  ])

per_school_summary = pd.DataFrame(school_summary_data, index = index, columns = columns)


In [None]:
# Turn the pass-rate fractions into percentages and round all score columns
# to two decimals.
# NOTE: the columns are overwritten in place, so running this cell a second
# time without rebuilding per_school_summary multiplies the percentages by
# 100 again.
for pct_column in ['% Passing Maths', '% Passing Reading', '% Overall Passing']:
  per_school_summary[pct_column] = per_school_summary[pct_column].mul(100).round(2)

average_columns = ['Average Maths Score', 'Average Reading Score']
per_school_summary[average_columns] = per_school_summary[average_columns].round(2)

In [None]:
# Display the per-school table ordered by its index (the school names).
# The sorted copy is not assigned, so per_school_summary keeps its row order.
per_school_summary.sort_index()

## Highest Performing Schools (By % Overall Passing)

In [None]:
# Five schools with the highest overall pass rate, best first.
top_schools = (
  per_school_summary
  .sort_values(by='% Overall Passing', ascending=False)
  .iloc[:5]
)
top_schools

## Bottom Performing Schools (By % Overall Passing)

In [None]:
# Five schools with the lowest overall pass rate, worst first
# (sort_values sorts ascending by default).
bottom_schools = (
  per_school_summary
  .sort_values(by='% Overall Passing')
  .iloc[:5]
)
bottom_schools

## Maths Scores by Year

In [None]:
# Group keys are (year, school_name) tuples, in that order.
grouped_school_year = school_data_complete.groupby(['year','school_name'])

# Outer key: year; inner key: school name -> mean maths score.
# DataFrame(dict-of-dicts) turns outer keys into columns and inner keys into
# the row index, giving one row per school and one column per year.
maths_averages = {}

for (year, school_name), group in grouped_school_year:
    maths_averages.setdefault(year, {})[school_name] = group['maths_score'].mean()

# Label each year column as "Year <n>".
maths_scores_by_year = pd.DataFrame(maths_averages).rename(columns=lambda x: f"Year {x}")
maths_scores_by_year.round(2)

## Reading Scores by Year

In [None]:
# Outer key: year; inner key: school name -> mean reading score.
# grouped_school_year yields (year, school_name) keys, in that order.
# DataFrame(dict-of-dicts) turns outer keys into columns and inner keys into
# the row index, giving one row per school and one column per year.
reading_averages = {}

for (year, school_name), group in grouped_school_year:
    reading_averages.setdefault(year, {})[school_name] = group['reading_score'].mean()

# Label each year column as "Year <n>".
reading_scores_by_year = pd.DataFrame(reading_averages).rename(columns=lambda x: f"Year {x}")
reading_scores_by_year.round(2)

## Scores by School Spending

In [None]:
# Bin edges (per-student budget) and the matching labels, one label per
# interval between consecutive edges.
# NOTE(review): the pd.cut call that uses these does not pass right=, so by
# pandas' default each interval includes its upper edge - a value of exactly
# 585 is labelled "<$585". Confirm that is the intended boundary.
spending_bins = [0, 585, 630, 645, 680]
spending_names = ["<$585", "$585-630", "$630-645", "$645-680"]

In [None]:
# Tag each school with the spending range its per-student budget falls into.
per_school_summary['Spending Range (per student)'] = pd.cut(
  per_school_summary['Per Student Budget'],
  bins=spending_bins,
  labels=spending_names,
  include_lowest=True,
)
per_school_summary

In [None]:
# Average every numeric summary column within each spending range.
# observed=False keeps spending ranges that contain no schools (as NaN rows)
# and states the categorical-grouping behaviour explicitly.
# numeric_only=True skips text/categorical columns such as 'School Type',
# which cannot be averaged and make mean() raise on pandas 2.x.
grouped_summary_ranges = per_school_summary.groupby('Spending Range (per student)', observed=False)
mean_summary_ranges = grouped_summary_ranges.mean(numeric_only=True)
mean_summary_ranges.round(2)

In [None]:
# Keep only the score and pass-rate columns. Selecting by name (rather than
# by position) stays correct if columns are added to or removed from the
# per-school summary.
score_columns = [
  'Average Maths Score',
  'Average Reading Score',
  '% Passing Maths',
  '% Passing Reading',
  '% Overall Passing'
]
spending_summary = mean_summary_ranges[score_columns]
spending_summary.round(2)

## Scores by School Size

In [None]:
# Bin edges (number of students) and the matching labels, one label per
# interval between consecutive edges.
# NOTE(review): the pd.cut call that uses these does not pass right=, so by
# pandas' default each interval includes its upper edge - a school with
# exactly 1000 students is labelled "Small (<1000)". Confirm that is intended.
size_bins = [0, 1000, 2000, 5000]
size_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

In [None]:
# Tag each school with the size band its student count falls into.
per_school_summary['School Size'] = pd.cut(
  per_school_summary['Total Students'],
  bins=size_bins,
  labels=size_names,
  include_lowest=True,
)
per_school_summary

In [None]:
# Average every numeric summary column within each school-size band.
# observed=False keeps size bands that contain no schools (as NaN rows).
# numeric_only=True skips text/categorical columns such as 'School Type',
# which cannot be averaged and make mean() raise on pandas 2.x.
grouped_summary_size = per_school_summary.groupby('School Size', observed=False)
size_summary = grouped_summary_size.mean(numeric_only=True)

In [None]:
# Keep only the score and pass-rate columns, rounded to two decimals.
# Selecting by name keeps this cell safe to re-run: size_summary is
# reassigned to its own subset, and a positional selection would fail with
# IndexError the second time because the frame then has only five columns.
score_columns = [
  'Average Maths Score',
  'Average Reading Score',
  '% Passing Maths',
  '% Passing Reading',
  '% Overall Passing'
]
size_summary = size_summary[score_columns].round(2)
size_summary

## Scores by School Type

* Average the same score and pass-rate columns as above, this time grouped by school type.

In [None]:
# Average the numeric summary columns for each school type.
# numeric_only=True skips the categorical range/size columns, which cannot
# be averaged and make mean() raise on pandas 2.x.
grouped_summary_type = per_school_summary.groupby('School Type')
type_summary = grouped_summary_type.mean(numeric_only=True)

# Keep only the score and pass-rate columns, selected by name so the result
# does not depend on column positions.
score_columns = [
  'Average Maths Score',
  'Average Reading Score',
  '% Passing Maths',
  '% Passing Reading',
  '% Overall Passing'
]
type_summary = type_summary[score_columns].round(2)
type_summary