In [303]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

In [304]:
overall_data_df = pd.DataFrame(school_data_complete)
#school_data_df = pd.DataFrame(school_data)
#student_data_df = pd.DataFrame(student_data)

<h1> District Summary </h1>

In [305]:
# Calculate total number of schools and students
total_schools = overall_data_df['school_name'].nunique()
total_students = overall_data_df['student_name'].count()

In [306]:
# Calculate budget total
total_budget = sum(overall_data_df['budget'].unique())


In [307]:
# Format to currency
total_budget = "${0:,.2f}".format(total_budget)
print(total_budget)

$24,649,428.00


In [308]:
# Calc average math and reading score
avg_math = round(overall_data_df['math_score'].mean(), 2)
avg_read = round(overall_data_df['reading_score'].mean(), 2)

In [309]:
# Calc average passing score for math and reading (looking for values at or higher than 70)
math_pass = round(((overall_data_df['math_score'] >= 70).mean())*100, 2)
read_pass = round(((overall_data_df['reading_score'] >= 70).mean())*100, 2)

In [310]:
# Combinning both math and reading data into one column to also calucate passing rate
overall_pass = (pd.concat([overall_data_df['math_score'], overall_data_df['reading_score']]))
overall_passV2 = round(((overall_pass > 70).mean())*100, 2)

In [311]:
# Gather raw data to create new data frame
summary_raw_data = {
    'Total School': [total_schools],
    'Total Students': [total_students],
    'Total Budget': [total_budget],
    'Average Math Score': [avg_math],
    'Average Reading Score': [avg_read],
    '% Passing Math': [math_pass],
    '% Passing Reading': [read_pass],
    '% Overall Passing': [overall_passV2]
}


In [312]:
District_Summary_df = pd.DataFrame(summary_raw_data)
District_Summary_df

Unnamed: 0,Total School,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.99,81.88,74.98,85.81,77.68


<h1> School Summary </h>

In [373]:
overall_data_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [380]:
#Average for math and reading
#overall_data_df.groupby(['school_name', 'type']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Student ID,reading_score,math_score,School ID,size,budget
school_name,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bailey High School,District,20358.5,81.033963,77.048432,7.0,4976.0,3124928.0
Cabrera High School,Charter,16941.5,83.97578,83.061895,6.0,1858.0,1081356.0
Figueroa High School,District,4391.0,81.15802,76.711767,1.0,2949.0,1884411.0
Ford High School,District,36165.0,80.746258,77.102592,13.0,2739.0,1763916.0
Griffin High School,Charter,12995.5,83.816757,83.351499,4.0,1468.0,917500.0
Hernandez High School,District,9944.0,80.934412,77.289752,3.0,4635.0,3022020.0
Holden High School,Charter,23060.0,83.814988,83.803279,8.0,427.0,248087.0
Huang High School,District,1458.0,81.182722,76.629414,0.0,2917.0,1910635.0
Johnson High School,District,32415.0,80.966394,77.072464,12.0,4761.0,3094650.0
Pena High School,Charter,23754.5,84.044699,83.839917,9.0,962.0,585858.0


In [364]:
# identifying columns to select the ones of interest
list(overall_data_df.columns)

['Student ID',
 'student_name',
 'gender',
 'grade',
 'school_name',
 'reading_score',
 'math_score',
 'School ID',
 'type',
 'size',
 'budget']

In [365]:
# Selecting columns of interest
school_dfV2 = overall_data_df.loc[:,['school_name', 'type', 'budget', 'size']]
school_dfV2.head()

Unnamed: 0,school_name,type,budget,size
0,Huang High School,District,1910635,2917
1,Huang High School,District,1910635,2917
2,Huang High School,District,1910635,2917
3,Huang High School,District,1910635,2917
4,Huang High School,District,1910635,2917


In [366]:
# Calculate budget per student and add it to our data frame
school_dfV2['budget_per_student'] = school_dfV2['budget'] / school_dfV2['size']

In [367]:
school_dfV2.head()

Unnamed: 0,school_name,type,budget,size,budget_per_student
0,Huang High School,District,1910635,2917,655.0
1,Huang High School,District,1910635,2917,655.0
2,Huang High School,District,1910635,2917,655.0
3,Huang High School,District,1910635,2917,655.0
4,Huang High School,District,1910635,2917,655.0


In [368]:
school_dfV2 = school_dfV2.groupby(['school_name', 'type', 'size', 'budget', 'budget_per_student']).sum()
school_dfV2 = school_dfV2.reset_index()

In [332]:
# Selecting math and reading score columns
#math_col = overall_data_df.loc[:, 'math_score']
#read_col = overall_data_df.loc[:, 'reading_score']

In [333]:
#calculating average math & reading perschool
#math_col = 