In [1]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

In [2]:
overall_data_df = pd.DataFrame(school_data_complete)
#school_data_df = pd.DataFrame(school_data)
#student_data_df = pd.DataFrame(student_data)

<h1> District Summary </h1>

In [26]:
# Calculate total number of schools and students
total_schools = overall_data_df['school_name'].nunique()
total_students = overall_data_df['student_name'].count()

In [27]:
# Calculate budget total
total_budget = sum(overall_data_df['budget'].unique())


In [28]:
# Format to currency
total_budget = "${0:,.2f}".format(total_budget)
print(total_budget)

$24,649,428.00


In [6]:
# Calc average math and reading score
avg_math = round(overall_data_df['math_score'].mean(), 2)
avg_read = round(overall_data_df['reading_score'].mean(), 2)

In [7]:
# Calc average passing score for math and reading (looking for values at or higher than 70)
math_pass = round(((overall_data_df['math_score'] >= 70).mean())*100, 2)
read_pass = round(((overall_data_df['reading_score'] >= 70).mean())*100, 2)

In [46]:
# Combinning both math and reading data into one column to also calucate passing rate
overall_pass = overall_data_df[(overall_data_df['math_score'] >= 70) & (overall_data_df['reading_score'] >= 70)]
#overall_passV2 = round(((overall_pass).count()/(total_students))*100, 2)

In [55]:
overall_passV2 = round((overall_pass.student_name.count()/total_students)*100, 2)

In [56]:
# Gather raw data to create new data frame
summary_raw_data = {
    'Total School': [total_schools],
    'Total Students': [total_students],
    'Total Budget': [total_budget],
    'Average Math Score': [avg_math],
    'Average Reading Score': [avg_read],
    '% Passing Math': [math_pass],
    '% Passing Reading': [read_pass],
    '% Overall Passing': [overall_passV2]
}


In [57]:
District_Summary_df = pd.DataFrame(summary_raw_data)
District_Summary_df

Unnamed: 0,Total School,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.99,81.88,74.98,85.81,65.17


<h1> School Summary </h>

In [58]:
# Making a copy of OG df to add extra columns (dont want to mess with the OG data)
overall_data_plus_avg_df = overall_data_df
overall_data_plus_avg_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,% average_math_scores,% average_reading_scores
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,True,False
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,False,True
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,False,True
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,False,False
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,True,True


In [12]:
overall_data_plus_avg_df['% average_math_scores'] = (overall_data_plus_avg_df.math_score >= 70)
overall_data_plus_avg_df['% average_reading_scores'] = (overall_data_plus_avg_df.reading_score >= 70)
overall_data_plus_avg_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,% average_math_scores,% average_reading_scores
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,True,False
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,False,True
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,False,True
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,False,False
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,True,True


In [51]:
overall_data_plus_avg_df[overall_data_plus_avg_df['school_name'] == 'Bailey High School'].budget.value_counts()

3124928    4976
Name: budget, dtype: int64

In [16]:
#Average for math and reading
school_df = overall_data_plus_avg_df.groupby(['school_name', 'type']).mean().reset_index()
school_df.head()

Unnamed: 0,school_name,type,Student ID,reading_score,math_score,School ID,size,budget,% average_math_scores,% average_reading_scores
0,Bailey High School,District,20358.5,81.033963,77.048432,7.0,4976.0,3124928.0,0.666801,0.819333
1,Cabrera High School,Charter,16941.5,83.97578,83.061895,6.0,1858.0,1081356.0,0.941335,0.970398
2,Figueroa High School,District,4391.0,81.15802,76.711767,1.0,2949.0,1884411.0,0.659885,0.807392
3,Ford High School,District,36165.0,80.746258,77.102592,13.0,2739.0,1763916.0,0.683096,0.79299
4,Griffin High School,Charter,12995.5,83.816757,83.351499,4.0,1468.0,917500.0,0.933924,0.97139


In [17]:
# identifying columns to select the ones of interest
list(school_df.columns)

['school_name',
 'type',
 'Student ID',
 'reading_score',
 'math_score',
 'School ID',
 'size',
 'budget',
 '% average_math_scores',
 '% average_reading_scores']

In [18]:
# Selecting columns of interest
school_df = school_df.loc[:,['school_name', 'type', 'size', 'budget', 'math_score', 'reading_score', '% average_math_scores', '% average_reading_scores']]
school_df.head()

Unnamed: 0,school_name,type,size,budget,math_score,reading_score,% average_math_scores,% average_reading_scores
0,Bailey High School,District,4976.0,3124928.0,77.048432,81.033963,0.666801,0.819333
1,Cabrera High School,Charter,1858.0,1081356.0,83.061895,83.97578,0.941335,0.970398
2,Figueroa High School,District,2949.0,1884411.0,76.711767,81.15802,0.659885,0.807392
3,Ford High School,District,2739.0,1763916.0,77.102592,80.746258,0.683096,0.79299
4,Griffin High School,Charter,1468.0,917500.0,83.351499,83.816757,0.933924,0.97139


In [19]:
# Create per student budget column in df
school_df['per_student_budget'] = school_df['budget']/school_df['size']
school_df.head()

Unnamed: 0,school_name,type,size,budget,math_score,reading_score,% average_math_scores,% average_reading_scores,per_student_budget
0,Bailey High School,District,4976.0,3124928.0,77.048432,81.033963,0.666801,0.819333,628.0
1,Cabrera High School,Charter,1858.0,1081356.0,83.061895,83.97578,0.941335,0.970398,582.0
2,Figueroa High School,District,2949.0,1884411.0,76.711767,81.15802,0.659885,0.807392,639.0
3,Ford High School,District,2739.0,1763916.0,77.102592,80.746258,0.683096,0.79299,644.0
4,Griffin High School,Charter,1468.0,917500.0,83.351499,83.816757,0.933924,0.97139,625.0
