In [1]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [2]:
# total number of schools
total_schools = school_data['school_name'].count()

# total number of students
total_students = student_data['student_name'].count()
# student total will need to be formatted - str can't have calculations performed on them
total_students_formatted = "{:,}".format(total_students)

# total budget - help from kite.com
total_budget = "${:,.2f}".format(school_data['budget'].sum())

# average math score
average_math_score = round(school_data_complete['math_score'].mean(), 6)

# average reading score
average_reading_score = round(school_data_complete['reading_score'].mean(), 6)

# percentage of students with passing math score (70+)
plus70_math = 0
for score in school_data_complete['math_score']:
    if score >= 70:
        plus70_math = plus70_math + 1

percent_math_passing = round(((plus70_math / total_students) * 100), 6)

# percentage of students with passing reading score (70+)
plus70_reading = 0
for score in school_data_complete['reading_score']:
    if score >= 70:
        plus70_reading = plus70_reading + 1

percent_reading_passing = round(((plus70_reading / total_students) * 100), 6)

# percentage of students who passed math *and* reading (% Overall Passing)
plus70_overall = 0
for student in range(len(school_data_complete)):
    if school_data_complete['math_score'][student] >= 70 and school_data_complete['reading_score'][student] >= 70:
        plus70_overall = plus70_overall + 1

percent_overall_passing = round(((plus70_overall / total_students) * 100), 6)
print(percent_overall_passing)

65.172326


In [3]:
# dataframe for district summary
# help from Stack Overflow
district_summary = pd.DataFrame({
    "Total Schools" : total_schools,
    "Total Students" : total_students_formatted,
    "Total Budget" : total_budget,
    "Average Math Score" : average_math_score,
    "Average Reading Score" : average_reading_score,
    "% Passing Math" : percent_math_passing,
    "% Passing Reading" : percent_reading_passing,
    "% Passing Overall" : percent_overall_passing}, index=[0])
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


In [29]:
# initiate dataframe for school summary
# school name, school type, total students, total budget can be pulled from school_data df
school_summary = school_data.loc[:, ["school_name", "type", "size", "budget"]]
# rename columns
school_summary = school_summary.rename(columns={
    "school_name": "School Name", 
    "type": "School Type", 
    "size": "Total Students", 
    "budget": "Total School Budget"})
# reset index
school_summary = school_summary.set_index("School Name")
# sort schools alphabetically
school_summary = school_summary.sort_values(by="School Name")
# instantiate remaining columns of summary
# because we want 'Per Student Budget' to be formatted as currency, we need it to contain str objects
# the rest can hold floats
school_summary['Per Student Budget'] = "0"
school_summary[['Average Math Score', 'Average Reading Score', '% Passing Math', '% Passing Reading', '% Overall Passing']] = 0
school_summary.head()

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,0,0,0,0,0,0
Cabrera High School,Charter,1858,1081356,0,0,0,0,0,0
Figueroa High School,District,2949,1884411,0,0,0,0,0,0
Ford High School,District,2739,1763916,0,0,0,0,0,0
Griffin High School,Charter,1468,917500,0,0,0,0,0,0


In [30]:
# Per Student Budget
for i in range(len(school_summary)):
    perStudent_budget = school_summary['Total School Budget'][i] / school_summary['Total Students'][i]
    school_summary['Per Student Budget'][i] = "${:,.2f}".format(perStudent_budget)
# Average Math Score
# Average Reading Score
# % Passing Math
# % Passing Reading
# % Overall Passing (The percentage of students that passed math and reading)
school_summary

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  school_summary['Per Student Budget'][i] = "${:,.2f}".format(perStudent_budget)


Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,$628.00,0,0,0,0,0
Cabrera High School,Charter,1858,1081356,$582.00,0,0,0,0,0
Figueroa High School,District,2949,1884411,$639.00,0,0,0,0,0
Ford High School,District,2739,1763916,$644.00,0,0,0,0,0
Griffin High School,Charter,1468,917500,$625.00,0,0,0,0,0
Hernandez High School,District,4635,3022020,$652.00,0,0,0,0,0
Holden High School,Charter,427,248087,$581.00,0,0,0,0,0
Huang High School,District,2917,1910635,$655.00,0,0,0,0,0
Johnson High School,District,4761,3094650,$650.00,0,0,0,0,0
Pena High School,Charter,962,585858,$609.00,0,0,0,0,0


In [None]:
# top performing schools - sort and display the top 5 schools by % overall passing

In [None]:
# bottom performing schools - sort and display the bottom 5 schools by % overall passing

In [None]:
# average math scores by grade

In [None]:
# average reading scores by grade

In [None]:
# scores by school spending
# Average Math Score, Average Reading Score, % Passing Math, % Passing Reading, Overall Passing Rate (Average of the above two)

In [None]:
# scores by school size
# Average Math Score, Average Reading Score, % Passing Math, % Passing Reading, Overall Passing Rate (Average of the above two)

In [None]:
# scores by school type
# Average Math Score, Average Reading Score, % Passing Math, % Passing Reading, Overall Passing Rate (Average of the above two)