In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Open the schools data file
schools = os.path.join("raw_data", "schools_complete.csv")
schools_df = pd.read_csv(schools)
schools_df.head()

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [3]:
# Open the students data file
students = os.path.join("raw_data", "students_complete.csv")
students_df = pd.read_csv(students)
students_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [4]:
schools_df.dtypes

School ID     int64
name         object
type         object
size          int64
budget        int64
dtype: object

In [5]:
students_df.dtypes

Student ID        int64
name             object
gender           object
grade            object
school           object
reading_score     int64
math_score        int64
dtype: object

In [6]:
## Part One - District Summary

# Key district metrics for distric summary table

total_schools  = len(schools_df)

total_students = schools_df["size"].sum()

total_budget = schools_df["budget"].sum()

avg_math_score = students_df["math_score"].mean()

avg_reading_score = students_df["reading_score"].mean()

# % Passing Math based on 70
math_pass = students_df.loc[(students_df["math_score"] >= 70)]
math_count = math_pass["math_score"].count()

# Need to pull in total students by school
passing_math = math_count/total_students*100

# % Passing Math based on 70
read_pass = students_df.loc[(students_df["reading_score"] >= 70)]
read_count = read_pass["reading_score"].count()
passing_reading = read_count/total_students*100

overall_passing = (passing_math + passing_reading)/2

In [15]:
# Create district summary table
district_summary_table = pd.DataFrame({"Total Schools": [total_schools],
                                      "Total Students": [total_students],
                                      "Total Budget": [total_budget,],
                                      "Average Math Score": [avg_math_score],
                                      "Average Reading Score": [avg_reading_score],
                                      "% Passing Math": [passing_math],
                                      "% Passing Reading": [passing_reading],
                                      "Overall Passing Score": [overall_passing],
                                      })
district_summary_table = district_summary_table[["Total Schools",
                                                 "Total Students",
                                                 "Total Budget",
                                                 "Average Math Score",
                                                 "Average Reading Score",
                                                 "% Passing Math",
                                                 "% Passing Reading",
                                                 "Overall Passing Score",
                                                 ]]
district_summary_table = district_summary_table.round(2)

# Format table
district_summary_table["Total Students"] = district_summary_table["Total Students"].map("{0:,.0f}".format)
district_summary_table["Total Budget"] = district_summary_table["Total Budget"].map("{0:,.0f}".format)
district_summary_table["% Passing Math"] = district_summary_table["% Passing Math"].map("{0:,.2f}%".format)
district_summary_table["% Passing Reading"] = district_summary_table["% Passing Reading"].map("{0:,.2f}%".format)
district_summary_table["Overall Passing Score"] = district_summary_table["Overall Passing Score"].map("{0:,.2f}%".format)

##Part One Answer
#Print Table
print("District Summary")
district_summary_table

District Summary


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Score
0,15,39170,24649428,78.99,81.88,74.98%,85.81%,80.39%


In [8]:
##Part Two - School Summary

# Add per student budget to schools_df
schools_df["Per Student Budget"] = schools_df["budget"]/schools_df["size"]

# Renames "name" to "school"
schools_df = schools_df.rename(columns={"name": "school"})

# Merge student and school tables
district_df = pd.merge(students_df, schools_df, on="school")
district_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score,School ID,type,size,budget,Per Student Budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,655.0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,655.0
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,655.0
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,655.0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,655.0


In [9]:
# Add pass/fail for math and reading scores, reorder columns to put pass/fail next to actual scores

math_pass_fail = []
for row in district_df['math_score']:
    if row >= 70:
        math_pass_fail.append('Pass')
    else:
        math_pass_fail.append('Fail')
district_df['Math Pass/Fail'] = math_pass_fail

reading_pass_fail = []
for row in district_df['reading_score']:
    if row >= 70:
        reading_pass_fail.append('Pass')
    else:
        reading_pass_fail.append('Fail')
district_df['Reading Pass/Fail'] = reading_pass_fail
district_df.head()

district_df = district_df[["Student ID", "name", "gender", "grade", "school", "School ID", "reading_score", 
                           "Reading Pass/Fail", "math_score", "Math Pass/Fail", "type", "size", "budget", 
                           "Per Student Budget"]]
district_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,School ID,reading_score,Reading Pass/Fail,math_score,Math Pass/Fail,type,size,budget,Per Student Budget
0,0,Paul Bradley,M,9th,Huang High School,0,66,Fail,79,Pass,District,2917,1910635,655.0
1,1,Victor Smith,M,12th,Huang High School,0,94,Pass,61,Fail,District,2917,1910635,655.0
2,2,Kevin Rodriguez,M,12th,Huang High School,0,90,Pass,60,Fail,District,2917,1910635,655.0
3,3,Dr. Richard Scott,M,12th,Huang High School,0,67,Fail,58,Fail,District,2917,1910635,655.0
4,4,Bonnie Ray,F,9th,Huang High School,0,97,Pass,84,Pass,District,2917,1910635,655.0


In [10]:
# Group students_df by school
students_grouped = district_df.groupby(["school"])
students_grouped_desc = pd.DataFrame(students_grouped.describe())

In [11]:
# Key school metrics for school summary report
schools_summary = schools_df[["school", "type", "size", "budget"]]
schools_summary["Per Student Budget"] = schools_summary["budget"]/schools_summary["size"]

## This is not the way to do it.  I think it is inserting the school name and the mean vlue, not just the mean value.
# Calculate mean scores by school
school_mean_math = students_grouped["math_score"].mean()
school_mean_reading = students_grouped["reading_score"].mean()
print(school_mean_math)
schools_summary["Average Math Score"] = school_mean_math
schools_summary["Average Reading Score"] = school_mean_reading
schools_summary

school
Bailey High School       77.048432
Cabrera High School      83.061895
Figueroa High School     76.711767
Ford High School         77.102592
Griffin High School      83.351499
Hernandez High School    77.289752
Holden High School       83.803279
Huang High School        76.629414
Johnson High School      77.072464
Pena High School         83.839917
Rodriguez High School    76.842711
Shelton High School      83.359455
Thomas High School       83.418349
Wilson High School       83.274201
Wright High School       83.682222
Name: math_score, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,school,type,size,budget,Per Student Budget,Average Math Score,Average Reading Score
0,Huang High School,District,2917,1910635,655.0,,
1,Figueroa High School,District,2949,1884411,639.0,,
2,Shelton High School,Charter,1761,1056600,600.0,,
3,Hernandez High School,District,4635,3022020,652.0,,
4,Griffin High School,Charter,1468,917500,625.0,,
5,Wilson High School,Charter,2283,1319574,578.0,,
6,Cabrera High School,Charter,1858,1081356,582.0,,
7,Bailey High School,District,4976,3124928,628.0,,
8,Holden High School,Charter,427,248087,581.0,,
9,Pena High School,Charter,962,585858,609.0,,


In [16]:
##Part Six - Rading Score by Grade
math_scores_grade = pd.pivot_table(district_df, index = ["school"],
                                   values = ("math_score"), 
                                   columns = ["grade"], aggfunc=np.mean)

# Clean up header and reorder columns
math_flattened = pd.DataFrame(math_scores_grade.to_records())
math_flattened = math_flattened[["school", "9th", "10th", "11th", "12th"]]
print("Math Scores by Grade")
math_flattened

Math Scores by Grade


Unnamed: 0,school,9th,10th,11th,12th
0,Bailey High School,77.083676,76.996772,77.515588,76.492218
1,Cabrera High School,83.094697,83.154506,82.76556,83.277487
2,Figueroa High School,76.403037,76.539974,76.884344,77.151369
3,Ford High School,77.361345,77.672316,76.918058,76.179963
4,Griffin High School,82.04401,84.229064,83.842105,83.356164
5,Hernandez High School,77.438495,77.337408,77.136029,77.186567
6,Holden High School,83.787402,83.429825,85.0,82.855422
7,Huang High School,77.027251,75.908735,76.446602,77.225641
8,Johnson High School,77.187857,76.691117,77.491653,76.863248
9,Pena High School,83.625455,83.372,84.328125,84.121547


In [17]:
##Part Six - Rading Score by Grade
reading_scores_grade = pd.pivot_table(district_df, index = ["school"],
                                   values = ("reading_score"), 
                                   columns = ["grade"], aggfunc=np.mean)

# Clean up header and reorder columns
reading_flattened = pd.DataFrame(reading_scores_grade.to_records())
reading_flattened = reading_flattened[["school", "9th", "10th", "11th", "12th"]]
print("Reading Scores by Grade")
reading_flattened

Reading Scores by Grade


Unnamed: 0,school,9th,10th,11th,12th
0,Bailey High School,81.303155,80.907183,80.945643,80.912451
1,Cabrera High School,83.676136,84.253219,83.788382,84.287958
2,Figueroa High School,81.198598,81.408912,80.640339,81.384863
3,Ford High School,80.632653,81.262712,80.403642,80.662338
4,Griffin High School,83.369193,83.706897,84.288089,84.013699
5,Hernandez High School,80.86686,80.660147,81.39614,80.857143
6,Holden High School,83.677165,83.324561,83.815534,84.698795
7,Huang High School,81.290284,81.512386,81.417476,80.305983
8,Johnson High School,81.260714,80.773431,80.616027,81.227564
9,Pena High School,83.807273,83.612,84.335938,84.59116
