# PyCitySchools

In [47]:
# dependencies
import pandas as pd
import os

In [48]:
# set file paths
schoolsPath = os.path.join("Resources", "schools_complete.csv")
studentsPath = os.path.join("Resources", "students_complete.csv")

# read in csv files
schools = pd.read_csv(schoolsPath)
#students = pd.read_csv(studentsPath)

In [49]:
# combine both files into one csv dataset
schoolsNstudents = pd.merge(students, schools, on="school_name", how="left")
#schoolsNstudents

In [50]:
# calculate the total number of schools
total_schools = schools["school_name"].count()
#total_schools

In [51]:
# calculate the total number of students
total_students = students["student_name"].count()
#total_students

In [52]:
# calculate the total budget
total_budget = schools["budget"].sum()
#total_budget

In [53]:
# calculate the average math score
average_math = students["math_score"].mean()
#average_math

In [54]:
# calculate the average reading score
average_reading = students["reading_score"].mean()
#average_reading

In [55]:
# calculate the percentage of students with a passing math score (70 or greater)
math_greater_70 = students.loc[students["math_score"] >= 70, :]
math_passing_percent = (len(math_greater_70) / total_students)*100
#math_passing_percent

In [56]:
# calculate the percentage of students with a passing reading score (70 or greater)
reading_greater_70 = students.loc[students["reading_score"] >= 70, :]
reading_passing_percent = (len(reading_greater_70) / total_students)*100
#reading_passing_percent

In [57]:
# calculate the percentage of students who passed math and reading (% Overall Passing)
passing_math_reading = students.loc[(students["math_score"] >= 70) & (students["reading_score"] >= 70), :]
overall_passing_percent = (len(passing_math_reading) / total_students)*100
#overall_passing_percent

In [58]:
# Create a dataframe to hold the above results
district_summary = pd.DataFrame({"Total Schools": [total_schools],
                        "Total Students": [total_students],
                        "Total Budget": [total_budget],
                        "Average Math Score": [average_math],
                        "Average Reading Score": [average_reading],
                        "% Passing Math": [math_passing_percent],
                        "% Passing Reading": [reading_passing_percent],
                        "% Overall Passing": [overall_passing_percent],
                        })

In [13]:
# converting integers to float for formatting
#district_summary["Total Students"] = district_summary["Total Students"].astype(float)
#district_summary["Total Budget"] = district_summary["Total Budget"].astype(float)

### District Summary

In [14]:
# formatting for better read of district summary
district_summary["Total Students"] = district_summary["Total Students"].map('{:,.0f}'.format)
district_summary["Total Budget"] = district_summary["Total Budget"].map('${:,.2f}'.format)
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


In [59]:
# calculate budget per student by schools
schools["Per Student Budget"] = (schools["budget"] / schools["size"])
schools = schools.sort_values("school_name", ascending=True).reset_index(drop=True)
#schools["Per Student Budget"] = schools["Per Student Budget"].astype(float)
#schools

In [16]:
# calculate the average math scores by schools
math_score_per_school = students.groupby("school_name") ["math_score"].sum()
students_per_school = students.groupby("school_name")["math_score"].count()
average_math_score_df = math_score_per_school / students_per_school
#average_math_score_df

In [17]:
# calculate the average reading scores by schools
reading_score_per_school = students.groupby("school_name") ["reading_score"].sum()
average_reading_score_df = reading_score_per_school / students_per_school
#average_reading_score_df

In [18]:
# calculate the percentage of students with a passing math score (70 or greater) per school
math_greater_70_per_school = students.loc[students["math_score"] >= 70, :]
group_math_greater_70_per_school = math_greater_70_per_school.groupby("school_name") ["math_score"].count()
math_passing_pecent_per_school_df = (group_math_greater_70_per_school / students_per_school) * 100
#math_passing_pecent_per_school_df

In [19]:
# calculate the percentage of students with a passing reading score (70 or greater) per school
reading_greater_70_per_school = students.loc[students["reading_score"] >= 70, :]
group_reading_greater_70_per_school = reading_greater_70_per_school.groupby("school_name") ["reading_score"].count()
reading_passing_pecent_per_school_df = (group_reading_greater_70_per_school / students_per_school) * 100
#reading_passing_pecent_per_school_df

In [20]:
# calculate the percentage of students who passed math and reading (% Overall Passing) per school
passing_math_reading_per_school = students.loc[(students["math_score"] >= 70) & (students["reading_score"] >= 70), :]
group_passing_math_reading_per_school = passing_math_reading_per_school.groupby("school_name") ["school_name"].count()
overall_passing_percent_per_school_df = (group_passing_math_reading_per_school / students_per_school)*100
#overall_passing_percent_per_school_df

In [60]:
# Collect all require student marks df and place in a summary dataframe
student_marks_summary_df = pd.DataFrame({"Average Math Score": average_math_score_df,
                                         "Average Reading Score": average_reading_score_df,
                                         "% Passing Math": math_passing_pecent_per_school_df,
                                         "% Passing Reading": reading_passing_pecent_per_school_df,
                                         "% Overall Passing": overall_passing_percent_per_school_df
                                          }).reset_index()
#student_marks_summary_df

## School Summary

In [22]:
# merge the schools and the student marks summary for school summary output; along with viewable formats, text alignments and hide index
school_summary = pd.merge(schools, student_marks_summary_df, on="school_name", how="left")
del school_summary["School ID"]
school_summary = school_summary.rename(columns={"school_name": "School Name", 
                                                "type": "School Type",
                                                "size": "Total Students",
                                                "budget": "Total School Budget"                                    
                                                })
#school_summary["School Type"] = school_summary["School Type"].astype(float)
#school_summary["Total Students"] = school_summary["Total Students"].astype(float)
#school_summary["Per Student Budget"] = school_summary["Per Student Budget"].astype(float)

#school_summary["Total Students"] = school_summary["Total Students"].map('{:,.0f}'.format)
#school_summary["Total School Budget"] = school_summary["Total School Budget"].map('${:,.2f}'.format)
#school_summary["Per Student Budget"] = school_summary["Per Student Budget"].map('${:.2f}'.format)
#school_summary = school_summary.style.set_properties(**{'text-align': 'left'}).set_table_styles([ dict(selector='th', props=[('text-align', 'left')] )]).hide_index()
school_summary

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,Bailey High School,District,4976,3124928,628.0,77.048432,81.033963,66.680064,81.93328,54.642283
1,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
2,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
3,Ford High School,District,2739,1763916,644.0,77.102592,80.746258,68.309602,79.299014,54.289887
4,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
5,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,53.527508
6,Holden High School,Charter,427,248087,581.0,83.803279,83.814988,92.505855,96.252927,89.227166
7,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
8,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,53.539172
9,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,90.540541


## Top 5 Performing Schools (By % Overall Passing)

In [43]:
# Top 5 Performing Schools (By % Overall Passing)
school_summary_sort = school_summary.sort_values("% Overall Passing", ascending=False).reset_index(drop=True)
school_summary_sort.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing,Spending Ranges,School Size Ranges
0,Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578,94.133477,97.039828,91.334769,$580-$594,Medium (1000-2500)
1,Thomas High School,Charter,1635,1043130,638.0,83.418349,83.84893,93.272171,97.308869,90.948012,$625-$639,Medium (1000-2500)
2,Griffin High School,Charter,1468,917500,625.0,83.351499,83.816757,93.392371,97.138965,90.599455,$610-$624,Medium (1000-2500)
3,Wilson High School,Charter,2283,1319574,578.0,83.274201,83.989488,93.867718,96.539641,90.582567,$0-$579,Medium (1000-2500)
4,Pena High School,Charter,962,585858,609.0,83.839917,84.044699,94.594595,95.945946,90.540541,$595-$609,Small (0-1000)


## Bottom 5 Performing Schools (By % Overall Passing)

In [44]:
#Bottom 5 Performing Schools (By % Overall Passing)
school_summary_sort = school_summary.sort_values("% Overall Passing", ascending=True).reset_index(drop=True)
school_summary_sort.head()

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing,Spending Ranges,School Size Ranges
0,Rodriguez High School,District,3999,2547363,637.0,76.842711,80.744686,66.366592,80.220055,52.988247,$625-$639,Large (2500-5000)
1,Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802,65.988471,80.739234,53.204476,$625-$639,Medium (1000-2500)
2,Huang High School,District,2917,1910635,655.0,76.629414,81.182722,65.683922,81.316421,53.513884,$640-$656,Medium (1000-2500)
3,Hernandez High School,District,4635,3022020,652.0,77.289752,80.934412,66.752967,80.862999,53.527508,$640-$656,Large (2500-5000)
4,Johnson High School,District,4761,3094650,650.0,77.072464,80.966394,66.057551,81.222432,53.539172,$640-$656,Large (2500-5000)


In [25]:
# Math Scores by Grade
# Create a table that lists the average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
grade9_math_scores = (students.loc[students["grade"] == "9th"]).groupby("school_name")["math_score"].mean()
grade10_math_scores = (students.loc[students["grade"] == "10th"]).groupby("school_name")["math_score"].mean()
grade11_math_scores = (students.loc[students["grade"] == "11th"]).groupby("school_name")["math_score"].mean()
grade12_math_scores = (students.loc[students["grade"] == "12th"]).groupby("school_name")["math_score"].mean()
#grade12_math_scores

## Math Scores by Grade by Schools

In [63]:
# Collect all require student math marks df by grade by school and place in a summary dataframe
math_scores_per_grade_summary_df = pd.DataFrame({"Grade 9 Math Avg": grade9_math_scores,
                                         "Grade 10 Math Avg": grade10_math_scores,
                                         "Grade 11 Math Avg": grade11_math_scores,
                                         "Grade 12 Math Avg": grade12_math_scores
                                          }).reset_index()
math_scores_per_grade_summary_df

Unnamed: 0,school_name,Grade 9 Math Avg,Grade 10 Math Avg,Grade 11 Math Avg,Grade 12 Math Avg
0,Bailey High School,77.083676,76.996772,77.515588,76.492218
1,Cabrera High School,83.094697,83.154506,82.76556,83.277487
2,Figueroa High School,76.403037,76.539974,76.884344,77.151369
3,Ford High School,77.361345,77.672316,76.918058,76.179963
4,Griffin High School,82.04401,84.229064,83.842105,83.356164
5,Hernandez High School,77.438495,77.337408,77.136029,77.186567
6,Holden High School,83.787402,83.429825,85.0,82.855422
7,Huang High School,77.027251,75.908735,76.446602,77.225641
8,Johnson High School,77.187857,76.691117,77.491653,76.863248
9,Pena High School,83.625455,83.372,84.328125,84.121547


In [27]:
# Reading Scores by Grade
# Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.
grade9_reading_scores = (students.loc[students["grade"] == "9th"]).groupby("school_name")["reading_score"].mean()
grade10_reading_scores = (students.loc[students["grade"] == "10th"]).groupby("school_name")["reading_score"].mean()
grade11_reading_scores = (students.loc[students["grade"] == "11th"]).groupby("school_name")["reading_score"].mean()
grade12_reading_scores = (students.loc[students["grade"] == "12th"]).groupby("school_name")["reading_score"].mean()
#grade12_reading_scores

## Reading Scores by Grade by Schools

In [62]:
# Collect all require student reading marks df by grade by school and place in a summary dataframe
reading_scores_per_grade_summary_df = pd.DataFrame({"Grade 9 Reading Avg": grade9_reading_scores,
                                         "Grade 10 Avg Reading Avg": grade10_reading_scores,
                                         "Grade 11 Avg Reading Avg": grade11_reading_scores,
                                         "Grade 12 Avg Reading Avg": grade12_reading_scores
                                          }).reset_index()
reading_scores_per_grade_summary_df

Unnamed: 0,school_name,Grade 9 Reading Avg,Grade 10 Avg Reading Avg,Grade 11 Avg Reading Avg,Grade 12 Avg Reading Avg
0,Bailey High School,81.303155,80.907183,80.945643,80.912451
1,Cabrera High School,83.676136,84.253219,83.788382,84.287958
2,Figueroa High School,81.198598,81.408912,80.640339,81.384863
3,Ford High School,80.632653,81.262712,80.403642,80.662338
4,Griffin High School,83.369193,83.706897,84.288089,84.013699
5,Hernandez High School,80.86686,80.660147,81.39614,80.857143
6,Holden High School,83.677165,83.324561,83.815534,84.698795
7,Huang High School,81.290284,81.512386,81.417476,80.305983
8,Johnson High School,81.260714,80.773431,80.616027,81.227564
9,Pena High School,83.807273,83.612,84.335938,84.59116


In [29]:
# check range of values in column
print(school_summary["Per Student Budget"].min())
print(school_summary["Per Student Budget"].max())

578.0
655.0


In [30]:
# create bins and labels as bins for scores based on spending ranges
bins = [0, 580, 595, 610, 625, 640, 655]
group_labels = ["$0-$579", "$580-$594", "$595-$609", "$610-$624", "$625-$639", "$640-$656"]

In [65]:
# cut Per Student Budget and place into bins
pd.cut(school_summary["Per Student Budget"], bins, labels=group_labels).head()

0    Small (0-1000)
1    Small (0-1000)
2    Small (0-1000)
3    Small (0-1000)
4    Small (0-1000)
Name: Per Student Budget, dtype: category
Categories (3, object): [Small (0-1000) < Medium (1000-2500) < Large (2500-5000)]

In [32]:
# Place the data series into a new column inside of the DataFrame
school_summary["Spending Ranges"] = pd.cut(school_summary["Per Student Budget"], bins, labels=group_labels)
#school_summary

## Scores by School Spending

In [33]:
# Create a GroupBy object based upon "Spending Ranges"
school_summary_group = school_summary.groupby("Spending Ranges")

# Find how many rows fall into each bin
print(school_summary_group["Average Math Score"].count())

# Get the average of each column within the GroupBy object
school_summary_group[["Average Math Score", 
                      "Average Reading Score", 
                      "% Passing Math", 
                      "% Passing Reading", 
                      "% Overall Passing"]].mean()

Spending Ranges
$0-$579      1
$580-$594    3
$595-$609    2
$610-$624    1
$625-$639    4
$640-$656    4
Name: Average Math Score, dtype: int64


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Spending Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
$0-$579,83.274201,83.989488,93.867718,96.539641,90.582567
$580-$594,83.515798,83.915256,93.324222,96.634622,90.298423
$595-$609,83.599686,83.885211,94.230858,95.900287,90.216324
$610-$624,83.351499,83.816757,93.392371,97.138965,90.599455
$625-$639,78.505315,81.6964,73.076824,85.050359,62.945755
$640-$656,77.023555,80.957446,66.70101,80.675217,53.717613


In [34]:
# check range of values in column
print(school_summary["Total Students"].min())
print(school_summary["Total Students"].max())

427
4976


In [35]:
# create bins and labels as bins for scores based on school size
bins = [0, 1000, 3000, 5000]
group_labels = ["Small (0-1000)", "Medium (1000-2500)", "Large (2500-5000)"]

In [36]:
# cut Total Students and place into bins
pd.cut(school_summary["Total Students"], bins, labels=group_labels).head()
#pd.cut(np.array([0, 1000, 2000, 3000, 5000]),3, labels=["Small", "Medium", "Large"])

0     Large (2500-5000)
1    Medium (1000-2500)
2    Medium (1000-2500)
3    Medium (1000-2500)
4    Medium (1000-2500)
Name: Total Students, dtype: category
Categories (3, object): [Small (0-1000) < Medium (1000-2500) < Large (2500-5000)]

In [37]:
# Place the data series into a new column inside of the DataFrame
school_summary["School Size Ranges"] = pd.cut(school_summary["Total Students"], bins, labels=group_labels)
#school_summary

## Scores by School Size

In [38]:
# Create a GroupBy object based upon "School Size Ranges"
school_summary_group = school_summary.groupby("School Size Ranges")

# Find how many rows fall into each bin
print(school_summary_group["Average Math Score"].count())

# Get the average of each column within the GroupBy object
school_summary_group[["Average Math Score", 
                      "Average Reading Score", 
                      "% Passing Math", 
                      "% Passing Reading", 
                      "% Overall Passing"]].mean()

School Size Ranges
Small (0-1000)        2
Medium (1000-2500)    9
Large (2500-5000)     4
Name: Average Math Score, dtype: int64


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Size Ranges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (0-1000),83.821598,83.929843,93.550225,96.099437,89.883853
Medium (1000-2500),81.176821,82.933187,84.649798,91.316412,78.299832
Large (2500-5000),77.06334,80.919864,66.464293,81.059691,53.674303


In [39]:
# check for the different object values in column
print(school_summary["School Type"].unique())

['District' 'Charter']


## Scores by School Types

In [40]:
scores_by_school_type = pd.DataFrame({"Average Math Score": school_summary.groupby("School Type") ["Average Math Score"].mean(), 
                                      "Average Reading Score": school_summary.groupby("School Type") ["Average Reading Score"].mean(),
                                      "% Passing Math": pd.to_numeric(school_summary["% Passing Math"]).groupby(school_summary["School Type"]).mean(),
                                      "% Passing Reading": pd.to_numeric(school_summary["% Passing Reading"]).groupby(school_summary["School Type"]).mean(),
                                      "% Overall Passing": pd.to_numeric(school_summary["% Overall Passing"]).groupby(school_summary["School Type"]).mean()
                                   })
scores_by_school_type

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,93.62083,96.586489,90.432244
District,76.956733,80.966636,66.548453,80.799062,53.672208
