In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
school_data = "Resources/schools_complete.csv"
student_data = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data_df = pd.read_csv(school_data)
student_data_df = pd.read_csv(student_data)


In [2]:
#BUILD SCHOOL SUMMARY FROM SCHOOL_DATA_DF

#Remove the 'school id' column
del school_data_df['School ID']

#Rename columns
school_data_df = school_data_df.rename(columns={"type":"School Type",
                                                "size":"Total Students",
                                                "budget":"Total School Budget"})

#Add Per Student Budget
school_data_df['Per Student Budget'] = school_data_df['Total School Budget'] / school_data_df['Total Students']

#Format columns                                       
school_data_df["Total School Budget"] = school_data_df["Total School Budget"].astype(float).map("${:,.2f}".format)
school_data_df["Per Student Budget"] = school_data_df["Per Student Budget"].astype(float).map("${:,.2f}".format)

In [3]:
#Merge the Original Data Sets
merged_df = pd.merge(school_data_df, student_data_df, on="school_name", how="outer")
merged_df

Unnamed: 0,school_name,School Type,Total Students,Total School Budget,Per Student Budget,Student ID,student_name,gender,grade,reading_score,math_score
0,Huang High School,District,2917,"$1,910,635.00",$655.00,0,Paul Bradley,M,9th,66,79
1,Huang High School,District,2917,"$1,910,635.00",$655.00,1,Victor Smith,M,12th,94,61
2,Huang High School,District,2917,"$1,910,635.00",$655.00,2,Kevin Rodriguez,M,12th,90,60
3,Huang High School,District,2917,"$1,910,635.00",$655.00,3,Dr. Richard Scott,M,12th,67,58
4,Huang High School,District,2917,"$1,910,635.00",$655.00,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...,...,...,...,...,...
39165,Thomas High School,Charter,1635,"$1,043,130.00",$638.00,39165,Donna Howard,F,12th,99,90
39166,Thomas High School,Charter,1635,"$1,043,130.00",$638.00,39166,Dawn Bell,F,10th,95,70
39167,Thomas High School,Charter,1635,"$1,043,130.00",$638.00,39167,Rebecca Tanner,F,9th,73,84
39168,Thomas High School,Charter,1635,"$1,043,130.00",$638.00,39168,Desiree Kidd,F,10th,99,90


In [4]:
#AVERAGE SCORES PER SCHOOL

#Group students by school name
grouped_students = student_data_df.groupby(['school_name'])

#Gather Average scores per school
avg_math = round(grouped_students["math_score"].mean(), 2)
avg_reading = round(grouped_students["reading_score"].mean(), 2)

In [5]:
#BINS FOR INDIVIDUAL PASSING SCORES

#Label Reading Scores pass
group_names = ["pass"]
bins = [70,100]
merged_df["Reading Summary"] = pd.cut(merged_df["reading_score"], bins, labels=group_names, include_lowest=True)

#Label Math Scores pass
group_names = ["pass"]
bins = [70,100]
merged_df["Math Summary"] = pd.cut(merged_df["math_score"], bins, labels=group_names, include_lowest=True)

In [6]:
#CALCULATE %PASSING FOR READING AND MATH

#Group dataframe by School
grouped_schools_df = merged_df.groupby(['school_name'])

#Count number of passing students per subject, per school
reading_summary_df = grouped_schools_df["Reading Summary"].value_counts()
math_summary_df = grouped_schools_df["Math Summary"].value_counts()

#Create dataframe with passing reading and math scores
test_scores_df = pd.merge(reading_summary_df, math_summary_df, on="school_name", how="outer")

#Merge passing test scores with school data
passing_school_df = pd.merge(test_scores_df, school_data_df, on="school_name", how="outer")
passing_school_df

#Percent Passing Math
passing_school_df["Percent Pass Math"] = round(passing_school_df["Math Summary"]/passing_school_df["Total Students"] * 100, 2).astype(str) + '%'

#Percent Passing Reading
passing_school_df["Percent Pass Reading"] = round(passing_school_df["Reading Summary"]/passing_school_df["Total Students"] * 100, 2).astype(str) + '%'

In [None]:
passing_school_df.head()

In [None]:
merged_df.head()

In [7]:
#Filter only rows that are passing both reading and math
passing_both_df = merged_df.loc[(merged_df["reading_score"] > 69) & (merged_df["math_score"] > 69)]

#Group dataframe by School
passing_grouped_df = passing_both_df.groupby(['school_name'])

new = passing_grouped_df["school_name"].value_counts()

ValueError: cannot insert school_name, already exists

In [None]:
passing_school_df.head()

In [None]:
#Merge with passing_
passing_school_df = pd.merge(pass_only_df, passing_school_df, on="school_name", how="outer")
passing_school_df.head()

In [None]:
#set index to school name
school_data_df = school_data_df.set_index("School Name")

In [None]:
#Group remaining by school
pass_by_school = pass_only_df.groupby(['school_name'])

In [None]:
#Drop rows with at least one failing score
pass_only_df = merged_df.dropna()

#Count remaining entries by school
pass_only = pass_only_df["school_name"].value_counts()
pass_only