# PyCity Schools Analysis

## Analysis

## Code

In [134]:
import os
from pathlib import Path
import pandas as pd

### Loading the data and looking at properties

In [135]:
# Locations of the two input CSVs, relative to the notebook's working directory.
schools_path = Path('Resources/schools_complete.csv')
students_path = Path('Resources/students_complete.csv')

# Read both files with pandas' default CSV settings, schools first and students second.
schools_df, students_df = (
    pd.read_csv(csv_path) for csv_path in (schools_path, students_path)
)

In [136]:
# Summarise each frame: its column index, its (rows, columns) shape, and per-column dtypes.
for name, df in zip(('Schools', 'Students'), (schools_df, students_df)):
    summary_rows = (
        ('Dataframe for:', name),
        ('Columns:', df.columns),
        ('Shape:', df.shape),
        ('column dtypes:', df.dtypes),
    )
    for label, value in summary_rows:
        print(label, value)
    # Separator between the two reports: an explicit newline plus print's own line ending.
    print('\n')


Dataframe for: Schools
Columns: Index(['School ID', 'school_name', 'type', 'size', 'budget'], dtype='object')
Shape: (15, 5)
column dtypes: School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object


Dataframe for: Students
Columns: Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
       'reading_score', 'math_score'],
      dtype='object')
Shape: (39170, 7)
column dtypes: Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object




In [137]:
# Report how many missing (NA) entries each column of each frame contains.
for name, df in zip(('Schools', 'Students'), (schools_df, students_df)):
    print(f'In dataframe {name}:')
    print('Column : count of NA values')
    # isna().sum() gives one NA count per column, keyed by column name.
    na_counts = df.isna().sum()
    for col, na_count in na_counts.items():
        print(f'{col} :', na_count)
    print('\n')

In dataframe Schools:
Column : count of NA values
School ID : 0
school_name : 0
type : 0
size : 0
budget : 0


In dataframe Students:
Column : count of NA values
Student ID : 0
student_name : 0
gender : 0
grade : 0
school_name : 0
reading_score : 0
math_score : 0




### Merge dataframes

In [138]:
# Check that `school_name` is a safe key for merging the two frames:
#   1. every School ID and every school name in schools_df is unique, and
#   2. the set of names used in students_df is exactly the set of names in schools_df.
# Comparing only the *number* of distinct names would miss a school whose name is
# spelled differently in the two files, because the counts can still agree.
# Assumption: School ID is a complete list of unique school identifiers and there are no incorrect entries
school_keys_unique = schools_df['School ID'].is_unique and schools_df['school_name'].is_unique
school_names_match = set(schools_df['school_name']) == set(students_df['school_name'])
# Last expression of the cell: displays True only when both conditions hold.
school_keys_unique and school_names_match

True

In [139]:
# Attach each school's attributes to every student row, joining on the shared `school_name` column.
# validate='one_to_many' makes pandas raise a MergeError if a school name appears more than
# once in schools_df, which would otherwise silently duplicate student rows in the result.
merged_df = pd.merge(
    schools_df,
    students_df,
    on='school_name',
    how='inner',
    validate='one_to_many',
)
# Displayed as (rows, columns) so the row count can be compared with students_df.
merged_df.shape

(39170, 11)

### Calculate District Summary Values

In [140]:
# Number of distinct School ID values in the schools table
# (dropna=False so an NA value, if present, is counted just as unique() would include it).
num_unqiue_schools = schools_df['School ID'].nunique(dropna=False)
num_unqiue_schools

15

In [141]:
# Number of distinct Student ID values in the students table
# (dropna=False so an NA value, if present, is counted just as unique() would include it).
total_students = students_df['Student ID'].nunique(dropna=False)
total_students

39170

In [142]:
# District-wide budget: the per-school `budget` values from schools_df added together.
total_budget = schools_df.loc[:, 'budget'].sum()
total_budget

24649428

In [143]:
# Mean of each score column over all rows of the merged frame (math first, then reading).
avg_math_score, avg_reading_score = (
    merged_df[score_col].mean() for score_col in ('math_score', 'reading_score')
)
print(avg_math_score, avg_reading_score)

78.98537145774827 81.87784018381414


In [144]:
# Share of students with a math score of 70 or higher, as a fraction of total_students
# (a value between 0 and 1; it is rendered as a percentage in the district summary).
perc_pass_math = (merged_df['math_score'] >= 70).sum() / total_students
perc_pass_math

0.749808526933878

In [145]:
# Share of students with a reading score of 70 or higher, as a fraction of total_students
# (a value between 0 and 1; it is rendered as a percentage in the district summary).
perc_pass_read = (merged_df['reading_score'] >= 70).sum() / total_students
perc_pass_read

0.8580546336482001

In [146]:
# Share of students scoring 70 or higher in BOTH reading and math, as a fraction of total_students.
# ge(70) flags each score column, and all(axis=1) keeps only rows where both flags are True.
perc_pass_both = (
    merged_df[['reading_score', 'math_score']].ge(70).all(axis=1).sum() / total_students
)
perc_pass_both

0.6517232575950983

In [147]:
# One-row summary table for the whole district, with values pre-formatted as display strings.
# Average scores are score means, not rates, so they are shown without a '%' suffix;
# only the pass rates (stored as 0-1 fractions) use the '%' format, which multiplies by 100.
# The single record is wrapped in a list so every value can be a plain scalar.
district_summary = pd.DataFrame([{
    'Number of unique schools' : num_unqiue_schools,
    'Total Students' : f'{total_students:,}',
    'Total Budget' : f'${total_budget:,.2f}',
    'Avg Math Score' : f'{avg_math_score:.2f}',
    'Percent Passing Math' : f'{perc_pass_math:.2%}',
    'Avg Reading Score' : f'{avg_reading_score:.2f}',
    'Percent Passing Reading' : f'{perc_pass_read:.2%}',
    'Percent Passing Overall' : f'{perc_pass_both:.2%}'
}])
district_summary

Unnamed: 0,Number of unique schools,Total Students,Total Budget,Avg Math Score,Percent Passing Math,Avg Reading Score,Percent Passing Reading,Percent Passing Overall
0,15,39170,"$24,649,428.00",78.99%,74.98%,81.88%,85.81%,65.17%


### Calculate School Summary Values

In [149]:
# Use the code provided to select the type per school from school_data.
# Built from schools_df (one row per school) rather than merged_df (one row per student),
# so the result has one entry per school name instead of one entry per student.
# NOTE(review): cells that consume school_types are not visible here — confirm none of them
# relied on the per-student length of the previous version.
school_types = schools_df.set_index(["school_name"])["type"]

In [152]:
# Group a deep copy of the merged frame by School ID, so the grouped data is held
# separately from the merged_df object used for the district summary.
groupby_school_ID = merged_df.copy(deep=True).groupby(by='School ID')