In [1]:
# Dependencies and Setup
import pandas as pd


# File to Load (Remember to change the path if needed.)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the School Data and Student Data and store into a Pandas DataFrame
school_data_df = pd.read_csv(school_data_to_load)
student_data_df = pd.read_csv(student_data_to_load)

# Cleaning Student Names and Replacing Substrings in a Python String
# Add each prefix and suffix to remove to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Iterate through the words in the "prefixes_suffixes" list and replace them with an empty space, "".
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word,"")

# Check names.
student_data_df.head(10)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
7,7,Nicole Baker,F,12th,Huang High School,96,69
8,8,Michael Roth,M,10th,Huang High School,95,87
9,9,Matthew Greene,M,10th,Huang High School,96,84


In [2]:
# Combine the data into a single dataset
school_data_complete_df = pd.merge(student_data_df, school_data_df, how="left", on=["school_name", "school_name"])
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


## Deliverable 1: Replace the reading and math scores.

### Replace the 9th grade reading and math scores at Thomas High School with NaN.

In [3]:
# Original District Summary data frame
# Get the total number of students.
# Get the total number of students.

# Calculate the total number of schools.
school_count = school_data_df["school_name"].count()
# Calculate the total number of schools.
student_count = student_data_df["student_name"].count()
# Calculate the total budget.
total_budget = school_data_df["budget"].sum()
# Calculate the average reading score.
average_reading_score = school_data_complete_df["reading_score"].mean()
# Calculate the average math score.
average_math_score = school_data_complete_df["math_score"].mean()
# Get all the students who are passing math in a new DataFrame.
passing_math = school_data_complete_df[school_data_complete_df["math_score"] >= 70]
# Get all the students that are passing reading in a new DataFrame.
passing_reading = school_data_complete_df[school_data_complete_df["reading_score"] >= 70]
# Calculate the number of students passing math.
passing_math_count = passing_math["student_name"].count()
# Calculate the number of students passing reading.
passing_reading_count = passing_reading["student_name"].count()
# Calculate the percent that passed math.
passing_math_percentage = passing_math_count / float(student_count) * 100
# Calculate the percent that passed reading.
passing_reading_percentage = passing_reading_count / float(student_count) * 100
# Calculate the students who passed both math and reading.
passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)]
# Calculate the number of students who passed both math and reading.
overall_passing_math_reading_count = passing_math_reading["student_name"].count()
# Calculate the overall passing percentage.
overall_passing_percentage = overall_passing_math_reading_count / student_count * 100

# Adding a list of values with keys to create a new DataFrame.
district_summary_df = pd.DataFrame(
          [{"Total Schools": school_count,
          "Total Students": student_count,
          "Total Budget": total_budget,
          "Average Math Score": average_math_score,
          "Average Reading Score": average_reading_score,
          "% Passing Math": passing_math_percentage,
         "% Passing Reading": passing_reading_percentage,
        "% Overall Passing": overall_passing_percentage}])

# Define a function that calculates the percentage of students that passed both # math and reading and prints the passing percentage to the output when the
# function is called.
def passing_math_percent(pass_math_count, student_count):
    return pass_math_count / float(student_count) * 100
# Format the "Total Students" to have the comma for a thousands separator.
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)
# Format "Total Budget" to have the comma for a thousands separator, a decimal separator, and a "$"
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
# Format the columns.
district_summary_df["Average Math Score"] = district_summary_df["Average Math Score"].map("{:.1f}".format)
district_summary_df["Average Reading Score"] = district_summary_df["Average Reading Score"].map("{:.1f}".format)
district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map("{:.0f}".format)
district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.0f}".format)
district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map("{:.0f}".format)
# Reorder the columns in the order you want them to appear.
new_column_order = ["Total Schools", "Total Students", "Total Budget","Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing"]

# Assign district summary df the new column order.
district_summary_df = district_summary_df[new_column_order]
district_summary_df


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65


In [4]:
# Install numpy using conda install numpy or pip install numpy. 
# Step 1. Import numpy as np.
import numpy as np

In [5]:
# Step 2. Use the loc method on the student_data_df to select all the 
#reading scores from the 9th grade at Thomas High School and replace them with NaN.

school_data_complete_df.loc[(school_data_complete_df["school_name"] == "Thomas High School") & (school_data_complete_df["grade"] == "9th"),
                            ["reading_score"]] = np.nan

                                              



In [6]:
school_data_complete_df.tail(20)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
39150,39150,Jennifer Hamilton,F,11th,Thomas High School,80.0,75,14,Charter,1635,1043130
39151,39151,Shannon Williams,F,10th,Thomas High School,84.0,73,14,Charter,1635,1043130
39152,39152,Lori Moore,F,9th,Thomas High School,,84,14,Charter,1635,1043130
39153,39153,William Hubbard,M,9th,Thomas High School,,75,14,Charter,1635,1043130
39154,39154,Bradley Johnson,M,12th,Thomas High School,91.0,71,14,Charter,1635,1043130
39155,39155,John Brooks,M,10th,Thomas High School,92.0,98,14,Charter,1635,1043130
39156,39156,Stephanie Contreras,F,11th,Thomas High School,79.0,95,14,Charter,1635,1043130
39157,39157,Kristen Gonzalez,F,9th,Thomas High School,,94,14,Charter,1635,1043130
39158,39158,Kari Holloway,F,10th,Thomas High School,87.0,90,14,Charter,1635,1043130
39159,39159,Kimberly Cabrera,F,11th,Thomas High School,85.0,72,14,Charter,1635,1043130


In [7]:
#  Step 3. Refactor the code in Step 2 to replace the math scores with NaN.
school_data_complete_df.loc[(school_data_complete_df["school_name"] == "Thomas High School") & (school_data_complete_df["grade"] == "9th"),
                            ["math_score"]] = np.nan

                                              

In [8]:
#  Step 4. Check the student data for NaN's. 
school_data_complete_df.tail(20)


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
39150,39150,Jennifer Hamilton,F,11th,Thomas High School,80.0,75.0,14,Charter,1635,1043130
39151,39151,Shannon Williams,F,10th,Thomas High School,84.0,73.0,14,Charter,1635,1043130
39152,39152,Lori Moore,F,9th,Thomas High School,,,14,Charter,1635,1043130
39153,39153,William Hubbard,M,9th,Thomas High School,,,14,Charter,1635,1043130
39154,39154,Bradley Johnson,M,12th,Thomas High School,91.0,71.0,14,Charter,1635,1043130
39155,39155,John Brooks,M,10th,Thomas High School,92.0,98.0,14,Charter,1635,1043130
39156,39156,Stephanie Contreras,F,11th,Thomas High School,79.0,95.0,14,Charter,1635,1043130
39157,39157,Kristen Gonzalez,F,9th,Thomas High School,,,14,Charter,1635,1043130
39158,39158,Kari Holloway,F,10th,Thomas High School,87.0,90.0,14,Charter,1635,1043130
39159,39159,Kimberly Cabrera,F,11th,Thomas High School,85.0,72.0,14,Charter,1635,1043130


## Deliverable 2 : Repeat the school district analysis

### District Summary

In [9]:
# Calculate the Totals (Schools and Students)
school_count = len(school_data_complete_df["school_name"].unique())
student_count = school_data_complete_df["Student ID"].count()

# Calculate the Total Budget
total_budget = school_data_df["budget"].sum()

In [10]:
# Calculate the Average Scores using the "clean_student_data".
average_reading_score = school_data_complete_df["reading_score"].mean()
average_math_score = school_data_complete_df["math_score"].mean()

In [11]:
# Step 1. Get the number of students that are in ninth grade at Thomas High School.
# These students have no grades. 
no_grades= school_data_complete_df["reading_score"].count()
no_grades
# Get the total student count 
student_count = school_data_complete_df["Student ID"].count()
student_count

# Step 2. Subtract the number of students that are in ninth grade at 
# Thomas High School from the total student count to get the new total student count.
Thomas_9th_count= student_count-no_grades
Thomas_9th_count
new_student_count= student_count- Thomas_9th_count
new_student_count

38709

In [12]:
# Calculate the passing rates using the "clean_student_data".
passing_math_count = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)].count()["student_name"]
passing_reading_count = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)].count()["student_name"]

In [13]:
# Step 3. Calculate the passing percentages with the new total student count.
# Calculate the percent that passed math.
passing_math_percentage = passing_math_count / float(new_student_count) * 100

# Calculate the percent that passed reading.
passing_reading_percentage = passing_reading_count / float(new_student_count) * 100

In [14]:
# Calculate the students who passed both reading and math.
passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)
                                               & (school_data_complete_df["reading_score"] >= 70)]

# Calculate the number of students that passed both reading and math.
overall_passing_math_reading_count = passing_math_reading["student_name"].count()


# Step 4.Calculate the overall passing percentage with new total student count.
passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)]
overall_passing_math_reading_count = passing_math_reading["student_name"].count()
overall_passing_percentage = overall_passing_math_reading_count / new_student_count * 100

In [36]:
# Create a DataFrame
district_summary_df = pd.DataFrame(
          [{"Total Schools": school_count, 
          "Total Students": student_count, 
          "Total Budget": total_budget,
          "Average Math Score": average_math_score, 
          "Average Reading Score": average_reading_score,
          "% Passing Math": passing_math_percentage,
         "% Passing Reading": passing_reading_percentage,
        "% Overall Passing": overall_passing_percentage}])



# Format the "Total Students" to have the comma for a thousands separator.
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)
# Format the "Total Budget" to have the comma for a thousands separator, a decimal separator and a "$".
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
# Format the columns.
district_summary_df["Average Math Score"] = district_summary_df["Average Math Score"].map("{:.1f}".format)
district_summary_df["Average Reading Score"] = district_summary_df["Average Reading Score"].map("{:.1f}".format)
district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map("{:.1f}".format)
district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.1f}".format)
district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map("{:.1f}".format)

# Display the data frame
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.9,81.9,74.8,85.7,64.9


##  School Summary

In [38]:
# Determine the School Type
per_school_types = school_data_df.set_index(["school_name"])["type"]

# Calculate the total student count.
per_school_counts = school_data_complete_df["school_name"].value_counts()

# Calculate the total school budget and per capita spending
per_school_budget = school_data_complete_df.groupby(["school_name"]).mean()["budget"]
# Calculate the per capita spending.
per_school_capita = per_school_budget / per_school_counts

# Calculate the average test scores.
per_school_math = school_data_complete_df.groupby(["school_name"]).mean()["math_score"]
per_school_reading = school_data_complete_df.groupby(["school_name"]).mean()["reading_score"]

# Calculate the passing scores by creating a filtered DataFrame.
per_school_passing_math = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)]
per_school_passing_reading = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)]

# Calculate the number of students passing math and passing reading by school.
per_school_passing_math = per_school_passing_math.groupby(["school_name"]).count()["student_name"]
per_school_passing_reading = per_school_passing_reading.groupby(["school_name"]).count()["student_name"]

# Calculate the percentage of passing math and reading scores per school.
per_school_passing_math = per_school_passing_math / per_school_counts * 100
per_school_passing_reading = per_school_passing_reading / per_school_counts * 100

# Calculate the students who passed both reading and math.
per_passing_math_reading = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)
                                               & (school_data_complete_df["math_score"] >= 70)]

# Calculate the number of students passing math and passing reading by school.
per_passing_math_reading = per_passing_math_reading.groupby(["school_name"]).count()["student_name"]

# Calculate the percentage of passing math and reading scores per school.
per_overall_passing_percentage = per_passing_math_reading / per_school_counts * 100

In [39]:
# Create the DataFrame
per_school_summary_df = pd.DataFrame({
    "School Type": per_school_types,
    "Total Students": per_school_counts,
    "Total School Budget": per_school_budget,
    "Per Student Budget": per_school_capita,
    "Average Math Score": per_school_math,
    "Average Reading Score": per_school_reading,
    "% Passing Math": per_school_passing_math,
    "% Passing Reading": per_school_passing_reading,
    "% Overall Passing": per_overall_passing_percentage})


# per_school_summary_df.head()

In [40]:
# Format the Total School Budget and the Per Student Budget
per_school_summary_df["Total School Budget"] = per_school_summary_df["Total School Budget"].map("${:,.2f}".format)
per_school_summary_df["Per Student Budget"] = per_school_summary_df["Per Student Budget"].map("${:,.2f}".format)

# Display the data frame
per_school_summary_df

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064,81.93328,54.642283
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,91.334769
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,53.204476
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,54.289887
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,90.599455
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967,80.862999,53.527508
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.505855,96.252927,89.227166
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,53.513884
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,53.539172
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,90.540541


In [41]:
# Step 5.  Get the number of 10th-12th graders from Thomas High School (THS).
school_data_complete_df.loc[(school_data_complete_df["school_name"] == "Thomas High School") & (school_data_complete_df["grade"] == "9th"),
                            ["math_score"]] = np.nan



ths_loc= school_data_complete_df.loc[(school_data_complete_df["school_name"] == "Thomas High School") & 
                                     (school_data_complete_df["grade"] != "9th")]
                            


ths_loc.head()
ths_loc_df= pd.DataFrame(ths_loc)
ths_loc_group_df= ths_loc.groupby(['grade'])
ths_loc_group_df.count()

Unnamed: 0_level_0,Student ID,student_name,gender,school_name,reading_score,math_score,School ID,type,size,budget
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10th,421,421,421,421,421,421,421,421,421,421
11th,415,415,415,415,415,415,415,415,415,415
12th,338,338,338,338,338,338,338,338,338,338


In [42]:
#Count of THS 10-12th graders
ths_student_count = ths_loc_df["student_name"].count()
ths_student_count

1174

In [43]:
# Step 6. Get all the students passing math from THS
ths_passing_math = ths_loc_df[ths_loc_df["math_score"] >= 70]
ths_passing_math.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
37535,37535,Norma Mata,F,10th,Thomas High School,76.0,76.0,14,Charter,1635,1043130
37536,37536,Cody Miller,M,11th,Thomas High School,84.0,82.0,14,Charter,1635,1043130
37541,37541,Eric Stevens,M,10th,Thomas High School,80.0,76.0,14,Charter,1635,1043130
37542,37542,Elizabeth Bennett,F,11th,Thomas High School,91.0,94.0,14,Charter,1635,1043130
37544,37544,Jacqueline Harris,F,10th,Thomas High School,71.0,92.0,14,Charter,1635,1043130


In [44]:
# Step 7. Get all the students passing reading from THS
ths_passing_reading = ths_loc_df[ths_loc_df["reading_score"] >= 70]
ths_passing_reading.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
37535,37535,Norma Mata,F,10th,Thomas High School,76.0,76.0,14,Charter,1635,1043130
37536,37536,Cody Miller,M,11th,Thomas High School,84.0,82.0,14,Charter,1635,1043130
37541,37541,Eric Stevens,M,10th,Thomas High School,80.0,76.0,14,Charter,1635,1043130
37542,37542,Elizabeth Bennett,F,11th,Thomas High School,91.0,94.0,14,Charter,1635,1043130
37544,37544,Jacqueline Harris,F,10th,Thomas High School,71.0,92.0,14,Charter,1635,1043130


In [45]:
# Step 8. Get all the students passing math and reading from THS
ths_passing_math_reading = ths_loc_df[(ths_loc_df["math_score"] >= 70) & (ths_loc_df["reading_score"] >= 70)]

ths_passing_math_reading.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
37535,37535,Norma Mata,F,10th,Thomas High School,76.0,76.0,14,Charter,1635,1043130
37536,37536,Cody Miller,M,11th,Thomas High School,84.0,82.0,14,Charter,1635,1043130
37541,37541,Eric Stevens,M,10th,Thomas High School,80.0,76.0,14,Charter,1635,1043130
37542,37542,Elizabeth Bennett,F,11th,Thomas High School,91.0,94.0,14,Charter,1635,1043130
37544,37544,Jacqueline Harris,F,10th,Thomas High School,71.0,92.0,14,Charter,1635,1043130


In [46]:
# Step 9. Calculate the percentage of 10th-12th grade students passing math from Thomas High School. 
# Calculate the number of students passing math.
ths_passing_math_count = ths_passing_math["student_name"].count()
ths_passing_math_percentage = ths_passing_math_count / float(ths_student_count) * 100
print(ths_passing_math_percentage)

93.18568994889267


In [47]:
# Step 10. Calculate the percentage of 10th-12th grade students passing reading from Thomas High School.
# Calculate the number of students passing reading.
ths_passing_reading_count = ths_passing_reading["student_name"].count()
ths_passing_reading_percentage = ths_passing_reading_count / float(ths_student_count) * 100
print(ths_passing_reading_percentage)

97.01873935264055


In [48]:
# Calculate the number of students who passed both math and reading.
ths_overall_passing_math_reading_count = ths_passing_math_reading["student_name"].count()
ths_overall_passing_math_reading_count


1064

In [49]:
# Step 11. Calculate the overall passing percentage of 10th-12th grade from Thomas High School. 
ths_overall_passing_percentage = ths_overall_passing_math_reading_count / ths_student_count * 100
ths_overall_passing_percentage


90.63032367972743

In [50]:
# Step 12. Replace the passing math percent for Thomas High School in the per_school_summary_df.
per_school_summary_df.loc["Thomas High School", "% Passing Math"] = '93.1856'

per_school_summary_df


#school_data_complete_df.loc[(school_data_complete_df["school_name"] == "Thomas High School") 

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.6801,81.93328,54.642283
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.1335,97.039828,91.334769
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.9885,80.739234,53.204476
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.3096,79.299014,54.289887
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.3924,97.138965,90.599455
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.753,80.862999,53.527508
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.5059,96.252927,89.227166
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.6839,81.316421,53.513884
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.0576,81.222432,53.539172
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.5946,95.945946,90.540541


In [51]:
# Step 13. Replace the passing reading percentage for Thomas High School in the per_school_summary_df.
per_school_summary_df.loc["Thomas High School", "% Passing Reading"] = '97.0187'
per_school_summary_df

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.6801,81.9333,54.642283
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.1335,97.0398,91.334769
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.9885,80.7392,53.204476
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.3096,79.299,54.289887
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.3924,97.139,90.599455
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.753,80.863,53.527508
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.5059,96.2529,89.227166
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.6839,81.3164,53.513884
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.0576,81.2224,53.539172
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.5946,95.9459,90.540541


In [52]:
# Step 14. Replace the overall passing percentage for Thomas High School in the per_school_summary_df.
per_school_summary_df.loc["Thomas High School", "% Overall Passing"] = '90.6303'
per_school_summary_df

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.6801,81.9333,54.6423
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.1335,97.0398,91.3348
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.9885,80.7392,53.2045
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.3096,79.299,54.2899
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.3924,97.139,90.5995
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.753,80.863,53.5275
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.5059,96.2529,89.2272
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.6839,81.3164,53.5139
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.0576,81.2224,53.5392
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.5946,95.9459,90.5405


In [53]:
# per_school_summary_df
# Create the DataFrame
per_school_summary_df = pd.DataFrame({
    "School Type": per_school_types,
    "Total Students": per_school_counts,
    "Total School Budget": per_school_budget,
    "Per Student Budget": per_school_capita,
    "Average Math Score": per_school_math,
    "Average Reading Score": per_school_reading,
    "% Passing Math": per_school_passing_math,
    "% Passing Reading": per_school_passing_reading,
    "% Overall Passing": per_overall_passing_percentage})
# Format the Total School Budget and the Per Student Budget
per_school_summary_df["Total School Budget"] = per_school_summary_df["Total School Budget"].map("${:,.2f}".format)
per_school_summary_df["Per Student Budget"] = per_school_summary_df["Per Student Budget"].map("${:,.2f}".format)

# Display the data frame
per_school_summary_df

# per_school_summary_df.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064,81.93328,54.642283
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,91.334769
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,53.204476
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,54.289887
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,90.599455
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967,80.862999,53.527508
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.505855,96.252927,89.227166
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,53.513884
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,53.539172
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,90.540541


## High and Low Performing Schools 

In [35]:
# Sort and show top five schools.
per_school_summary_df.sort_values(by="% Overall Passing", ascending=False).head(5)

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,91.334769
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,90.599455
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,93.867718,96.539641,90.582567
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,90.540541
Wright High School,Charter,1800,"$1,049,400.00",$583.00,83.682222,83.955,93.333333,96.611111,90.333333


In [36]:
# Sort and show bottom five schools.
per_school_summary_df.sort_values(by="% Overall Passing", ascending=False).tail(5)

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,53.539172
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967,80.862999,53.527508
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,53.513884
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,53.204476
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.842711,80.744686,66.366592,80.220055,52.988247


## Math and Reading Scores by Grade

In [46]:
# Create a Series of scores by grade levels using conditionals.
grades_schools =  school_data_complete_df[["school_name", "grade", "reading_score", "math_score"]]

school_grades_group= grades_schools.groupby(["school_name", "grade"]).mean()

# Group each school Series by the school name for the average math score.


# Group each school Series by the school name for the average reading score.


In [47]:
# Combine each Series for average math scores by school into single data frame.
school_grades_group_df=pd.DataFrame(school_grades_group)

In [52]:
# Combine each Series for average reading scores by school into single data frame.
school_grades_group_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reading_score,math_score
school_name,grade,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,10th,80.9,77.0
Bailey High School,11th,80.9,77.5
Bailey High School,12th,80.9,76.5
Bailey High School,9th,81.3,77.1
Cabrera High School,10th,84.3,83.2


In [54]:
# Remove the index.

school_grades_group_df
# Display the data frame


Unnamed: 0_level_0,Unnamed: 1_level_0,reading_score,math_score
school_name,grade,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,10th,80.9,77.0
Bailey High School,11th,80.9,77.5
Bailey High School,12th,80.9,76.5
Bailey High School,9th,81.3,77.1
Cabrera High School,10th,84.3,83.2
Cabrera High School,11th,83.8,82.8
Cabrera High School,12th,84.3,83.3
Cabrera High School,9th,83.7,83.1
Figueroa High School,10th,81.4,76.5
Figueroa High School,11th,80.6,76.9


In [36]:
## Remove the index.


# Display the data frame


## Scores by School Spending

In [96]:
# Establish the spending bins and group names.
#per_school_summary_df.sort_values(by="Total School Budget", ascending=False)
#print(per_school_summary_df["Total School Budget"].max())
#print(per_school_summary_df["Total School Budget"].min())
per_school_summary_df.dtypes.value_counts()
#per_school_summary_df["Total School Budget"].astype('str')
#https://pbpython.com/currency-cleanup.html
def clean_currency(x):
    """ If the value is a string, then remove currency symbol and delimiters
    otherwise, the value is numeric and can be converted
    """
    if isinstance(x, str):
        return(x.replace('$', '').replace(',', ''))
    return(x)

per_school_summary_df['Total School Budget'] = per_school_summary_df['Total School Budget'].apply(clean_currency).astype('float')
per_school_summary_df['Total School Budget_type'] = per_school_summary_df['Total School Budget'].apply(lambda x: type(x).__name__)
per_school_summary_df['Total School Budget'].dtypes
# Create bins in which to place values based upon TED Talk views
bins = [0,499999, 999999,1999999,2999999, 4000000]
# Create labels for these bins
group_labels = ["0 to 499k", "500k to 999k","1mil to 2mil", "2mil to 3mil", "3mil to 4mil"]


# Categorize spending based on the bins.
pd.cut(per_school_summary_df["Total School Budget"], bins=bins, labels=group_labels).head()
per_school_summary_df["View Group"] = pd.cut(per_school_summary_df["Total School Budget"], bins, labels=group_labels)
per_school_summary_df

Bailey High School      3mil to 4mil
Cabrera High School     1mil to 2mil
Figueroa High School    1mil to 2mil
Ford High School        1mil to 2mil
Griffin High School     500k to 999k
Name: Total School Budget, dtype: category
Categories (5, object): ['0 to 499k' < '500k to 999k' < '1mil to 2mil' < '2mil to 3mil' < '3mil to 4mil']

In [106]:
# Calculate averages for the desired columns. 

total_budget_bins= per_school_summary_df.groupby("View Group")
print(total_budget_bins["Average Math Score"].count())
total_budget_bins[["Average Math Score", "Average Reading Score"]].mean()

View Group
0 to 499k       1
500k to 999k    2
1mil to 2mil    8
2mil to 3mil    1
3mil to 4mil    3
Name: Average Math Score, dtype: int64


Unnamed: 0_level_0,Average Math Score,Average Reading Score
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1
0 to 499k,83.803279,83.814988
500k to 999k,83.595708,83.930728
1mil to 2mil,80.89656,82.828634
2mil to 3mil,76.842711,80.744686
3mil to 4mil,77.136883,80.978256


In [129]:
# Create the DataFrame
total_budget_bins_averages=total_budget_bins.mean()
total_budget_bins_averages

Unnamed: 0_level_0,Total Students,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0 to 499k,427.0,248087.0,83.803279,83.814988,92.505855,96.252927,89.227166
500k to 999k,1215.0,751679.0,83.595708,83.930728,93.993483,96.542455,90.569998
1mil to 2mil,2242.75,1388628.0,80.89656,82.828634,80.26187,87.132936,73.528434
2mil to 3mil,3999.0,2547363.0,76.842711,80.744686,66.366592,80.220055,52.988247
3mil to 4mil,4790.666667,3080533.0,77.136883,80.978256,66.496861,81.33957,53.902988


In [130]:
# Format the DataFrame 
total_budget_bins_averages_df=pd.DataFrame(total_budget_bins_averages)
total_budget_bins_averages_df


total_budget_bins_averages_df["Total Students"] = total_budget_bins_averages_df["Total Students"].map("{:,.1f}".format)
# Format the "Total Budget" to have the comma for a thousands separator, a decimal separator and a "$".
total_budget_bins_averages_df["Total School Budget"] = total_budget_bins_averages_df["Total School Budget"].map("${:,.2f}".format)
# Format the columns.
total_budget_bins_averages_df["Average Math Score"] = total_budget_bins_averages_df["Average Math Score"].map("{:.1f}".format)
total_budget_bins_averages_df["Average Reading Score"] = total_budget_bins_averages_df["Average Reading Score"].map("{:.1f}".format)
total_budget_bins_averages_df["% Passing Math"] = total_budget_bins_averages_df["% Passing Math"].map("{:.1f}".format)
total_budget_bins_averages_df["% Passing Reading"] = total_budget_bins_averages_df["% Passing Reading"].map("{:.1f}".format)
total_budget_bins_averages_df["% Overall Passing"] = total_budget_bins_averages_df["% Overall Passing"].map("{:.1f}".format)

total_budget_bins_averages_df

Unnamed: 0_level_0,Total Students,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0 to 499k,427.0,"$248,087.00",83.8,83.8,92.5,96.3,89.2
500k to 999k,1215.0,"$751,679.00",83.6,83.9,94.0,96.5,90.6
1mil to 2mil,2242.8,"$1,388,627.75",80.9,82.8,80.3,87.1,73.5
2mil to 3mil,3999.0,"$2,547,363.00",76.8,80.7,66.4,80.2,53.0
3mil to 4mil,4790.7,"$3,080,532.67",77.1,81.0,66.5,81.3,53.9


## Scores by School Size

In [133]:
# Establish the bins.
print(per_school_summary_df["Total Students"].max())
print(per_school_summary_df["Total Students"].min())

# Create bins in which to place values based upon TED Talk views
bins = [0,499, 999,1999,2999, 3999, 5000]
# Create labels for these bins
group_labels = ["0 to 499", "500 to 999k","1k to 2k", "2k to 3k", "3k to 4k", "4k to 5k"]


# Categorize spending based on the bins.
pd.cut(per_school_summary_df["Total Students"], bins=bins, labels=group_labels).head()
per_school_summary_df["View Group"] = pd.cut(per_school_summary_df["Total Students"], bins, labels=group_labels)
per_school_summary_df
# Categorize spending based on the bins.


4976
427


Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing,Total School Budget_type,View Group
Bailey High School,District,4976,3124928.0,$628.00,77.048432,81.033963,66.680064,81.93328,54.642283,float,4k to 5k
Cabrera High School,Charter,1858,1081356.0,$582.00,83.061895,83.97578,94.133477,97.039828,91.334769,float,1k to 2k
Figueroa High School,District,2949,1884411.0,$639.00,76.711767,81.15802,65.988471,80.739234,53.204476,float,2k to 3k
Ford High School,District,2739,1763916.0,$644.00,77.102592,80.746258,68.309602,79.299014,54.289887,float,2k to 3k
Griffin High School,Charter,1468,917500.0,$625.00,83.351499,83.816757,93.392371,97.138965,90.599455,float,1k to 2k
Hernandez High School,District,4635,3022020.0,$652.00,77.289752,80.934412,66.752967,80.862999,53.527508,float,4k to 5k
Holden High School,Charter,427,248087.0,$581.00,83.803279,83.814988,92.505855,96.252927,89.227166,float,0 to 499
Huang High School,District,2917,1910635.0,$655.00,76.629414,81.182722,65.683922,81.316421,53.513884,float,2k to 3k
Johnson High School,District,4761,3094650.0,$650.00,77.072464,80.966394,66.057551,81.222432,53.539172,float,4k to 5k
Pena High School,Charter,962,585858.0,$609.00,83.839917,84.044699,94.594595,95.945946,90.540541,float,500 to 999k


In [135]:
# Calculate averages for the desired columns. 
# Calculate averages for the desired columns. 

total_size_bins= per_school_summary_df.groupby("View Group")
print(total_size_bins["Average Math Score"].count())
total_size_bins[["Total School Budget", "Average Math Score", "Average Reading Score"]].mean()

View Group
0 to 499       1
500 to 999k    1
1k to 2k       5
2k to 3k       4
3k to 4k       1
4k to 5k       3
Name: Average Math Score, dtype: int64


Unnamed: 0_level_0,Total School Budget,Average Math Score,Average Reading Score
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 to 499,248087.0,83.803279,83.814988
500 to 999k,585858.0,83.839917,84.044699
1k to 2k,1029597.0,83.361201,83.873869
2k to 3k,1719634.0,78.429493,81.769122
3k to 4k,2547363.0,76.842711,80.744686
4k to 5k,3080533.0,77.136883,80.978256


In [137]:
# Assemble into DataFrame. 
total_size_bins_averages=total_size_bins.mean()
total_size_bins_averages

Unnamed: 0_level_0,Total Students,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0 to 499,427.0,248087.0,83.803279,83.814988,92.505855,96.252927,89.227166
500 to 999k,962.0,585858.0,83.839917,84.044699,94.594595,95.945946,90.540541
1k to 2k,1704.4,1029597.0,83.361201,83.873869,88.327523,91.261628,85.447223
2k to 3k,2722.0,1719634.0,78.429493,81.769122,73.462428,84.473577,62.897703
3k to 4k,3999.0,2547363.0,76.842711,80.744686,66.366592,80.220055,52.988247
4k to 5k,4790.666667,3080533.0,77.136883,80.978256,66.496861,81.33957,53.902988


In [138]:
# Format the DataFrame  
# Format the DataFrame 
total_size_bins_averages_df=pd.DataFrame(total_size_bins_averages)
total_size_bins_averages_df


total_size_bins_averages_df["Total Students"] = total_size_bins_averages_df["Total Students"].map("{:,.1f}".format)
# Format the "Total Budget" to have the comma for a thousands separator, a decimal separator and a "$".
total_size_bins_averages_df["Total School Budget"] = total_size_bins_averages_df["Total School Budget"].map("${:,.2f}".format)
# Format the columns.
total_size_bins_averages_df["Average Math Score"] = total_size_bins_averages_df["Average Math Score"].map("{:.1f}".format)
total_size_bins_averages_df["Average Reading Score"] = total_size_bins_averages_df["Average Reading Score"].map("{:.1f}".format)
total_size_bins_averages_df["% Passing Math"] = total_size_bins_averages_df["% Passing Math"].map("{:.1f}".format)
total_size_bins_averages_df["% Passing Reading"] = total_size_bins_averages_df["% Passing Reading"].map("{:.1f}".format)
total_size_bins_averages_df["% Overall Passing"] = total_size_bins_averages_df["% Overall Passing"].map("{:.1f}".format)

total_size_bins_averages_df


Unnamed: 0_level_0,Total Students,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0 to 499,427.0,"$248,087.00",83.8,83.8,92.5,96.3,89.2
500 to 999k,962.0,"$585,858.00",83.8,84.0,94.6,95.9,90.5
1k to 2k,1704.4,"$1,029,597.20",83.4,83.9,88.3,91.3,85.4
2k to 3k,2722.0,"$1,719,634.00",78.4,81.8,73.5,84.5,62.9
3k to 4k,3999.0,"$2,547,363.00",76.8,80.7,66.4,80.2,53.0
4k to 5k,4790.7,"$3,080,532.67",77.1,81.0,66.5,81.3,53.9


## Scores by School Type

In [140]:
# Calculate averages for the desired columns. 


school_type= per_school_summary_df.groupby(["School Type"]).mean()
school_type

Unnamed: 0_level_0,Total Students,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Charter,1524.25,912688.1,83.465425,83.902315,90.325723,93.130832,87.198299
District,3853.714286,2478275.0,76.956733,80.966636,66.548453,80.799062,53.672208


In [141]:
# Assemble into DataFrame. 
school_type_df=pd.DataFrame(school_type)


In [142]:
# # Format the DataFrame 
school_type_df["Total Students"] = school_type_df["Total Students"].map("{:,.1f}".format)
# Format the "Total Budget" to have the comma for a thousands separator, a decimal separator and a "$".
school_type_df["Total School Budget"] = school_type_df["Total School Budget"].map("${:,.2f}".format)
# Format the columns.
school_type_df["Average Math Score"] = school_type_df["Average Math Score"].map("{:.1f}".format)
school_type_df["Average Reading Score"] = school_type_df["Average Reading Score"].map("{:.1f}".format)
school_type_df["% Passing Math"] = school_type_df["% Passing Math"].map("{:.1f}".format)
school_type_df["% Passing Reading"] = school_type_df["% Passing Reading"].map("{:.1f}".format)
school_type_df["% Overall Passing"] = school_type_df["% Overall Passing"].map("{:.1f}".format)

school_type_df

Unnamed: 0_level_0,Total Students,Total School Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Charter,1524.2,"$912,688.12",83.5,83.9,90.3,93.1,87.2
District,3853.7,"$2,478,274.71",77.0,81.0,66.5,80.8,53.7


In [None]:
# python modules and packages https://www.pythonlikeyoumeanit.com/Module5_OddsAndEnds/Modules_and_Packages.html
# reading csv files into pandas dataframe https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
# .head() https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html
# .tail() https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isnull.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.notnull.html
#loc https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.loc.html
# merging https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html
# merge, join https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
#unique https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.unique.html
#format examples https://docs.python.org/3.4/library/string.html#format-examples
#format spec https://docs.python.org/3.4/library/string.html#format-specification-mini-language
# indexing https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.set_index.html
# count https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.value_counts.html
# groupby https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html
# sort values https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
# string splitting https://docs.python.org/3/library/stdtypes.html#string-methods
# sort and head https://stackoverflow.com/questions/49632059/pandas-sort-values-to-get-top-5-for-each-column-in-a-groupby