In [1]:
# Add the Pandas dependency.
import pandas as pd

In [2]:
# Files to load
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

In [20]:
# Read the school data file and store it in a Pandas DataFrame.
school_data_df = pd.read_csv(school_data_to_load)



In [21]:
# Read the student data file and store it in a Pandas DataFrame.
student_data_df = pd.read_csv(student_data_to_load)



In [5]:
# Determine if there are any missing values in the school data.
school_data_df.count()

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [6]:
school_data_df.notnull().sum()

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [22]:
# Determine data types for the school DataFrame
school_data_df.dtypes

School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [23]:
# Determine data types for the student DataFrame.
student_data_df["grade"].dtypes

dtype('O')

In [9]:
# Add each prefix and suffix to remove to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

In [24]:
# Iterate through the words in the "prefixes_suffixes" list and replace them with an empty space, "".
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word,"")
    

In [11]:
# Combine the data into a single dataset.
school_data_complete_df = pd.merge(student_data_df, school_data_df, on=["school_name", "school_name"])
school_data_complete_df["budget"].sum()

82932329558

In [12]:
school_count = len(school_data_complete_df["school_name"].unique())
school_count

15

In [13]:
student_count = school_data_complete_df["Student ID"].count()
student_count

39170

In [14]:
# Calculate the total budget.
total_budget = school_data_df["budget"].sum()
total_budget

24649428

In [15]:
average_math_score = school_data_complete_df["math_score"].mean()
average_math_score

78.98537145774827

In [16]:
average_reading_score = school_data_complete_df["reading_score"].mean()
average_reading_score

81.87784018381414

In [17]:
passing_math = school_data_complete_df["math_score"] >= 70
passing_reading = school_data_complete_df["reading_score"] >= 70
passing_math

0         True
1        False
2        False
3        False
4         True
         ...  
39165     True
39166     True
39167     True
39168     True
39169     True
Name: math_score, Length: 39170, dtype: bool

In [25]:
# Get all the students who are passing math in a new DataFrame.
passing_math = school_data_complete_df[school_data_complete_df["math_score"] >= 70]
passing_math
# Calculate the number of students passing math.
passing_math_count = passing_math["student_name"].count()
print(passing_math_count)

# Get all the students who are passing reading in a new DataFrame.
passing_reading = school_data_complete_df[school_data_complete_df["reading_score"] >= 70]
passing_math

# Calculate the number of students passing reading.
passing_reading_count = passing_reading["student_name"].count()
print(passing_reading_count)

29370
33610


In [26]:
# Calculate the percent that passed math.
passing_math_percentage = passing_math_count / float(student_count) * 100
print(passing_math_percentage)

# Calculate the percent that passed reading.
passing_reading_percentage = passing_reading_count / float(student_count) * 100
print(passing_reading_percentage)

74.9808526933878
85.80546336482001


In [27]:
passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)]
# Calculate the number of students who passed both math and reading.
overall_passing_math_reading_count = passing_math_reading["student_name"].count()
overall_passing_math_reading_count

# Calculate the overall passing percentage.
overall_passing_percentage = overall_passing_math_reading_count / student_count * 100
overall_passing_percentage

65.17232575950983

In [28]:
# Adding a list of values with keys to create a new DataFrame.
district_summary_df = pd.DataFrame(
          [{"Total Schools": school_count,
          "Total Students": student_count,
          "Total Budget": total_budget,
          "Average Math Score": average_math_score,
          "Average Reading Score": average_reading_score,
          "% Passing Math": passing_math_percentage,
         "% Passing Reading": passing_reading_percentage,
        "% Overall Passing": overall_passing_percentage}])
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [29]:
# Define the function "say_hello" so it prints "Hello!" when called.
def say_hello(string):
    print(string)

pppde = "The quick brown fox jumped over the lazy dog."
say_hello(pppde)

The quick brown fox jumped over the lazy dog.


In [30]:
# Define a function that calculates the percentage of students that passed both # math and reading and prints the passing percentage to the output when the
# function is called.
def passing_math_percent(pass_math_count, student_count):
    return pass_math_count / float(student_count) * 100

passing_math_count = 29370
total_student_count = 39170

# Call the function.
passing_math_percent(passing_math_count, total_student_count)


74.9808526933878

In [31]:
# A list of my grades.
my_grades = ['B', 'C', 'B' , 'D']

# Import pandas.
import pandas as pd
# Convert the my_grades to a Series
my_grades = pd.Series(my_grades)
my_grades



0    B
1    C
2    B
3    D
dtype: object

In [32]:
# Change the grades by one letter grade.
my_grades.map({'B': 'A', 'C': 'B', 'D': 'C'})

0    A
1    B
2    A
3    C
dtype: object

In [33]:
# Using the format() function.
my_grades = [92.34, 84.56, 86.78, 98.32]

for grade in my_grades:
    print("{:.1f}".format(grade))

92.3
84.6
86.8
98.3


In [34]:
# Convert the numerical grades to a Series.
my_grades = pd.Series([92.34, 84.56, 86.78, 78.32])
my_grades

# Format the grades to the nearest whole number percent.
my_grades.map("{:.0f}".format)

0    92
1    85
2    87
3    78
dtype: object

In [35]:
# Format the "Total Students" to have the comma for a thousands separator.
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)

district_summary_df["Total Students"]

0    39,170
Name: Total Students, dtype: object

In [58]:
Format "Total Budget" to have the comma for a thousands separator, a decimal separator, and a "$".

district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)

district_summary_df["Total Budget"]

SyntaxError: invalid syntax (<ipython-input-58-685b00b2fc17>, line 1)

In [57]:
# district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)

# district_summary_df["Total Budget"]

# Format the columns.
#district_summary_df["Average Math Score"] = district_summary_df["Average Math Score"].map("{:.1f}".format)

# district_summary_df["Average Reading Score"] = district_summary_df["Average Reading Score"].map("{:.1f}".format)

# district_summary_df["% Passing Math"] = district_summary_df["% Passing Math"].map("{:.0f}".format)

# district_summary_df["% Passing Reading"] = district_summary_df["% Passing Reading"].map("{:.0f}".format)

# district_summary_df["% Overall Passing"] = district_summary_df["% Overall Passing"].map("{:.0f}".format)

ValueError: Unknown format code 'f' for object of type 'str'

In [48]:
district_summary_df

Unnamed: 0,Total Schools,Total Schools.1,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,15,39170,"$24,649,428.00",79.0,81.9,75,86,65


In [50]:
# Reorder the columns in the order you want them to appear.
new_column_order = ["Total Schools", "Total Students", "Total Budget","Average Math Score", "Average Reading Score", "% Passing Math", "% Passing Reading", "% Overall Passing"]

# Assign district summary df the new column order.
district_summary_df = district_summary_df[new_column_order]
district_summary_df

Unnamed: 0,Total Schools,Total Schools.1,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,15,39170,"$24,649,428.00",79.0,81.9,75,86,65
