### Student Performance in Exams Analysis

In [1]:
import os
import sys
import pprint
from pathlib import Path
from matplotlib import pyplot as plt

# Define project root dynamically, gets the current directory from whick the notebook belongs and moves one level upper
project_root = Path.cwd().parent

# Add src to sys.path if it is not already
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import function directly (more controlled than import *)
from src import *

# Build route to CSV and upload
csv_path = project_root / "Data" / "Raw" / "StudentsPerformance.csv"
df_students = load_csv(str(csv_path))

In [2]:
df_students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [3]:
df_students.sample(25)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
658,female,group D,associate's degree,free/reduced,none,43,60,58
754,male,group C,associate's degree,free/reduced,none,58,55,53
503,female,group E,associate's degree,standard,completed,95,89,92
289,male,group E,some high school,standard,completed,77,76,77
744,male,group B,some college,free/reduced,none,55,55,47
981,male,group D,some high school,standard,none,81,78,78
942,male,group C,high school,standard,none,81,66,64
361,male,group B,some high school,standard,completed,85,84,78
757,male,group E,bachelor's degree,free/reduced,completed,70,68,72
404,female,group C,high school,standard,none,54,59,62


In [4]:
df_students = normalize_columns_header(df_students)
print(df_students.columns)

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')


In [5]:
df_students = normalize_string(df_students)
df_students.sample(25)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
529,female,group_c,associate's_degree,standard,none,62,74,70
742,female,group_c,high_school,standard,none,81,84,82
254,male,group_d,high_school,standard,none,54,52,52
205,male,group_d,some_high_school,standard,completed,74,71,78
489,male,group_a,associate's_degree,free_reduced,completed,79,82,82
309,female,group_d,high_school,free_reduced,none,49,57,52
449,male,group_b,associate's_degree,standard,none,81,73,72
971,male,group_c,some_high_school,standard,completed,78,72,69
460,male,group_c,bachelor's_degree,free_reduced,none,53,58,55
468,female,group_a,high_school,free_reduced,completed,77,88,85


In [6]:
print(f"Number of explicit duplicates: ",df_students.duplicated().sum())


Number of explicit duplicates:  0


In [7]:
print(detect_implicit_duplicates(df_students, include=None, exclude=['math score', 'reading score', 'writing score']))


Column: 'gender'


Searching implicit values for: 'gender': 100%|██████████| 2/2 [00:00<00:00, 1692.62it/s]


  'male' → ['female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female', 'female'

Searching implicit values for: 'lunch': 100%|██████████| 1/1 [00:00<00:00, 4510.00it/s]



Column: 'test_preparation_course'


Searching implicit values for: 'test_preparation_course': 100%|██████████| 2/2 [00:00<00:00, 2705.13it/s]

None





In [8]:
print("Number of missing values: \n", df_students.isna().sum())

Number of missing values: 
 gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64


#### Exploratory Data Analysis

In [9]:
print(df_students.describe())

       math_score  reading_score  writing_score
count  1000.00000    1000.000000    1000.000000
mean     66.08900      69.169000      68.054000
std      15.16308      14.600192      15.195657
min       0.00000      17.000000      10.000000
25%      57.00000      59.000000      57.750000
50%      66.00000      70.000000      69.000000
75%      77.00000      79.000000      79.000000
max     100.00000     100.000000     100.000000


What does the standard deviation (std) measure?

The std measures how much the data vary from the mean.

- If the std is small, the data are highly concentrated near the mean.
- If the std is large, the data are highly dispersed and more disordered.

### Scores

In [19]:
def score_classification(score):
    if score <= 10:
        return '0-9'
    elif 9 < score <= 19:
        return '10-19'
    elif 19 < score <= 29:
        return '20-29'
    elif 29 < score <= 39:
        return '30-39'
    elif 39 < score <= 49:
        return '40-49'
    elif 49 < score <= 59:
        return '50-59'
    elif 59 < score <= 69:
        return '60-69'
    elif 69 < score <= 79:
        return '70-79'
    elif 79 < score <= 89:
        return '80-89'
    elif 89 < score <= 99:
        return '90-99'
    else:
        return '100'

print("*** Overall scores ***")
print()
df_students['overall_scores'] = (df_students['math_score'] + df_students['reading_score'] + df_students['writing_score'])/3
print(df_students['overall_scores'].describe())
print()
print("Overall students with zero score: ", df_students.loc[df_students['overall_scores'] == 0].shape[0])
print("Overall students with 100 score: ", df_students.loc[df_students['overall_scores'] == 100].shape[0])
print()
df_students['score_classification'] = df_students['overall_scores'].apply(score_classification)
print("Overall score classification: \n", df_students.groupby(['score_classification'])['overall_scores'].count())
print()
print("Overall lunch: \n", df_students.groupby(['lunch'])['overall_scores'].count())
print()
print("Overall prep: \n", df_students.groupby(['test_preparation_course'])['overall_scores'].count())
print()
print(df_students.groupby(['race_ethnicity', 'parental_level_of_education'])['overall_scores'].count())
print()
print("Gender Students by ethnicity ", df_students.groupby('race_ethnicity')['gender'].value_counts())
print()
print("Overall students with 60-69 score and standard lunch: ", df_students.loc[(df_students['score_classification'] == '60-69') & (df_students['lunch'] == 'standard')].shape[0])
print("Overall students with 60-69 score and free/reduced lunch: ", df_students.loc[(df_students['score_classification'] == '60-69') & (df_students['lunch'] == 'free/reduced')].shape[0])
print()
print("Overall students with 70-79 score and standard lunch: ", df_students.loc[(df_students['score_classification'] == '70-79') & (df_students['lunch'] == 'standard')].shape[0])
print("Overall students with 70-79 score and free/reduced lunch: ", df_students.loc[(df_students['score_classification'] == '70-79') & (df_students['lunch'] == 'free/reduced')].shape[0])
print()
print("Overall students with 80-89 score and standard lunch: ", df_students.loc[(df_students['score_classification'] == '80-89') & (df_students['lunch'] == 'standard')].shape[0])
print("Overall students with 80-89 score and free/reduced lunch: ", df_students.loc[(df_students['score_classification'] == '80-89') & (df_students['lunch'] == 'free/reduced')].shape[0])
print()
print("Overall students with 90-99 score and standard lunch: ", df_students.loc[(df_students['score_classification'] == '90-99') & (df_students['lunch'] == 'standard')].shape[0])
print("Overall students with 90-99 score and free/reduced lunch: ", df_students.loc[(df_students['score_classification'] == '90-99') & (df_students['lunch'] == 'free/reduced')].shape[0])
print()
print("Overall students with 60-69 score and none prep: ", df_students.loc[(df_students['score_classification'] == '60-69') & (df_students['test_preparation_course'] == 'none')].shape[0])
print("Overall students with 60-69 score and completed prep: ", df_students.loc[(df_students['score_classification'] == '60-69') & (df_students['test_preparation_course'] == 'completed')].shape[0])
print()
print("Overall students with 70-79 score and none prep: ", df_students.loc[(df_students['score_classification'] == '70-79') & (df_students['test_preparation_course'] == 'none')].shape[0])
print("Overall students with 70-79 score and completed prep: ", df_students.loc[(df_students['score_classification'] == '70-79') & (df_students['test_preparation_course'] == 'completed')].shape[0])
print()
print("Overall students with 80-89 score and none prep: ", df_students.loc[(df_students['score_classification'] == '80-89') & (df_students['test_preparation_course'] == 'none')].shape[0])
print("Overall students with 80-89 score and completed prep: ", df_students.loc[(df_students['score_classification'] == '80-89') & (df_students['test_preparation_course'] == 'completed')].shape[0])
print()
print("Overall students with 90-99 score and none prep: ", df_students.loc[(df_students['score_classification'] == '90-99') & (df_students['test_preparation_course'] == 'none')].shape[0])
print("Overall students with 90-99 score and completed prep: ", df_students.loc[(df_students['score_classification'] == '90-99') & (df_students['test_preparation_course'] == 'completed')].shape[0])
print()
print("Overall students with 100 score and none prep: ", df_students.loc[(df_students['score_classification'] == '100') & (df_students['test_preparation_course'] == 'none')].shape[0])
print("Overall students with 100 score and completed prep: ", df_students.loc[(df_students['score_classification'] == '100') & (df_students['test_preparation_course'] == 'completed')].shape[0])


*** Overall scores ***

count    1000.000000
mean       67.770667
std        14.257326
min         9.000000
25%        58.333333
50%        68.333333
75%        77.666667
max       100.000000
Name: overall_scores, dtype: float64

Overall students with zero score:  0
Overall students with 100 score:  3

Overall score classification: 
 score_classification
0-9        1
10-19      1
100        4
20-29      4
30-39     20
40-49     70
50-59    178
60-69    252
70-79    260
80-89    156
90-99     54
Name: overall_scores, dtype: int64

Overall lunch: 
 lunch
free_reduced    355
standard        645
Name: overall_scores, dtype: int64

Overall prep: 
 test_preparation_course
completed    358
none         642
Name: overall_scores, dtype: int64

race_ethnicity  parental_level_of_education
group_a         associate's_degree             14
                bachelor's_degree              12
                high_school                    18
                master's_degree                 3
           

In [None]:
math_mean = df_students['math_score'].mean()
math_std = df_students['math_score'].std()

reading_mean = df_students['reading_score'].mean()
reading_std = df_students['reading_score'].std()

writing_mean = df_students['writing_score'].mean()
writing_std = df_students['writing_score'].std()


math_lower = math_mean - math_std # std lower than mean
math_upper = math_mean + math_std # std upper than mean

reading_lower = reading_mean - reading_std # std lower than mean
reading_upper = reading_mean + reading_std # std upper than mean

writing_lower = writing_mean - writing_std # std lower than mean
writing_upper = writing_mean + writing_std # std upper than mean


df_math_typical = df_students[(df_students['math_score'] >= math_lower) & (df_students['math_score'] <= math_upper)]
df_math_high = df_students[df_students['math_score'] > math_upper]
df_math_low = df_students[df_students['math_score'] < math_lower]

df_reading_typical = df_students[(df_students['reading_score'] >= reading_lower) & (df_students['reading_score'] <= reading_upper)]
df_reading_high = df_students[df_students['reading_score'] > reading_upper]
df_reading_low = df_students[df_students['reading_score'] < reading_lower]

df_writing_typical = df_students[(df_students['writing_score'] >= writing_lower) & (df_students['writing_score'] <= writing_upper)]
df_writing_high = df_students[df_students['writing_score'] > writing_upper]
df_writing_low = df_students[df_students['writing_score'] < writing_lower]

print("*** MATH ***: \n", df_math_low['math_score'].describe())
print()
print(df_math_typical['math_score'].describe())
print()
print(df_math_high['math_score'].describe())
print()
print()
print("*** READING ***: \n", df_reading_low['reading_score'].describe())
print()
print(df_reading_typical['reading_score'].describe())
print()
print(df_reading_high['reading_score'].describe())
print()
print()
print("*** WRITING ***", df_writing_low['writing_score'].describe())
print()
print(df_writing_typical['writing_score'].describe())
print()
print(df_writing_high['writing_score'].describe())
print()
print()


In [None]:
def low_score_classification(score):
    if score <= 10:
        return '0-9'
    elif 9 < score <= 19:
        return '10-19'
    elif 19 < score <= 29:
        return '20-29'
    elif 29 < score <= 39:
        return '30-39'
    elif 39 < score <= 49:
        return '40-49'
    else:
        return '50 - 54'
    
math_low_students = df_math_low.shape[0]
math_low_females = df_math_low.loc[df_math_low['gender'] == 'female', :].shape[0]
math_low_males = df_math_low.loc[df_math_low['gender'] == 'male', :].shape[0]

print(f"*** Low MATH *** students: {math_low_students}, females: {math_low_females}, males: {math_low_males}")
print()
print(f"MATH Students with zero MATH score : {(df_math_low['math_score'] == 0).sum()}")
print()
df_math_low['score_classification'] = df_math_low['math_score'].apply(low_score_classification)
print("Low MATH score classification: \n", df_math_low.groupby(['score_classification'])['math_score'].count())
print()
print("Low MATH lunch: \n", df_math_low.groupby(['lunch'])['math_score'].count())
print()
print("Low MATH prep: \n", df_math_low.groupby(['test_preparation_course'])['math_score'].count())
print()
print(df_math_low.groupby(['race_ethnicity', 'parental_level_of_education'])['math_score'].count())
print()
print()
print()


reading_low_students = df_reading_low.shape[0]
reading_low_females = df_reading_low.loc[df_reading_low['gender'] == 'female', :].shape[0]
reading_low_males = df_reading_low.loc[df_reading_low['gender'] == 'male', :].shape[0]

print(f"*** Low READING *** students: {reading_low_students}, females: {reading_low_females}, males: {reading_low_males}")
print()
print(f"READING  Students with zero reading score : {(df_reading_low['reading_score'] == 0).sum()}")
print()
df_reading_low['score_classification'] = df_reading_low['reading_score'].apply(low_score_classification)
print("Low READING  score classification: \n", df_reading_low.groupby(['score_classification'])['reading_score'].count())
print()
print("Low READING  lunch: \n", df_reading_low.groupby(['lunch'])['reading_score'].count())
print()
print("Low READING  prep: \n", df_reading_low.groupby(['test_preparation_course'])['reading_score'].count())
print()
print(df_reading_low.groupby(['race_ethnicity', 'parental_level_of_education'])['reading_score'].count())
print()
print()
print()


writing_low_students = df_writing_low.shape[0]
writing_low_females = df_writing_low.loc[df_writing_low['gender'] == 'female', :].shape[0]
writing_low_males = df_writing_low.loc[df_writing_low['gender'] == 'male', :].shape[0]

print(f"*** Low WRITING *** students: {writing_low_students}, females: {writing_low_females}, males: {writing_low_males}")
print()
print(f"WRITING  Students with zero writing score : {(df_writing_low['writing_score'] == 0).sum()}")
print()
df_writing_low['score_classification'] = df_writing_low['writing_score'].apply(low_score_classification)
print("Low WRITING  score classification: \n", df_writing_low.groupby(['score_classification'])['writing_score'].count())
print()
print("Low WRITING  lunch: \n", df_writing_low.groupby(['lunch'])['writing_score'].count())
print()
print("Low WRITING  prep: \n", df_writing_low.groupby(['test_preparation_course'])['writing_score'].count())
print()
print(df_writing_low.groupby(['race_ethnicity', 'parental_level_of_education'])['writing_score'].count())
print()
print()
print()

In [None]:
def typical_score_classification(score):
    if score <= 59:
        return '51-59'
    elif 59 < score <= 69:
        return '60-69'
    elif 69 < score <= 79:
        return '70-79'
    else:
        return '80-83'

math_typical_students = df_math_typical.shape[0]
math_typical_females = df_math_typical.loc[df_math_typical['gender'] == 'female', :].shape[0]
math_typical_males = df_math_typical.loc[df_math_typical['gender'] == 'male', :].shape[0]

print(f"*** Typical MATH *** students: {math_typical_students}, females: {math_typical_females}, males: {math_typical_males}")
print()
print(f"Students with PASS score : {(df_math_typical['math_score'] > 59).sum()}")
print(f"Students with FAILED score : {(df_math_typical['math_score'] < 60).sum()}")
print()
df_math_typical['score_classification'] = df_math_typical['math_score'].apply(typical_score_classification)
print("Typical MATH score classification: \n", df_math_typical.groupby(['score_classification'])['math_score'].count())
print()
print("Typical MATH lunch: \n", df_math_typical.groupby(['lunch'])['math_score'].count())
print()
print("Typical MATH lunch failed: \n", df_math_typical.loc[df_math_typical['score_classification'] == '51-59'].groupby(['lunch'])['math_score'].count())
print()
print("Typical MATH lunch passed: \n", df_math_typical.loc[df_math_typical['score_classification'] != '51-59'].groupby(['lunch'])['math_score'].count())
print()
print("Typical MATH course preparation: \n", df_math_typical.groupby(['test_preparation_course'])['math_score'].count())
print()
print(df_math_typical.groupby(['race_ethnicity', 'parental_level_of_education'])['math_score'].count())
print()
print()
print()


reading_typical_students = df_reading_typical.shape[0]
reading_typical_females = df_reading_typical.loc[df_reading_typical['gender'] == 'female', :].shape[0]
reading_typical_males = df_reading_typical.loc[df_reading_typical['gender'] == 'male', :].shape[0]

print(f"*** Typical READING *** students: {reading_typical_students}, females: {reading_typical_females}, males: {reading_typical_males}")
print()
print(f"Students with PASS score : {(df_reading_typical['reading_score'] > 59).sum()}")
print(f"Students with FAILED score : {(df_reading_typical['reading_score'] < 60).sum()}")
print()
df_reading_typical['score_classification'] = df_reading_typical['reading_score'].apply(typical_score_classification)
print("Typical READING score classification: \n", df_reading_typical.groupby(['score_classification'])['reading_score'].count())
print()
print("Typical READING lunch: \n", df_reading_typical.groupby(['lunch'])['reading_score'].count())
print()
print("Typical READING lunch failed: \n", df_reading_typical.loc[df_reading_typical['score_classification'] == '51-59'].groupby(['lunch'])['reading_score'].count())
print()
print("Typical READING lunch passed: \n", df_reading_typical.loc[df_reading_typical['score_classification'] != '51-59'].groupby(['lunch'])['reading_score'].count())
print()
print("Typical READING course preparation: \n", df_reading_typical.groupby(['test_preparation_course'])['reading_score'].count())
print()
print(df_reading_typical.groupby(['race_ethnicity', 'parental_level_of_education'])['reading_score'].count())
print()
print()
print()


writing_typical_students = df_writing_typical.shape[0]
writing_typical_females = df_writing_typical.loc[df_writing_typical['gender'] == 'female', :].shape[0]
writing_typical_males = df_writing_typical.loc[df_writing_typical['gender'] == 'male', :].shape[0]

print(f"*** Typical WRITING *** students: {writing_typical_students}, females: {writing_typical_females}, males: {writing_typical_males}")
print()
print(f"Students with PASS score : {(df_writing_typical['writing_score'] > 59).sum()}")
print(f"Students with FAILED score : {(df_writing_typical['writing_score'] < 60).sum()}")
print()
df_writing_typical['score_classification'] = df_writing_typical['writing_score'].apply(typical_score_classification)
print("Typical WRITING  score classification: \n", df_writing_typical.groupby(['score_classification'])['writing_score'].count())
print()
print("Typical WRITING  lunch: \n", df_writing_typical.groupby(['lunch'])['writing_score'].count())
print()
print("Typical WRITING  lunch failed: \n", df_writing_typical.loc[df_writing_typical['score_classification'] == '51-59'].groupby(['lunch'])['writing_score'].count())
print()
print("Typical WRITING  lunch passed: \n", df_writing_typical.loc[df_writing_typical['score_classification'] != '51-59'].groupby(['lunch'])['writing_score'].count())
print()
print("Typical WRITING  course preparation: \n", df_writing_typical.groupby(['test_preparation_course'])['writing_score'].count())
print()
print(df_writing_typical.groupby(['race_ethnicity', 'parental_level_of_education'])['writing_score'].count())
print()
print()
print()

In [None]:
def high_score_classification(score):
    if score <= 89:
        return '82-89'
    elif 89 < score <= 99:
        return '90-99'
    else:
        return '100'

math_high_students = df_math_high.shape[0]
math_high_females = df_math_high.loc[df_math_high['gender'] == 'female', :].shape[0]
math_high_males = df_math_high.loc[df_math_high['gender'] == 'male', :].shape[0]

print(f"*** High MATH *** students: {math_high_students}, females: {math_high_females}, High MATH males: {math_high_males}")
print()
print(f"Students with 100 MATH score : {(df_math_high['math_score'] == 100).sum()}")
print()
score100_female = df_math_high.loc[(df_math_high['math_score'] == 100) & (df_math_high['gender'] == 'female')].shape[0]
print(f"Number of females with a score of 100: {score100_female}")
score100_male = df_math_high.loc[(df_math_high['math_score'] == 100) & (df_math_high['gender'] == 'male')].shape[0]
print(f"Number of males with a score of 100: {score100_male}")
print()
df_math_high['score_classification'] = df_math_high['math_score'].apply(high_score_classification)
print("High MATH score classification: \n", df_math_high.groupby(['score_classification'])['math_score'].count())
print()
print("High MATH lunch: \n", df_math_high.groupby(['lunch'])['math_score'].count())
print()
print("High MATH prep: \n", df_math_high.groupby(['test_preparation_course'])['math_score'].count())
print()
print(df_math_high.groupby(['race_ethnicity', 'parental_level_of_education'])['math_score'].count())
print()
print()
print()


reading_high_students = df_reading_high.shape[0]
reading_high_females = df_reading_high.loc[df_reading_high['gender'] == 'female', :].shape[0]
reading_high_males = df_reading_high.loc[df_reading_high['gender'] == 'male', :].shape[0]

print(f"*** High READING *** students: {reading_high_students}, females: {reading_high_females}, High reading males: {reading_high_males}")
print()
print(f"Students with 100 reading score : {(df_reading_high['reading_score'] == 100).sum()}")
print()
score100_female = df_reading_high.loc[(df_reading_high['reading_score'] == 100) & (df_reading_high['gender'] == 'female')].shape[0]
print(f"Number of females with a score of 100: {score100_female}")
score100_male = df_reading_high.loc[(df_reading_high['reading_score'] == 100) & (df_reading_high['gender'] == 'male')].shape[0]
print(f"Number of males with a score of 100: {score100_male}")
print()
df_reading_high['score_classification'] = df_reading_high['reading_score'].apply(high_score_classification)
print("High READING score classification: \n", df_reading_high.groupby(['score_classification'])['reading_score'].count())
print()
print("High READING lunch: \n", df_reading_high.groupby(['lunch'])['reading_score'].count())
print()
print("High READING prep: \n", df_reading_high.groupby(['test_preparation_course'])['reading_score'].count())
print()
print(df_reading_high.groupby(['race_ethnicity', 'parental_level_of_education'])['reading_score'].count())
print()
print()
print()


writing_high_students = df_writing_high.shape[0]
writing_high_females = df_writing_high.loc[df_writing_high['gender'] == 'female', :].shape[0]
writing_high_males = df_writing_high.loc[df_writing_high['gender'] == 'male', :].shape[0]

print(f"*** High WRITING *** students: {writing_high_students}, females: {writing_high_females}, High writing males: {writing_high_males}")
print()
print(f"Students with 100 writing score : {(df_writing_high['writing_score'] == 100).sum()}")
print()
score100_female = df_writing_high.loc[(df_writing_high['writing_score'] == 100) & (df_writing_high['gender'] == 'female')].shape[0]
print(f"Number of females with a score of 100: {score100_female}")
score100_male = df_writing_high.loc[(df_writing_high['writing_score'] == 100) & (df_writing_high['gender'] == 'male')].shape[0]
print(f"Number of males with a score of 100: {score100_male}")
print()
df_writing_high['score_classification'] = df_writing_high['writing_score'].apply(high_score_classification)
print("High WRITING score classification: \n", df_writing_high.groupby(['score_classification'])['writing_score'].count())
print()
print("High WRITING lunch: \n", df_writing_high.groupby(['lunch'])['writing_score'].count())
print()
print("High WRITING prep: \n", df_writing_high.groupby(['test_preparation_course'])['writing_score'].count())
print()
print(df_writing_high.groupby(['race_ethnicity', 'parental_level_of_education'])['writing_score'].count())
print()
print()
print()

In [None]:
plt.figure(figsize=(15, 7))
plt.boxplot([df_students['math_score'], df_students['reading_score'], df_students['writing_score']], 
			vert=True, patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.title('Distribution Scores')
plt.xticks([1, 2, 3], ['Math Scores', 'Reading Scores', 'Writing Scores'])
plt.xlabel('Score Categories')
plt.ylabel('Scores')
plt.show()

In [None]:
plt.figure(figsize=(15, 7))
plt.boxplot([df_math_low['math_score'], df_math_typical['math_score'], df_math_high['math_score']], vert=True, patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.title('Distribution Scores')
plt.xticks([1, 2, 3], ['Low Math Scores', 'Typical Math Scores', 'High Math Scores'])
plt.xlabel('Math Scores by distribution')
plt.ylabel('Scores')
plt.show()

In [None]:
plt.figure(figsize=(15, 7))
plt.boxplot([df_reading_low['reading_score'], df_reading_typical['reading_score'], df_reading_high['reading_score']], vert=True, patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.title('Distribution Scores')
plt.xticks([1, 2, 3], ['Low Reading Scores', 'Typical Reading Scores', 'High Reading Scores'])
plt.xlabel('Reading Scores by distribution')
plt.ylabel('Scores')
plt.show()

In [None]:
plt.figure(figsize=(15, 7))
plt.boxplot([df_writing_low['writing_score'], df_writing_typical['writing_score'], df_writing_high['writing_score']], vert=True, patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.title('Distribution Scores')
plt.xticks([1, 2, 3], ['Low Writing Scores', 'Typical Writing Scores', 'High Writing Scores'])
plt.xlabel('Scores by distribution')
plt.ylabel('Scores')
plt.show()

In [None]:
df_pvt_low_gndr_prep = df_low.pivot_table(index='gender', 
                                          columns='test_preparation_course',
                                          values= 'math_score',
                                          aggfunc='count')
print(df_pvt_low_gndr_prep)

df_pvt_low_gndr_prep.plot(kind='bar', stacked=True, figsize=(8, 6), rot=0)
plt.title('Count of Math Scores by Gender and Test Preparation Course')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.legend(title='Test Preparation Course')
plt.show()

In [None]:
df_pvt_typical_gndr_prep = df_typical.pivot_table(index='gender', 
                                          columns='test_preparation_course',
                                          values= 'math_score',
                                          aggfunc='count')
print(df_pvt_typical_gndr_prep)

df_pvt_typical_gndr_prep.plot(kind='bar', stacked=True, figsize=(8, 6), rot=0)
plt.title('Count of Math Scores by Gender and Test Preparation Course')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.legend(title='Test Preparation Course')
plt.show()

In [None]:
df_pvt_high_gndr_prep = df_high.pivot_table(index='gender', 
                                          columns='test_preparation_course',
                                          values= 'math_score',
                                          aggfunc='count')
print(df_pvt_high_gndr_prep)

df_pvt_high_gndr_prep.plot(kind='bar', stacked=True, figsize=(8, 6), rot=0)
plt.title('Count of Math Scores by Gender and Test Preparation Course')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.legend(title='Test Preparation Course')
plt.show()

In [None]:
df_pvt_low_scores = df_low.pivot_table(index='score_classification', 
                                          columns='gender',
                                          values= 'math_score',
                                          aggfunc='count').fillna(0.0)

print(df_pvt_low_scores)

df_pvt_low_scores.plot(kind='bar', figsize=(8, 6))
plt.title('Count of Math Scores by Score Classification and Gender')
plt.xlabel('Score Range')
plt.ylabel('Count')
plt.legend(title='Gender')
plt.show()

In [None]:
df_pvt_typical_scores = df_typical.pivot_table(index='score_classification', 
                                          columns='gender',
                                          values= 'math_score',
                                          aggfunc='count').fillna(0.0)

print(df_pvt_typical_scores)

df_pvt_typical_scores.plot(kind='bar', figsize=(8, 6))
plt.title('Count of Math Scores by Score Classification and Gender')
plt.xlabel('Score Range')
plt.ylabel('Count')
plt.legend(title='Gender')
plt.show()

In [None]:
df_pvt_high_scores = df_high.pivot_table(index='score_classification', 
                                          columns='gender',
                                          values= 'math_score',
                                          aggfunc='count').fillna(0.0)

print(df_pvt_high_scores)

df_pvt_high_scores.plot(kind='bar', figsize=(8, 6), rot=0)
plt.title('Count of Math Scores by Score Classification and Gender')
plt.xlabel('Score Range')
plt.ylabel('Count')
plt.legend(title='Gender')
plt.show()