# Simpson's Paradox
Use `admission_data.csv` for this exercise.

In [1]:
# Read the admissions dataset into a DataFrame (it is previewed in the next cell)
import pandas as pd

admissions_csv_path = 'datasets/admission_data.csv'
df = pd.read_csv(admissions_csv_path)

In [2]:
# Preview the first 20 rows of the dataset
df.head(20)

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True
5,50693,female,Chemistry,False
6,25946,male,Physics,True
7,27648,female,Chemistry,True
8,55247,male,Physics,False
9,35838,male,Physics,True


In [3]:
# Summarize the columns: names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   student_id  500 non-null    int64 
 1   gender      500 non-null    object
 2   major       500 non-null    object
 3   admitted    500 non-null    bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 12.3+ KB


### Proportion and admission rate for each gender

In [4]:
# Share of all students who are female, as a fraction in [0, 1].
# total_students_count is reused by the male-proportion cell further down.
total_students_count = len(df.index)
female_students_count = len(df.loc[df['gender'].eq('female')])
proportion_female_students = female_students_count / total_students_count

In [5]:
# Fraction (0-1) of all students who are female
proportion_female_students

0.514

In [6]:
# Share of all students who are male, as a fraction in [0, 1].
# Depends on total_students_count computed in the female-proportion cell.
male_students_count = len(df.loc[df['gender'].eq('male')])
proportion_male_students = male_students_count / total_students_count

In [7]:
# Fraction (0-1) of all students who are male
proportion_male_students

0.486

In [8]:
# Admission rate for females, expressed as a percentage (0-100).
# Note: the per-major rates elsewhere in this notebook are 0-1 fractions.
female_students = df.loc[df['gender'].eq('female')]
female_admitted = female_students.loc[female_students['admitted'].eq(True)]
admission_rate_for_females = (len(female_admitted) / len(female_students)) * 100

In [9]:
# Percentage (0-100): this rate was multiplied by 100 when it was computed
admission_rate_for_females

28.793774319066145

In [10]:
# Admission rate for males, expressed as a percentage (0-100).
# Note: the per-major rates elsewhere in this notebook are 0-1 fractions.
male_students = df.loc[df['gender'].eq('male')]
male_admitted = male_students.loc[male_students['admitted'].eq(True)]
admission_rate_for_males = (len(male_admitted) / len(male_students)) * 100

In [11]:
# Percentage (0-100): this rate was multiplied by 100 when it was computed
admission_rate_for_males

48.559670781893004

### Proportion and admission rate for physics majors of each gender

In [12]:
# Fraction (0-1) of female students whose major is physics.
female_students = df.loc[df['gender'].eq('female')]
proportion_female_physics_major = (
    len(female_students.loc[female_students['major'].eq('Physics')])
    / len(female_students)
)

In [13]:
# Fraction (0-1) of female students who are physics majors
proportion_female_physics_major

0.12062256809338522

In [14]:
# Fraction (0-1) of male students whose major is physics.
male_students = df.loc[df['gender'].eq('male')]
proportion_male_physics_major = (
    len(male_students.loc[male_students['major'].eq('Physics')])
    / len(male_students)
)

In [15]:
# Fraction (0-1) of male students who are physics majors
proportion_male_physics_major

0.9259259259259259

In [16]:
# Admission rate among female physics majors, as a fraction in [0, 1].
female_physics = df.loc[df['gender'].eq('female') & df['major'].eq('Physics')]
admission_rate_female_physics_major = (
    len(female_physics.loc[female_physics['admitted'].eq(True)])
    / len(female_physics)
)

In [17]:
# Fraction (0-1) of female physics majors who were admitted
admission_rate_female_physics_major

0.7419354838709677

In [18]:
# Admission rate among male physics majors, as a fraction in [0, 1].
male_physics = df.loc[df['gender'].eq('male') & df['major'].eq('Physics')]
admission_rate_male_physics_major = (
    len(male_physics.loc[male_physics['admitted'].eq(True)])
    / len(male_physics)
)

In [19]:
# Fraction (0-1) of male physics majors who were admitted
admission_rate_male_physics_major

0.5155555555555555

### Proportion and admission rate for chemistry majors of each gender

In [20]:
# Fraction (0-1) of female students whose major is chemistry.
female_students = df.loc[df['gender'].eq('female')]
proportion_female_chemistry_major = (
    len(female_students.loc[female_students['major'].eq('Chemistry')])
    / len(female_students)
)

In [21]:
# Fraction (0-1) of female students who are chemistry majors
proportion_female_chemistry_major

0.8793774319066148

In [22]:
# Fraction (0-1) of male students whose major is chemistry.
male_students = df.loc[df['gender'].eq('male')]
proportion_male_chemistry_major = (
    len(male_students.loc[male_students['major'].eq('Chemistry')])
    / len(male_students)
)

In [23]:
# Fraction (0-1) of male students who are chemistry majors
proportion_male_chemistry_major

0.07407407407407407

In [24]:
# Admission rate among female chemistry majors, as a fraction in [0, 1].
female_chemistry = df.loc[df['gender'].eq('female') & df['major'].eq('Chemistry')]
admission_rate_female_chemistry_major = (
    len(female_chemistry.loc[female_chemistry['admitted'].eq(True)])
    / len(female_chemistry)
)

In [25]:
# Fraction (0-1) of female chemistry majors who were admitted
admission_rate_female_chemistry_major

0.22566371681415928

In [26]:
# Admission rate among male chemistry majors, as a fraction in [0, 1].
male_chemistry = df.loc[df['gender'].eq('male') & df['major'].eq('Chemistry')]
admission_rate_male_chemistry_major = (
    len(male_chemistry.loc[male_chemistry['admitted'].eq(True)])
    / len(male_chemistry)
)

In [27]:
# Fraction (0-1) of male chemistry majors who were admitted
admission_rate_male_chemistry_major

0.1111111111111111

### Admission rate for each major

In [28]:
# Admission rate among all physics majors (both genders), as a fraction in [0, 1].
physics_students = df.loc[df['major'].eq('Physics')]
admission_rate_physics = (
    len(physics_students.loc[physics_students['admitted'].eq(True)])
    / len(physics_students)
)

In [29]:
# Fraction (0-1) of physics majors who were admitted
admission_rate_physics

0.54296875

In [30]:
# Admission rate among all chemistry majors (both genders), as a fraction in [0, 1].
chemistry_students = df.loc[df['major'].eq('Chemistry')]
admission_rate_chemistry = (
    len(chemistry_students.loc[chemistry_students['admitted'].eq(True)])
    / len(chemistry_students)
)

In [31]:
# Fraction (0-1) of chemistry majors who were admitted
admission_rate_chemistry

0.21721311475409835

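The admission rates for females and males are approximately 29% and 49% respectively. Thus, looking only at gender and admission rate, it appears that males are being favored in admissions. But if we look at the admission rates for female physics majors and male physics majors, which are approximately 74% and 52% respectively, we find that among the students applying as physics majors, female students appear to have been favored in the admissions process. Likewise, if we look at the admission rates for female chemistry majors and male chemistry majors, which are approximately 23% and 11% respectively, we find that among the students applying as chemistry majors, female students again appear to have been favored. The reversal arises because the two majors differ sharply in both selectivity and gender mix: physics admits about 54% of its students while chemistry admits only about 22%, and roughly 93% of male students are physics majors whereas roughly 88% of female students are chemistry majors. Most female students are therefore in the major with the much lower admission rate, which pulls their overall rate below that of males even though they are admitted at a higher rate within each major. This is a classic case of Simpson's paradox, where the conclusion can be drastically different depending on how we choose to look at the data (or what questions we ask).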