# Simpson's Paradox
Use `admission_data.csv` for this exercise.

No ejecutar ninguna celda

In [4]:
import pandas as pd

In [5]:
# Load and view first few lines of dataset
df = pd.read_csv('admission_data.csv')
df.head()

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
student_id    500 non-null int64
gender        500 non-null object
major         500 non-null object
admitted      500 non-null bool
dtypes: bool(1), int64(1), object(2)
memory usage: 12.3+ KB


### Proportion and admission rate for each gender

In [9]:
# get number of males and females in list
gender_count = df['gender'].value_counts()
 
#get total number of values in gender column in variable
gender_total = df['gender'].count()

In [10]:
gender_count

female    257
male      243
Name: gender, dtype: int64

In [11]:
# Proportion of students that are female
print("Proportion of females is {}%".format(100*gender_count[0]/gender_total))
print("Proportion of females is {}%".format(gender_count[0]/gender_total))

Proportion of females is 51.4%
Proportion of females is 0.514%


In [12]:
# Proportion of students that are male
print("Proportion of males is {}%".format(100*gender_count[1]/gender_total))
print("Proportion of males is {}%".format(gender_count[1]/gender_total))

Proportion of males is 48.6%
Proportion of males is 0.486%


In [13]:
#To view statistics based on gender, we group by gender variable
# Next we use value_counts() function to see the numbers
df.groupby('gender')['admitted'].value_counts()

gender  admitted
female  False       183
        True         74
male    False       125
        True        118
Name: admitted, dtype: int64

In [15]:
# count of females
# https://chrisalbon.com/python/pandas_selecting_rows_on_conditions.html
female_df = df['gender'] == "female"
female_total = sum(female_df)
print("Number of female students are {}".format(female_total))

Number of female students are 257


In [16]:
#count of males
male_df = df['gender'] == "male"
male_total = sum(male_df)
print("Number of male students are {}".format(male_total))

Number of male students are 243


In [17]:
# Admission rate for females
female_admin_rate = 100* 74/female_total
print("Female admission rate is {}%".format(round(female_admin_rate),2))

Female admission rate is 29.0%


In [18]:
# Admission rate for males
male_admin_rate = 100* 118/male_total
print("Male admission rate is {}%".format(round(male_admin_rate,2)))

Male admission rate is 48.56%


In [19]:
print('Total admission proportions')
print("Female: {}".format(74/257))
print("Male: {}".format(118/243))

Total admission proportions
Female: 0.28793774319066145
Male: 0.48559670781893005


### Proportion and admission rate for physics majors of each gender

In [20]:
# group by major and gender to get counts of students in each cohort
major_gender_counts = df.groupby(['major', 'gender']).describe()
major_gender_counts.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, (Chemistry, female) to (Physics, male)
Data columns (total 8 columns):
(student_id, count)    4 non-null float64
(student_id, mean)     4 non-null float64
(student_id, std)      4 non-null float64
(student_id, min)      4 non-null float64
(student_id, 25%)      4 non-null float64
(student_id, 50%)      4 non-null float64
(student_id, 75%)      4 non-null float64
(student_id, max)      4 non-null float64
dtypes: float64(8)
memory usage: 405.0+ bytes


In [21]:
# What proportion of female students are majoring in physics?

# First, get count of females in Physics major 
fem_phy = len(df.loc[(df['gender'] == 'female') & (df['major'] == 'Physics')])
print("Number of females with Physics major is {}".format(fem_phy))

#now evaluate proportion of female candidates in Phsics
print("Proportion of female candidates in Physics is {}%".format(round((fem_phy*100/female_total),2)))

Number of females with Physics major is 31
Proportion of female candidates in Physics is 12.06%


In [22]:
# What proportion of male students are majoring in physics?

# first,get count of males in Physics major 
m_phy = len(df.loc[(df['gender'] == 'male') & (df['major'] == 'Physics')])
print("Number of males with Physics major is {}".format(m_phy))

#now evaluate proportion of male candidates in Physics
print("Proportion of male candidates in Physics is {}%".format(round((m_phy*100/male_total),2)))

Number of males with Physics major is 225
Proportion of male candidates in Physics is 92.59%


In [23]:
# Admission rate for female physics majors
# compute count of females in Physics who got admitted
fem_phy_admit = len(df.loc[(df['gender'] == 'female') 
                           & (df['major'] == 'Physics') 
                           & (df['admitted'] == True)])

#now evaluate proportion of admitted female candidates in Phsics
print("Proportion of female candidates admitted in Physics is {}%".format(round((fem_phy_admit*100/fem_phy),2)))

Proportion of female candidates admitted in Physics is 74.19%


In [24]:
# Admission rate for male physics majors
# compute count of females in Physics who got admitted
m_phy_admit = len(df.loc[(df['gender'] == 'male') 
                           & (df['major'] == 'Physics') 
                           & (df['admitted'] == True)])

#now evaluate proportion of admitted male candidates in Phsics
print("Proportion of male candidates admitted in Physics is {}%".format(round((m_phy_admit*100/m_phy),2)))

Proportion of male candidates admitted in Physics is 51.56%


### Proportion and admission rate for chemistry majors of each gender

In [25]:
# What proportion of female students are majoring in chemistry?

# First, get count of females in Chemistry major 
fem_chem = len(df.loc[(df['gender'] == 'female') & (df['major'] == 'Chemistry')])
print("Number of females with Chemistry major is {}".format(fem_chem))

#now evaluate proportion of female candidates in Chemistry
print("Proportion of female candidates in Chemistry is {}%".format(round((fem_chem*100/female_total),2)))

Number of females with Chemistry major is 226
Proportion of female candidates in Chemistry is 87.94%


In [26]:
# What proportion of male students are majoring in chemistry?
# First, get count of females in Chemistry major 
m_chem = len(df.loc[(df['gender'] == 'male') & (df['major'] == 'Chemistry')])
print("Number of males with Chemistry major is {}".format(m_chem))

#now evaluate proportion of male candidates in Chemistry
print("Proportion of male candidates in Chemistry is {}%".format(round((m_chem*100/male_total),2)))

Number of males with Chemistry major is 18
Proportion of male candidates in Chemistry is 7.41%


In [27]:
# Admission rate for female chemistry majors

# compute count of females in chemistry who got admitted
fem_chem_admit = len(df.loc[(df['gender'] == 'female') 
                           & (df['major'] == 'Chemistry') 
                           & (df['admitted'] == True)])

#now evaluate proportion of admitted female candidates in chemistry
print("Proportion of female candidates admitted in chemistry is {}%".format(round((fem_chem_admit*100/fem_chem),2)))

Proportion of female candidates admitted in chemistry is 22.57%


In [28]:
# Admission rate for male chemistry majors
# compute count of males in chemistry who got admitted
m_chem_admit = len(df.loc[(df['gender'] == 'male') 
                           & (df['major'] == 'Chemistry') 
                           & (df['admitted'] == True)])

#now evaluate proportion of admitted male candidates in chemistry
print("Proportion of male candidates admitted in chemistry is {}%".format(round((m_chem_admit*100/m_chem),2)))

Proportion of male candidates admitted in chemistry is 11.11%


### Admission rate for each major

In [29]:
# Admission rate for physics majors
# compute count of students in Physics who got admitted
phy_admit = len(df.loc[(df['major'] == 'Physics') 
                           & (df['admitted'] == True)])

#now evaluate proportion of admitted candidates in Phsics
print("Proportion of candidates admitted in Physics is {}%".format(round((phy_admit*100/(fem_phy+m_phy)),2)))

Proportion of candidates admitted in Physics is 54.3%


In [30]:
# Admission rate for chemistry majors
# compute count of students in Chemistry who got admitted
chem_admit = len(df.loc[(df['major'] == 'Chemistry') 
                           & (df['admitted'] == True)])

#now evaluate proportion of admitted candidates in Chemistry
print("Proportion of candidates admitted in Chemistry is {}%".format(round((chem_admit*100/(fem_chem+m_chem)),2)))

Proportion of candidates admitted in Chemistry is 21.72%


#### Testing: Double check vlaues by computing non-admittance values

In [31]:
# double check values for physics admittance
if (phy_admit ==  (fem_phy_admit+ m_phy_admit)):
    print("Values computed for Physics admitted candidates is Correct")
else: 
    print("Values computed for Physics admitted candidates is Incorrect")


# double check values for Chemistry admittance
if (chem_admit ==  (fem_chem_admit+ m_chem_admit)):
    print("Values computed for Chemistry admitted candidates is Correct")
else: 
    print("Values computed for Chemistry admitted candidates is Incorrect")


Values computed for Physics admitted candidates is Correct
Values computed for Chemistry admitted candidates is Correct


In [32]:
# double check not admitted candidates
# candidates not admitted to Physics
not_admit_p = len(df.loc[(df['major'] == 'Physics') 
                           & (df['admitted'] == False)])
# candidates not admitted to Chemistry
not_admit_c = len(df.loc[(df['major'] == 'Chemistry') 
                           & (df['admitted'] == False)])
# compare if admitted and non-admitted candidate counts equal total number of students
if (gender_total ==  (not_admit_p+ not_admit_c+chem_admit+phy_admit)):
    print("Values computed for admitted candidates is Correct")
else: 
    print("Values computed for admitted candidates is Incorrect")

Values computed for admitted candidates is Correct
