In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Read the two input tables from the data/ directory in one pass.
names, majors = (
    pd.read_csv(csv_path)
    for csv_path in ("data/names.csv", "data/majors.csv")
)

Convert the names to lowercase.

In [3]:
# Normalise casing so the names can be matched against other lower-cased tables.
names = names.assign(Name=names["Name"].str.lower())

Display the 20 most frequent values of the categorical `Majors` column.

In [4]:
# Count each distinct value of "Majors", order the counts ascending, and keep
# the last 20 rows, i.e. the 20 most frequent values (most common at the bottom).
(
    majors["Majors"]
    .value_counts()
    .sort_values(ascending=True)
    .iloc[-20:]
)

Materials Science & Eng BS                                    8
Data Science BA, Economics BA                                 8
Mechanical Engineering BS                                     8
Business Administration BS, Electrical Eng & Comp Sci BS      8
Industrial Eng & Ops Rsch BS                                  8
Environ Econ & Policy BS                                      9
Chemical Biology BS                                           9
Public Health BA                                             10
Industrial Eng & Ops Rsch MEng                               10
Chemical Engineering BS                                      12
Bioengineering BS                                            14
Molecular & Cell Biology BA                                  16
Cognitive Science BA                                         24
Civil Engineering BS                                         28
Applied Mathematics BA                                       33
Data Science BA                         

Visualize the counts with a horizontal bar plot.

In [5]:
# Horizontal bar chart of the 20 most frequent "Majors" values.
fig = px.bar(
    majors["Majors"].value_counts().sort_values().tail(20),
    orientation="h",
)
# Hide the legend and give both axes readable titles; update_layout is the
# cell's last expression, so its return value is what the cell displays.
fig.update_layout(showlegend=False, xaxis_title="Count", yaxis_title="Major")

In [6]:
# Load the baby-names table and lower-case its "Name" column in one chained
# expression, then preview the first five rows.
babynames = (
    pd.read_csv("data/babynames.csv")
    .assign(Name=lambda frame: frame['Name'].str.lower())
)
babynames.head(5)

Unnamed: 0,Name,Sex,Count,Year
0,mary,F,7065,1880
1,anna,F,2604,1880
2,emma,F,2003,1880
3,elizabeth,F,1939,1880
4,minnie,F,1746,1880


Group by year, sum up the counts for each sex.

In [7]:
# Total "Count" per year (rows) and sex (columns).
# The aggregation is given as the string alias 'sum' rather than the NumPy
# callable np.sum: recent pandas releases emit a FutureWarning for the callable
# form, and the string matches the aggfunc='sum' used for the name/sex pivot
# further down in this notebook.
year_sex = pd.pivot_table(
    babynames,
    index=['Year'],    # the row index
    columns=['Sex'],   # the column values
    values='Count',    # the field to be aggregated within each group
    aggfunc='sum',
)

year_sex.head()

Sex,F,M
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1880,90994,110490
1881,91953,100738
1882,107847,113686
1883,112319,104625
1884,129019,114442


Plot the pivot table as a line chart: one line per column (sex), with the row index (year) on the x-axis.

In [8]:
# Pass the pivot table as-is (wide form): its columns F and M are plotted
# against its Year index.
px.line(data_frame=year_sex)

Compute, for each name, the proportion of its total count that is female.

In [9]:
# Per-name totals split by sex. Combinations that never occur are filled
# with 0, and margins=True adds the 'All' totals used as the denominator below.
# NOTE(review): per the pandas docs margins=True also appends an 'All' row, so
# prop_female carries an 'All' entry in addition to the real names.
name_sex = babynames.pivot_table(
    index='Name',
    columns='Sex',
    values='Count',
    aggfunc='sum',
    fill_value=0.,
    margins=True,
)

display(name_sex.head())

# Share of each name's total count that is female.
prop_female = name_sex['F'].div(name_sex['All']).rename("Prop. Female")
prop_female.head(10)

Sex,F,M,All
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaban,0,120,120
aabha,46,0,46
aabid,0,16,16
aabidah,5,0,5
aabir,0,10,10


Name
aaban        0.0
aabha        1.0
aabid        0.0
aabidah      1.0
aabir        0.0
aabriella    1.0
aada         1.0
aadam        0.0
aadan        0.0
aadarsh      0.0
Name: Prop. Female, dtype: float64

Apply a function to transform a column

In [10]:
def sex_from_name(name, proportions=None):
    """Predict a sex label for a first name from its female proportion.

    Parameters
    ----------
    name : str
        First name in any casing; it is lower-cased before the lookup.
        Non-string values (e.g. NaN for a missing name) yield "Unknown"
        instead of raising.
    proportions : pandas.Series, optional
        Female proportion indexed by lower-case name. When omitted, the
        notebook-level ``prop_female`` series is used.

    Returns
    -------
    str
        "F" if the proportion is above 0.5, "M" if it is below 0.5, and
        "Unknown" if it is exactly 0.5, the name is not in the index, or
        the name is not a string.
    """
    if proportions is None:
        # Fall back to the table built earlier in the notebook.
        proportions = prop_female
    if not isinstance(name, str):
        return "Unknown"
    # One lookup instead of a membership test followed by repeated indexing;
    # .get returns None when the name is absent from the index.
    share = proportions.get(name.lower())
    if share is None or share == 0.5:
        return "Unknown"
    return "F" if share > 0.5 else "M"
# Label every row by running sex_from_name on its "Name" value, then preview.
names['Pred. Sex'] = names['Name'].map(sex_from_name)
names.head(5)

Unnamed: 0,Name,Pred. Sex
0,andrew,M
1,justin,M
2,michael,M
3,justin,M
4,michael,M


Filter the rows of a dataset: keep only the names that have no entry in `prop_female`.

In [11]:
# Rows whose name does not appear in prop_female's index.
names.loc[~names["Name"].isin(prop_female.index)]

Unnamed: 0,Name,Pred. Sex
14,baturay,Unknown
43,subham,Unknown
62,jihee,Unknown
65,qilu,Unknown
87,yike,Unknown
...,...,...
1053,haiyue,Unknown
1062,risheek,Unknown
1066,huanran,Unknown
1068,zefu,Unknown


Merge two tables using `merge`.

In [12]:
# Left-merge the Name column against prop_female (matched on its index) so
# every row of names is kept; names with no match get the neutral value 0.5.
names["Prop. Female"] = (
    names[['Name']]
    .merge(prop_female, left_on='Name', right_index=True, how='left')
    ['Prop. Female']
    .fillna(0.5)
)
names.head(n=10)

Unnamed: 0,Name,Pred. Sex,Prop. Female
0,andrew,M,0.00376
1,justin,M,0.00484
2,michael,M,0.004945
3,justin,M,0.00484
4,michael,M,0.004945
5,russell,M,0.004751
6,donna,F,0.99732
7,jessica,F,0.996598
8,andrew,M,0.00376
9,emily,F,0.997956
