# intro to pandas

## Plan

1. read in a dataset
2. getting some immediate information about the data
3. subset our data
    1. by column
    1. by row (filtering)
    1. getting min/max value locations (`.idxmin()`, `.idxmax()`)
4. sort our data
5. make changes to our data
    1. new column names
    1. change column values
    1. create new columns
6. calculate summary stats
    1. for the whole dataset
    1. for groups
7. dealing with missing values

## read in a dataset

In [2]:
import pandas as pd

In [3]:
# read a csv file
demographic_data = pd.read_csv("data/life_expectancy_and_income.csv")

In [4]:
# get some immediate info about the data
demographic_data.columns

Index(['country', 'year', 'fertility_rate', 'income_per_person',
       'life_expectancy'],
      dtype='object')

In [6]:
demographic_data.shape

(22080, 5)

In [7]:
type(demographic_data)

pandas.core.frame.DataFrame

In [9]:
demographic_data.head(10)

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.0,1090,29.4
1,Afghanistan,1901,7.0,1110,29.5
2,Afghanistan,1902,7.0,1120,29.5
3,Afghanistan,1903,7.0,1140,29.6
4,Afghanistan,1904,7.0,1160,29.7
5,Afghanistan,1905,7.0,1180,29.7
6,Afghanistan,1906,7.0,1200,29.8
7,Afghanistan,1907,7.0,1220,29.9
8,Afghanistan,1908,7.0,1240,29.9
9,Afghanistan,1909,7.0,1260,30.0


In [10]:
#info only on numeric data
demographic_data.describe()

Unnamed: 0,year,fertility_rate,income_per_person,life_expectancy
count,22080.0,22080.0,22080.0,22080.0
mean,1959.5,4.84077,7607.700996,52.567773
std,34.640598,1.916428,13448.4086,16.773059
min,1900.0,1.12,312.0,1.1
25%,1929.75,2.98,1370.0,35.7
50%,1959.5,5.45,2880.0,53.55
75%,1989.25,6.5,7702.5,68.2
max,2019.0,8.87,179000.0,85.1


In [12]:
demographic_data.describe(include = "all")

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
count,22080,22080.0,22080.0,22080.0,22080.0
unique,184,,,,
top,Afghanistan,,,,
freq,120,,,,
mean,,1959.5,4.84077,7607.700996,52.567773
std,,34.640598,1.916428,13448.4086,16.773059
min,,1900.0,1.12,312.0,1.1
25%,,1929.75,2.98,1370.0,35.7
50%,,1959.5,5.45,2880.0,53.55
75%,,1989.25,6.5,7702.5,68.2


In [14]:
demographic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22080 entries, 0 to 22079
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   country            22080 non-null  object 
 1   year               22080 non-null  int64  
 2   fertility_rate     22080 non-null  float64
 3   income_per_person  22080 non-null  int64  
 4   life_expectancy    22080 non-null  float64
dtypes: float64(2), int64(2), object(1)
memory usage: 862.6+ KB


In [23]:
# task: read the second dataset into its own DataFrame and check its dimensions
occupation_prestige = pd.read_csv("data/prestige_occupation.csv")

occupation_prestige.shape
# result: 102 rows x 7 columns

(102, 7)

In [29]:
occupation_prestige.describe()
# mean prestige is 46.8 (see the `mean` row of the output)
# `women` has a count of 97 while the other columns have 102, i.e. 5 missing values

Unnamed: 0,education,income,women,prestige,census
count,102.0,102.0,97.0,102.0,102.0
mean,10.738039,6797.901961,30.472784,46.833333,5401.77451
std,2.728444,4245.922227,31.826063,17.204486,2644.993215
min,6.38,611.0,0.52,14.8,1113.0
25%,8.445,4106.0,4.14,35.225,3120.5
50%,10.54,5930.5,15.51,43.6,5135.0
75%,12.6475,8187.25,54.77,59.275,8312.5
max,15.97,25879.0,97.51,87.2,9517.0


## subsetting our data

In [30]:
#by attribute
demographic_data.country

0        Afghanistan
1        Afghanistan
2        Afghanistan
3        Afghanistan
4        Afghanistan
            ...     
22075       Zimbabwe
22076       Zimbabwe
22077       Zimbabwe
22078       Zimbabwe
22079       Zimbabwe
Name: country, Length: 22080, dtype: object

In [31]:
#with brackets
demographic_data["country"]

0        Afghanistan
1        Afghanistan
2        Afghanistan
3        Afghanistan
4        Afghanistan
            ...     
22075       Zimbabwe
22076       Zimbabwe
22077       Zimbabwe
22078       Zimbabwe
22079       Zimbabwe
Name: country, Length: 22080, dtype: object

In [32]:
# brackets also accept a list of column names, which selects several columns at once
demographic_data[["country", "year", "life_expectancy"]]

Unnamed: 0,country,year,life_expectancy
0,Afghanistan,1900,29.4
1,Afghanistan,1901,29.5
2,Afghanistan,1902,29.5
3,Afghanistan,1903,29.6
4,Afghanistan,1904,29.7
...,...,...,...
22075,Zimbabwe,2015,59.6
22076,Zimbabwe,2016,60.5
22077,Zimbabwe,2017,61.4
22078,Zimbabwe,2018,61.7


In [35]:
#.loc[]  (preferred) - all rows and selected columns

demographic_data.loc[:, ["country", "year", "income_per_person"]]

Unnamed: 0,country,year,income_per_person
0,Afghanistan,1900,1090
1,Afghanistan,1901,1110
2,Afghanistan,1902,1120
3,Afghanistan,1903,1140
4,Afghanistan,1904,1160
...,...,...,...
22075,Zimbabwe,2015,2510
22076,Zimbabwe,2016,2490
22077,Zimbabwe,2017,2570
22078,Zimbabwe,2018,2620


## filtering data (subsetting rows)

In [41]:
# boolean Series: True for the rows where country is "Japan", False everywhere else
mask = demographic_data.country == "Japan"

# .loc[] with a boolean mask returns only the rows where the mask is True
demographic_data.loc[mask]

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
9720,Japan,1900,4.69,1860,38.7
9721,Japan,1901,5.01,1900,38.8
9722,Japan,1902,4.97,1780,39.0
9723,Japan,1903,4.83,1880,39.1
9724,Japan,1904,4.61,1870,39.2
...,...,...,...,...,...
9835,Japan,2015,1.44,37800,84.1
9836,Japan,2016,1.46,38100,84.2
9837,Japan,2017,1.47,38900,84.2
9838,Japan,2018,1.48,39300,84.4


In [42]:
#row filter and column subset
demographic_data.loc[mask, ["country", "year", "life_expectancy"]]

Unnamed: 0,country,year,life_expectancy
9720,Japan,1900,38.7
9721,Japan,1901,38.8
9722,Japan,1902,39.0
9723,Japan,1903,39.1
9724,Japan,1904,39.2
...,...,...,...
9835,Japan,2015,84.1
9836,Japan,2016,84.2
9837,Japan,2017,84.2
9838,Japan,2018,84.4


In [44]:
#multiple masking conditions
mask = (demographic_data.country == "Japan") & (demographic_data.year > 2000)

demographic_data.loc[mask]

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
9821,Japan,2001,1.31,33900,81.7
9822,Japan,2002,1.3,33900,82.0
9823,Japan,2003,1.3,34300,82.1
9824,Japan,2004,1.3,35100,82.3
9825,Japan,2005,1.31,35700,82.3
9826,Japan,2006,1.32,36100,82.6
9827,Japan,2007,1.33,36700,82.8
9828,Japan,2008,1.34,36300,82.9
9829,Japan,2009,1.36,34300,83.1
9830,Japan,2010,1.37,35800,83.1


In [48]:
# two ways to write the same "Japan or Italy" filter:
# 1. combine explicit comparisons with | (element-wise "or")
mask_verbose = (demographic_data.country == "Japan") | (demographic_data.country == "Italy")
# 2. test membership in a list of values with .isin()
mask_convenient = demographic_data.country.isin(["Japan", "Italy"])


# only mask_convenient is used here; mask_verbose is shown for comparison
demographic_data.loc[mask_convenient]

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
9480,Italy,1900,4.53,3780,41.7
9481,Italy,1901,4.49,3840,43.5
9482,Italy,1902,4.46,3900,43.0
9483,Italy,1903,4.43,3940,43.1
9484,Italy,1904,4.44,4010,44.4
...,...,...,...,...,...
9835,Japan,2015,1.44,37800,84.1
9836,Japan,2016,1.46,38100,84.2
9837,Japan,2017,1.47,38900,84.2
9838,Japan,2018,1.48,39300,84.4


In [49]:
# putting the mask expression directly inside .loc[] (no intermediate variable)
demographic_data.loc[demographic_data.country.isin(["Japan", "Italy"])]

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
9480,Italy,1900,4.53,3780,41.7
9481,Italy,1901,4.49,3840,43.5
9482,Italy,1902,4.46,3900,43.0
9483,Italy,1903,4.43,3940,43.1
9484,Italy,1904,4.44,4010,44.4
...,...,...,...,...,...
9835,Japan,2015,1.44,37800,84.1
9836,Japan,2016,1.46,38100,84.2
9837,Japan,2017,1.47,38900,84.2
9838,Japan,2018,1.48,39300,84.4


## minimum and maximum row locations

In [50]:
# index label of the row with the lowest life expectancy
# (a row label to hand to .loc[], not the life expectancy value itself)
i = demographic_data.life_expectancy.idxmin()
i

16338

In [51]:
demographic_data.loc[i]

country              Samoa
year                  1918
fertility_rate        6.98
income_per_person     2050
life_expectancy        1.1
Name: 16338, dtype: object

In [53]:
# row with the highest life expectancy: idxmax() gives the row label, .loc[] fetches that row
demographic_data.loc[demographic_data.life_expectancy.idxmax()]

country              Singapore
year                      2019
fertility_rate            1.27
income_per_person        90100
life_expectancy           85.1
Name: 17279, dtype: object

## saving our subsets


In [55]:
my_list = [0, 1, 2, 3, 4]

# `=` does not create a new list: it binds a second name to the same object,
# so a change made through either name is visible through both
my_list_view = my_list
my_list_view.append(10)
print(my_list)

[0, 1, 2, 3, 4, 10]


In [56]:
# .copy() creates a separate list, so removing 10 from the copy leaves my_list untouched
my_list_copy = my_list.copy()
my_list_copy.remove(10)
print(my_list)
print(my_list_copy)

[0, 1, 2, 3, 4, 10]
[0, 1, 2, 3, 4]


In [57]:
# table of life expectancy data for Japan after 2000
mask = (demographic_data.country == "Japan") & (demographic_data.year > 2000)
# .copy() stores the subset as its own object, same idea as my_list.copy() above,
# so later changes to japan_data are not meant to touch demographic_data
japan_data = demographic_data.loc[mask, ["country", "year", "life_expectancy"]].copy()

In [58]:
japan_data

Unnamed: 0,country,year,life_expectancy
9821,Japan,2001,81.7
9822,Japan,2002,82.0
9823,Japan,2003,82.1
9824,Japan,2004,82.3
9825,Japan,2005,82.3
9826,Japan,2006,82.6
9827,Japan,2007,82.8
9828,Japan,2008,82.9
9829,Japan,2009,83.1
9830,Japan,2010,83.1


In [69]:
# task: save the job, type and income columns as a separate DataFrame
job_incomes = occupation_prestige[["job", "type", "income"]].copy()
job_incomes

Unnamed: 0,job,type,income
0,gov.administrators,prof,12351
1,general.managers,prof,25879
2,accountants,prof,9271
3,purchasing.officers,prof,8865
4,chemists,prof,8403
...,...,...,...
97,bus.drivers,bc,5562
98,taxi.drivers,bc,4224
99,longshoremen,bc,4753
100,typesetters,bc,6462


In [70]:
# task 2: show the rows of the highest- and the lowest-income job
# idxmax()/idxmin() return row labels; passing both in a list to .loc[] fetches both rows
(occupation_prestige
 .loc[[occupation_prestige.income.idxmax(), occupation_prestige.income.idxmin()]]
)

Unnamed: 0,job,education,income,women,prestige,census,type
1,general.managers,12.26,25879,4.02,69.1,1130,prof
62,babysitters,9.46,611,96.53,25.9,6147,


## sorting our data
`.sort_values()` method

In [72]:
#sort demographic data by year and then country
demographic_data.sort_values(["year", "country"])

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.00,1090,29.4
120,Albania,1900,4.60,1220,35.4
240,Algeria,1900,6.99,1750,30.2
360,Angola,1900,7.00,958,29.0
480,Antigua and Barbuda,1900,4.63,1300,33.8
...,...,...,...,...,...
21599,Venezuela,2019,2.25,9720,75.1
21719,Vietnam,2019,1.94,6970,74.7
21839,Yemen,2019,3.69,2340,68.1
21959,Zambia,2019,4.81,3700,64.0


In [73]:
#reverse sorting order `ascending = False`
demographic_data.sort_values("life_expectancy", ascending=False)

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
17279,Singapore,2019,1.27,90100,85.10
17278,Singapore,2018,1.26,90100,85.00
17277,Singapore,2017,1.25,87800,84.80
17276,Singapore,2016,1.25,84700,84.70
9839,Japan,2019,1.50,39700,84.50
...,...,...,...,...,...
19938,Tonga,1918,6.51,969,5.96
3378,Cameroon,1918,5.54,1030,5.95
13444,Namibia,1904,5.96,1900,5.19
9993,Kazakhstan,1933,5.85,3120,4.07


In [74]:
# overwriting our data: with `inplace=True`, `.sort_values()` reorders
# demographic_data itself instead of returning a sorted copy
# (same effect as: demographic_data = demographic_data.sort_values(["year", "country"]))
demographic_data.sort_values(["year", "country"], inplace=True)
demographic_data

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.00,1090,29.4
120,Albania,1900,4.60,1220,35.4
240,Algeria,1900,6.99,1750,30.2
360,Angola,1900,7.00,958,29.0
480,Antigua and Barbuda,1900,4.63,1300,33.8
...,...,...,...,...,...
21599,Venezuela,2019,2.25,9720,75.1
21719,Vietnam,2019,1.94,6970,74.7
21839,Yemen,2019,3.69,2340,68.1
21959,Zambia,2019,4.81,3700,64.0


## changing our data 

Rename columns with `df.rename(columns = {"old_name": "new_name"})`

In [81]:
# rename income_per_person to gdp; inplace=True changes demographic_data itself,
# so every later cell has to refer to the column as `gdp`
demographic_data.rename(columns = {"income_per_person":"gdp"}, inplace=True)
demographic_data

Unnamed: 0,country,year,fertility_rate,gdp,life_expectancy
0,Afghanistan,1900,7.00,1090,29.4
120,Albania,1900,4.60,1220,35.4
240,Algeria,1900,6.99,1750,30.2
360,Angola,1900,7.00,958,29.0
480,Antigua and Barbuda,1900,4.63,1300,33.8
...,...,...,...,...,...
21599,Venezuela,2019,2.25,9720,75.1
21719,Vietnam,2019,1.94,6970,74.7
21839,Yemen,2019,3.69,2340,68.1
21959,Zambia,2019,4.81,3700,64.0


In [82]:
# rename several columns at once by adding more old:new pairs to the dict
# without inplace=True (or reassignment) the renamed frame is only displayed;
# demographic_data keeps its `country` and `year` column names
demographic_data.rename(columns={"country":"place", "year":"time"})

Unnamed: 0,place,time,fertility_rate,gdp,life_expectancy
0,Afghanistan,1900,7.00,1090,29.4
120,Albania,1900,4.60,1220,35.4
240,Algeria,1900,6.99,1750,30.2
360,Angola,1900,7.00,958,29.0
480,Antigua and Barbuda,1900,4.63,1300,33.8
...,...,...,...,...,...
21599,Venezuela,2019,2.25,9720,75.1
21719,Vietnam,2019,1.94,6970,74.7
21839,Yemen,2019,3.69,2340,68.1
21959,Zambia,2019,4.81,3700,64.0


## changing columns

In [86]:
# let's quadruple fertility rate (for a laugh) & overwrite that column
# NOTE: this reads and overwrites the same column, so every re-run of this cell
# multiplies the values by 4 again
demographic_data.fertility_rate = demographic_data.fertility_rate * 4

In [87]:
demographic_data

Unnamed: 0,country,year,fertility_rate,gdp,life_expectancy
0,Afghanistan,1900,28.00,1090,29.4
120,Albania,1900,18.40,1220,35.4
240,Algeria,1900,27.96,1750,30.2
360,Angola,1900,28.00,958,29.0
480,Antigua and Barbuda,1900,18.52,1300,33.8
...,...,...,...,...,...
21599,Venezuela,2019,9.00,9720,75.1
21719,Vietnam,2019,7.76,6970,74.7
21839,Yemen,2019,14.76,2340,68.1
21959,Zambia,2019,19.24,3700,64.0


## creating new columns

In [88]:
# method 1: bracket assignment adds the new column to demographic_data itself
# (dividing by 4 undoes the quadrupling applied to fertility_rate in the previous section)
demographic_data["fertility_rate_updated"] = demographic_data.fertility_rate / 4

In [90]:
# method 2: .assign() returns a new DataFrame and leaves demographic_data unchanged,
# so the result can be stored directly (no extra .copy() is needed)
demo_data_since_1900 = demographic_data.assign(
    # create a new column; the lambda receives the frame being built in the chain
    years_since_1900=lambda df: df["year"] - 1900,
    # modify an existing column (only in the returned frame)
    fertility_rate=lambda df: df["fertility_rate"].round(),
)

In [95]:
# task: add income expressed in thousands as a new column
# (.assign() returns a new frame; occupation_prestige itself is not modified)
occupation_prestige.assign(income_1000s=lambda df: df["income"] / 1000)

Unnamed: 0,job,education,income,women,prestige,census,type,income_1000s
0,gov.administrators,13.11,12351,11.16,68.8,1113,prof,12.351
1,general.managers,12.26,25879,4.02,69.1,1130,prof,25.879
2,accountants,12.77,9271,15.70,63.4,1171,prof,9.271
3,purchasing.officers,11.42,8865,9.11,56.8,1175,prof,8.865
4,chemists,14.62,8403,11.68,73.5,2111,prof,8.403
...,...,...,...,...,...,...,...,...
97,bus.drivers,7.58,5562,9.47,35.9,9171,bc,5.562
98,taxi.drivers,7.93,4224,3.59,25.1,9173,bc,4.224
99,longshoremen,8.37,4753,,26.1,9313,bc,4.753
100,typesetters,10.00,6462,13.58,42.2,9511,bc,6.462


## summarising the data

In [97]:
# min fertility rate
# NOTE: the 4.48 shown is 4 x the minimum reported by .describe() earlier (1.12),
# because fertility_rate was quadrupled in the "changing columns" section
demographic_data.fertility_rate.min()

4.48

In [98]:
#median life expectancy
demographic_data.life_expectancy.median()

53.55

In [99]:
#filter table where life exp lower than median value
mask = demographic_data.life_expectancy < demographic_data.life_expectancy.median()
demographic_data.loc[mask]

Unnamed: 0,country,year,fertility_rate,gdp,life_expectancy,fertility_rate_updated
0,Afghanistan,1900,28.00,1090,29.4,7.00
120,Albania,1900,18.40,1220,35.4,4.60
240,Algeria,1900,27.96,1750,30.2,6.99
360,Angola,1900,28.00,958,29.0,7.00
480,Antigua and Barbuda,1900,18.52,1300,33.8,4.63
...,...,...,...,...,...,...
3836,Central African Republic,2016,19.48,731,51.7,4.87
11036,Lesotho,2016,12.36,2940,52.5,3.09
3837,Central African Republic,2017,19.20,754,51.9,4.80
3838,Central African Republic,2018,18.88,775,52.4,4.72


## summarising the data for groups

In [106]:
#table of max life expectancy for each country
(
    demographic_data
    #group
    .groupby("country")
    #select column
    .life_expectancy
    #define statistic
    .max()
    #reset to df
    .reset_index(name="max_life_expectancy")
)

Unnamed: 0,country,max_life_expectancy
0,Afghanistan,64.1
1,Albania,78.5
2,Algeria,78.1
3,Angola,65.0
4,Antigua and Barbuda,77.3
...,...,...
179,Venezuela,75.3
180,Vietnam,74.7
181,Yemen,69.0
182,Zambia,64.0


In [109]:
# task: highest income within each job type, returned as a two-column DataFrame
(
    occupation_prestige
    .groupby("type")
    .income
    .max()
    .reset_index(name="max_income")
)

Unnamed: 0,type,max_income
0,bc,8895
1,prof,25879
2,wc,8780


In [120]:
# mean prestige within each job type
(
    occupation_prestige
    .groupby("type")
    .prestige
    .mean()
    .reset_index(name="avg_prestige")
)

Unnamed: 0,type,avg_prestige
0,bc,35.527273
1,prof,67.848387
2,wc,42.243478


In [119]:
# lowest value of the `women` column within each job type
(
    occupation_prestige
    .groupby("type")["women"]
    .min()
    .reset_index(name="lowest_pcnt_wmn")
)

Unnamed: 0,type,lowest_pcnt_wmn
0,bc,0.52
1,prof,0.58
2,wc,3.16


## dealing with missing values

In [122]:
#check for missing values
demographic_data.isna().sum()

country                   0
year                      0
fertility_rate            0
gdp                       0
life_expectancy           0
fertility_rate_updated    0
dtype: int64

In [128]:
# load a version of the dataset that contains missing values,
# used by the cells below to practise dropping and imputing them
demo_missing = pd.read_csv("data/life_expectancy_and_income_missing.csv")

In [131]:
# count the missing values per column; display() is needed because a notebook
# only shows the value of a cell's last expression automatically
display(demo_missing.isna().sum())

# drop every row that contains at least one missing value
# (returns a new DataFrame; demo_missing itself still holds the missing values)
demo_missing.dropna()

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.00,1090.0,29.4
2,Afghanistan,1902,7.00,1120.0,29.5
3,Afghanistan,1903,7.00,1140.0,29.6
4,Afghanistan,1904,7.00,1160.0,29.7
5,Afghanistan,1905,7.00,1180.0,29.7
...,...,...,...,...,...
22075,Zimbabwe,2015,3.84,2510.0,59.6
22076,Zimbabwe,2016,3.76,2490.0,60.5
22077,Zimbabwe,2017,3.68,2570.0,61.4
22078,Zimbabwe,2018,3.61,2620.0,61.7


In [133]:
demo_missing.dropna(subset = ["life_expectancy"]) #drop for specific column

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.00,1090.0,29.4
2,Afghanistan,1902,7.00,1120.0,29.5
3,Afghanistan,1903,7.00,1140.0,29.6
4,Afghanistan,1904,7.00,1160.0,29.7
5,Afghanistan,1905,7.00,1180.0,29.7
...,...,...,...,...,...
22075,Zimbabwe,2015,3.84,2510.0,59.6
22076,Zimbabwe,2016,3.76,2490.0,60.5
22077,Zimbabwe,2017,3.68,2570.0,61.4
22078,Zimbabwe,2018,3.61,2620.0,61.7


In [136]:
#imputing missing values with 0
demo_missing.fillna(value = 0)

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.00,1090.0,29.4
1,Afghanistan,1901,7.00,1110.0,0.0
2,Afghanistan,1902,7.00,1120.0,29.5
3,Afghanistan,1903,7.00,1140.0,29.6
4,Afghanistan,1904,7.00,1160.0,29.7
...,...,...,...,...,...
22075,Zimbabwe,2015,3.84,2510.0,59.6
22076,Zimbabwe,2016,3.76,2490.0,60.5
22077,Zimbabwe,2017,3.68,2570.0,61.4
22078,Zimbabwe,2018,3.61,2620.0,61.7


In [138]:
# imputing missing values with mean/median
# a dict maps column name -> fill value, so each column gets its own replacement:
# the overall mean for life_expectancy, the overall median for income_per_person
demo_missing.fillna(
value = {
    'life_expectancy' : demo_missing.life_expectancy.mean(),
    'income_per_person' : demo_missing.income_per_person.median()
}
)

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.00,1090.0,29.400000
1,Afghanistan,1901,7.00,1110.0,52.725449
2,Afghanistan,1902,7.00,1120.0,29.500000
3,Afghanistan,1903,7.00,1140.0,29.600000
4,Afghanistan,1904,7.00,1160.0,29.700000
...,...,...,...,...,...
22075,Zimbabwe,2015,3.84,2510.0,59.600000
22076,Zimbabwe,2016,3.76,2490.0,60.500000
22077,Zimbabwe,2017,3.68,2570.0,61.400000
22078,Zimbabwe,2018,3.61,2620.0,61.700000


In [140]:
# impute missing values in life_expectancy with the mean life expectancy of that country

mean_per_country = demo_missing.groupby("country").life_expectancy.transform("mean")
# transform("mean") returns one value per original row (each row carries its own
# country's mean) rather than one value per country, so it lines up with demo_missing

# life_expectancy gaps get the per-country mean; income_per_person gaps still get
# the overall median
demo_missing.fillna(
value = {
    'life_expectancy' : mean_per_country,
    'income_per_person' : demo_missing.income_per_person.median()
}
)

Unnamed: 0,country,year,fertility_rate,income_per_person,life_expectancy
0,Afghanistan,1900,7.00,1090.0,29.400000
1,Afghanistan,1901,7.00,1110.0,41.233361
2,Afghanistan,1902,7.00,1120.0,29.500000
3,Afghanistan,1903,7.00,1140.0,29.600000
4,Afghanistan,1904,7.00,1160.0,29.700000
...,...,...,...,...,...
22075,Zimbabwe,2015,3.84,2510.0,59.600000
22076,Zimbabwe,2016,3.76,2490.0,60.500000
22077,Zimbabwe,2017,3.68,2570.0,61.400000
22078,Zimbabwe,2018,3.61,2620.0,61.700000


In [142]:
#task
occupation_prestige.isna().sum()

job          0
education    0
income       0
women        5
prestige     0
census       0
type         4
dtype: int64

In [152]:
# task: label the missing job types as "other"
# inplace=True changes occupation_prestige itself; the final cell of the notebook
# filters on type != 'other' and therefore reads this label
occupation_prestige.fillna(value = {'type' : "other"},inplace=True)

In [148]:
occupation_prestige.women.mean()

30.47278350515466

In [161]:
# mean of `women` when the missing values are counted as 0 instead of being skipped
occupation_prestige["women"].fillna(0).mean()

28.979019607843156

In [None]:
# culmination of everything: mean income (in thousands) per job type,
# leaving out the jobs whose type is missing
(
    occupation_prestige
    # fill the gaps inside the chain so this cell does not depend on the earlier
    # in-place fillna cell having been run: missing women -> 0, missing type -> "other"
    .fillna(value={"women": 0, "type": "other"})
    # lambdas operate on the frame at this point in the chain, not on the outer variable
    .loc[lambda df: df["type"] != "other", ["job", "income", "women", "type"]]
    .assign(income_1000s=lambda df: df["income"] / 1000)
    .groupby("type")
    .income_1000s
    .mean()
)