# Advanced Pandas

In [2]:
import pandas as pd
import numpy as np

# Read adult.csv and shorten the hyphenated 'hours-per-week' column to 'hours',
# which lets later cells refer to it with attribute access (df.hours).
df = pd.read_csv('adult.csv').rename(columns={'hours-per-week': 'hours'})
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Distinct labels that occur in the income column.
df['income'].unique()

array(['<=50K', '>50K'], dtype=object)

In [4]:
# Number of rows for each race label.
df['race'].value_counts()

White                 41762
Black                  4685
Asian-Pac-Islander     1519
Amer-Indian-Eskimo      470
Other                   406
Name: race, dtype: int64

## Finding Average Work Hours For Each Race

In [7]:
# Manual approach: filter the frame once per race label and average the
# 'hours' values of each subset, printing one mean per line.
for race_name in ('White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'):
    print(df.loc[df['race'] == race_name, 'hours'].mean())

40.66016953211053
38.59786552828175
39.884792626728114
40.272340425531915
39.20197044334975


## Finding Average Work Hours For Each Race And Income Level

In [8]:
# Same manual filtering, now with two conditions combined (race AND income).
for income_level in ('>50K', '<=50K'):
    print(df.loc[(df['race'] == 'White') & (df['income'] == income_level), 'hours'].mean())
# ...and the same again for every other race: a very tedious job!
# What can we do instead?

45.551899688884696
38.994735997432194


## Pivot Table

In [9]:
# Mean of every numeric column, one row per race.
# `values` is limited to the numeric columns explicitly: the frame also holds
# text columns (workclass, education, ...) and, since pandas 2.0, those are no
# longer dropped silently -- aggfunc='mean' raises a TypeError on them.
pd.pivot_table(
    df,
    index=['race'],
    values=list(df.select_dtypes(include='number').columns),
    aggfunc='mean',
)

Unnamed: 0_level_0,age,capital-gain,capital-loss,educational-num,fnlwgt,hours
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Amer-Indian-Eskimo,36.693617,538.976596,37.8,9.387234,120110.985106,40.27234
Asian-Pac-Islander,37.85846,1537.266623,95.557604,10.998683,159796.946675,39.884793
Black,37.914408,588.192316,61.575454,9.491142,230485.070438,38.597866
Other,33.657635,983.224138,62.958128,8.839901,195715.689655,39.20197
White,38.824362,1124.479766,90.915857,10.130262,186894.995738,40.66017


In [11]:
# Option 1: aggregate all numeric columns, then pick 'hours' (gives a Series).
# The numeric columns are passed explicitly because pandas >= 2.0 raises a
# TypeError when aggfunc='mean' reaches the text columns of the frame.
pd.pivot_table(
    df,
    index=['race'],
    values=list(df.select_dtypes(include='number').columns),
    aggfunc='mean',
)['hours']
# or
# Option 2: ask the pivot table for 'hours' only (gives a one-column DataFrame).
# Only this last expression is displayed as the cell output.
pd.pivot_table(df, index=['race'], values=['hours'], aggfunc='mean')

Unnamed: 0_level_0,hours
race,Unnamed: 1_level_1
Amer-Indian-Eskimo,40.27234
Asian-Pac-Islander,39.884793
Black,38.597866
Other,39.20197
White,40.66017


In [14]:
# Mean hours broken down by race (rows) and income level (columns).
df.pivot_table(
    index='race',
    columns='income',
    values=['hours'],
    aggfunc='mean',
)


Unnamed: 0_level_0,hours,hours
income,<=50K,>50K
race,Unnamed: 1_level_2,Unnamed: 2_level_2
Amer-Indian-Eskimo,39.816867,43.709091
Asian-Pac-Islander,38.012613,44.96577
Black,37.824958,44.222615
Other,38.488764,44.28
White,38.994736,45.5519


In [15]:
# aggfunc accepts other aggregations too, e.g. 'sum', 'median', 'min', 'max', 'count', or a list of several.

## Group By

In [24]:
# Average of each numeric column for every (gender, income) combination.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer skips
# the text columns (workclass, education, ...) and raises a TypeError instead.
inc_gen_df = df.groupby(by=['gender', 'income']).mean(numeric_only=True)
# The result is indexed by (gender, income), so one cell is addressed with a tuple:
# inc_gen_df.loc[('Female',  '>50K'), 'hours']
inc_gen_df

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours
gender,income,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,<=50K,36.278999,185690.282604,9.824724,122.876586,46.910976,35.875962
Female,>50K,42.219333,183989.519503,11.832109,4313.663652,180.222725,40.678915
Male,<=50K,37.248548,192799.098671,9.454953,162.322673,58.746173,40.720702
Male,>50K,44.641863,189269.827082,11.561908,3993.827586,195.902299,46.304396


In [25]:
# Mean and row count of every numeric column per (gender, income) group.
# The numeric columns are selected before aggregating: on pandas >= 2.0 the
# 'mean' aggregation raises a TypeError if it is applied to text columns.
df.groupby(by=['gender', 'income'])[
    list(df.select_dtypes(include='number').columns)
].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,fnlwgt,fnlwgt,educational-num,educational-num,capital-gain,capital-gain,capital-loss,capital-loss,hours,hours
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,mean,count,mean,count,mean,count,mean,count,mean,count
gender,income,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Female,<=50K,36.278999,14423,185690.282604,14423,9.824724,14423,122.876586,14423,46.910976,14423,35.875962,14423
Female,>50K,42.219333,1769,183989.519503,1769,11.832109,1769,4313.663652,1769,180.222725,1769,40.678915,1769
Male,<=50K,37.248548,22732,192799.098671,22732,9.454953,22732,162.322673,22732,58.746173,22732,40.720702,22732
Male,>50K,44.641863,9918,189269.827082,9918,11.561908,9918,3993.827586,9918,195.902299,9918,46.304396,9918


## Join

In [34]:
# Two small example frames that share the same postcodes; 'postcode' becomes
# the index of both so the join/concat cells below can align rows on it.
df1 = pd.DataFrame(
    {
        'postcode': [31323, 456433, 23998, 38752, 23421],
        'price': [81.23, 50.22, 41.55, 55.80, 15.52],
    }
).set_index('postcode')

df2 = pd.DataFrame(
    {
        'postcode': [31323, 456433, 23998, 38752, 23421],
        'quality': [3, 2, 2, 5, 4],
        'address': ['asdfasdf1', 'asdasdf2', 'asdfasdf3', 'asdfasdf4', 'asdasdf5'],
    }
).set_index('postcode')

# Show both frames as plain text with a dashed separator line between them.
print(df1, '\n------------------\n', df2, sep='\n')

          price
postcode       
31323     81.23
456433    50.22
23998     41.55
38752     55.80
23421     15.52

------------------

          quality    address
postcode                    
31323           3  asdfasdf1
456433          2   asdasdf2
23998           2  asdfasdf3
38752           5  asdfasdf4
23421           4   asdasdf5


In [38]:
# Attach df2's columns to df1, matching rows on the shared postcode index
# (a left join, which is what DataFrame.join does by default).
df1.join(df2, how='left')

Unnamed: 0_level_0,price,quality,address
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31323,81.23,3,asdfasdf1
456433,50.22,2,asdasdf2
23998,41.55,2,asdfasdf3
38752,55.8,5,asdfasdf4
23421,15.52,4,asdasdf5


In [41]:
# Stack the two frames on top of each other (row-wise). Columns that a frame
# does not have are filled with NaN, as the output below shows.
pd.concat([df1, df2], axis='index')

Unnamed: 0_level_0,price,quality,address
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31323,81.23,,
456433,50.22,,
23998,41.55,,
38752,55.8,,
23421,15.52,,
31323,,3.0,asdfasdf1
456433,,2.0,asdasdf2
23998,,2.0,asdfasdf3
38752,,5.0,asdfasdf4
23421,,4.0,asdasdf5


In [42]:
# Place the two frames side by side (column-wise), aligning rows on the index.
pd.concat([df1, df2], axis='columns')

Unnamed: 0_level_0,price,quality,address
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31323,81.23,3,asdfasdf1
456433,50.22,2,asdasdf2
23998,41.55,2,asdfasdf3
38752,55.8,5,asdfasdf4
23421,15.52,4,asdasdf5
