# Chapter 26: Reshaping Dataframes with Dummies

In [2]:
import pandas as pd
import numpy as np
# Location of the 2020 JetBrains Python survey export (a CSV hosted on GitHub).
url = (
    'https://github.com/mattharrison/datasets/raw/master/data/'
    '2020-jetbrains-python-survey.csv'
)
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning that the
# default chunked parser can emit on a file like this one.
jb = pd.read_csv(url, low_memory=False)
# Preview the first five rows.
jb.head()


  jb = pd.read_csv(url)


Unnamed: 0,is.python.main,other.lang.None,other.lang.Java,other.lang.JavaScript,other.lang.C/C++,other.lang.PHP,other.lang.C#,other.lang.Ruby,other.lang.Bash / Shell,other.lang.Objective-C,...,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other,age,country.live
0,Yes,,,,,,,,Bash / Shell,,...,,,Business analyst,,,,,,30–39,
1,Yes,,Java,JavaScript,,,C#,,,,...,,,,,,,,,21–29,India
2,Yes,,,,C/C++,,,,Bash / Shell,,...,Technical support,Data analyst,,Team lead,,,,,30–39,United States
3,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,,
4,Yes,,Java,JavaScript,C/C++,,,,Bash / Shell,,...,,,,,,,,,21–29,Italy


## 26.1 Dummy Columns

In [3]:
# keep only the columns whose label contains the text 'job.role'
jb.loc[:, jb.columns.str.contains('job.role', regex=False)]

Unnamed: 0,job.role.DBA,job.role.Architect,job.role.QA engineer,job.role.Developer / Programmer,job.role.Technical writer,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other
0,,,,,,,,Business analyst,,,,,
1,,,,Developer / Programmer,,,,,,,,,
2,,,,Developer / Programmer,,Technical support,Data analyst,,Team lead,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,,,,,,,,,,,,Systems analyst,
54458,,,,,,,,,,,,,
54459,,,,,,,,,,,CIO / CEO / CTO,,
54460,,,,Developer / Programmer,,,Data analyst,,,,,,


In [7]:
# Collapse the job.role.* columns into a single column holding, for each
# respondent, the label of the first role column that is filled in.
# NOTE: idxmax returns the first maximal column, so a respondent with several
# roles keeps only the first one, and a respondent with no role at all (an
# all-zero row) is reported under the first job.role column.
(jb
 .filter(like='job.role')
 .notna()        # True where the respondent filled in that role
 .astype(int)    # 1/0 indicators with a numeric dtype for idxmax
 .idxmax(axis='columns')
)

0              job.role.Business analyst
1        job.role.Developer / Programmer
2        job.role.Developer / Programmer
3                           job.role.DBA
4                           job.role.DBA
                      ...               
54457           job.role.Systems analyst
54458                       job.role.DBA
54459           job.role.CIO / CEO / CTO
54460    job.role.Developer / Programmer
54461                 job.role.Architect
Length: 54462, dtype: object

In [8]:
# Build one job label per respondent: take the first filled-in job.role.*
# column and strip the 'job.role.' text from its name.
# NOTE: idxmax returns the first maximal column, so a respondent with several
# roles keeps only the first one, and a respondent with no role at all (an
# all-zero row) is reported under the first job.role column.
job = (jb
 .filter(like='job.role')
 .notna()        # True where the respondent filled in that role
 .astype(int)    # 1/0 indicators with a numeric dtype for idxmax
 .idxmax(axis='columns')
 .str.replace('job.role.', '', regex=False)
)
job

0              Business analyst
1        Developer / Programmer
2        Developer / Programmer
3                           DBA
4                           DBA
                  ...          
54457           Systems analyst
54458                       DBA
54459           CIO / CEO / CTO
54460    Developer / Programmer
54461                 Architect
Length: 54462, dtype: object

- To create dummy columns from a series, we call the ``pd.get_dummies`` function

In [10]:
# One indicator column per distinct job label.
# The dtype is pinned so the indicators are 0/1 integers on every pandas
# version (the default indicator dtype became bool in pandas 2.0).
dum = pd.get_dummies(job, dtype='uint8')
dum

Unnamed: 0,Architect,Business analyst,CIO / CEO / CTO,DBA,Data analyst,Developer / Programmer,Other,Product manager,QA engineer,Systems analyst,Team lead,Technical support,Technical writer
0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,0,0,0,0,0,0,0,0,0,1,0,0,0
54458,0,0,0,1,0,0,0,0,0,0,0,0,0
54459,0,0,1,0,0,0,0,0,0,0,0,0,0
54460,0,0,0,0,0,1,0,0,0,0,0,0,0


## 26.2 Undoing Dummy Columns

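- To go from data arranged in dummy columns back to a single column, recover the label of the set column in each row. Two ways are shown: ``idxmax`` (slower) and ``np.where`` (faster).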

In [11]:
# slowest: for each row, take the label of the first column holding the row maximum
dum.idxmax(axis=1)

0              Business analyst
1        Developer / Programmer
2        Developer / Programmer
3                           DBA
4                           DBA
                  ...          
54457           Systems analyst
54458                       DBA
54459           CIO / CEO / CTO
54460    Developer / Programmer
54461                 Architect
Length: 54462, dtype: object

In [17]:
# fastest: locate the non-zero cells with NumPy, then map the row/column
# positions back to labels.
# np.where returns positions, not labels, so the rows are looked up in
# dum.index; this keeps the result correctly labelled even when dum does not
# have a default 0..n-1 index.
# Assumes exactly one indicator is set per row; otherwise the result does not
# have one entry per row of dum.
i, j = np.where(dum)
pd.Series(dum.columns[j], index=dum.index[i])

0              Business analyst
1        Developer / Programmer
2        Developer / Programmer
3                           DBA
4                           DBA
                  ...          
54457           Systems analyst
54458                       DBA
54459           CIO / CEO / CTO
54460    Developer / Programmer
54461                 Architect
Length: 54462, dtype: object