# Chapter 21: Creating and Updating Columns

## 21.1 Loading the Data

In [1]:
import pandas as pd
import numpy as np
url = 'https://github.com/mattharrison/datasets/raw/master/data/2020-jetbrains-python-survey.csv'
jb = pd.read_csv(url)
jb    
 

  jb = pd.read_csv(url)


Unnamed: 0,is.python.main,other.lang.None,other.lang.Java,other.lang.JavaScript,other.lang.C/C++,other.lang.PHP,other.lang.C#,other.lang.Ruby,other.lang.Bash / Shell,other.lang.Objective-C,...,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other,age,country.live
0,Yes,,,,,,,,Bash / Shell,,...,,,Business analyst,,,,,,30–39,
1,Yes,,Java,JavaScript,,,C#,,,,...,,,,,,,,,21–29,India
2,Yes,,,,C/C++,,,,Bash / Shell,,...,Technical support,Data analyst,,Team lead,,,,,30–39,United States
3,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,,
4,Yes,,Java,JavaScript,C/C++,,,,Bash / Shell,,...,,,,,,,,,21–29,Italy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,Yes,,,,C/C++,,,,Bash / Shell,Objective-C,...,,,,,,,Systems analyst,,21–29,Russian Federation
54458,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,,
54459,Yes,,,JavaScript,,PHP,,,Bash / Shell,,...,,,,,,CIO / CEO / CTO,,,21–29,Russian Federation
54460,Yes,,,JavaScript,C/C++,PHP,,,Bash / Shell,,...,,Data analyst,,,,,,,30–39,Spain


In [6]:
jb.columns

Index(['is.python.main', 'other.lang.None', 'other.lang.Java',
       'other.lang.JavaScript', 'other.lang.C/C++', 'other.lang.PHP',
       'other.lang.C#', 'other.lang.Ruby', 'other.lang.Bash / Shell',
       'other.lang.Objective-C',
       ...
       'job.role.Technical support', 'job.role.Data analyst',
       'job.role.Business analyst', 'job.role.Team lead',
       'job.role.Product manager', 'job.role.CIO / CEO / CTO',
       'job.role.Systems analyst', 'job.role.Other', 'age', 'country.live'],
      dtype='object', length=264)

- Determine whether a feature can be multiple values and removes those

In [2]:
import collections
counter = collections.defaultdict(list)

for col in sorted(jb.columns):
    period_count = col.count('.')
    if period_count >= 2:
        part_end = 2
    else:
        part_end = 1

    parts = col.split('.')[:part_end]
    counter['.'.join(parts)].append(col)

In [5]:
counter

defaultdict(list,
            {'age': ['age'],
             'are.you': ['are.you.datascientist'],
             'bigdata': ['bigdata.Apache Beam',
              'bigdata.Apache Flink',
              'bigdata.Apache Hadoop/MapReduce',
              'bigdata.Apache Hive',
              'bigdata.Apache Kafka',
              'bigdata.Apache Samza',
              'bigdata.Apache Spark',
              'bigdata.Apache Tez',
              'bigdata.ClickHouse',
              'bigdata.Dask',
              'bigdata.None',
              'bigdata.Other'],
             'ci': ['ci.AppVeyor',
              'ci.Bamboo',
              'ci.CircleCI',
              'ci.CruiseControl',
              'ci.Gitlab CI',
              'ci.Jenkins / Hudson',
              'ci.None',
              'ci.Other',
              'ci.TeamCity',
              'ci.Travis CI'],
             'cloud.platform': ['cloud.platform.AWS',
              'cloud.platform.DigitalOcean',
              'cloud.platform.Google Cloud Platfor

In [3]:
uniq_cols = []
for cols in counter.values():
    if len(cols) == 1:
        uniq_cols.extend(cols)

In [4]:
uniq_cols

['age',
 'are.you.datascientist',
 'company.size',
 'country.live',
 'employment.status',
 'first.learn.about.main.ide',
 'how.often.use.main.ide',
 'ide.main',
 'is.python.main',
 'job.team',
 'main.purposes',
 'missing.features.main.ide',
 'nps.main.ide',
 'python.years',
 'python2.version.most',
 'python3.version.most',
 'several.projects',
 'team.size',
 'use.python.most',
 'years.of.coding']

- Column names have a period in them. Replace those with underscore as it will allow us to access the names of the columns via attributes

In [7]:
(jb
[uniq_cols]
.rename(columns=lambda c: c.replace('.', '_'))
.age
.value_counts(dropna=False))

NaN            29701
21–29           9710
30–39           7512
40–49           3010
18–20           2567
50–59           1374
60 or older      588
Name: age, dtype: int64

- Pull out the first two characters from the ``age`` column and convert it to numbers
- We will have to convert it to float because there are missing values

In [10]:
(jb
[uniq_cols]
.rename(columns=lambda c: c.replace('.', '_'))
.age
.str.slice(0,2) # or .str[0:2]
.astype(float)
.astype('Int64'))

0          30
1          21
2          30
3        <NA>
4          21
         ... 
54457      21
54458    <NA>
54459      21
54460      30
54461      21
Name: age, Length: 54462, dtype: Int64

- Put the cleaned column in dataframe using ``.assign`` method

In [11]:
# 1. Pull out the columns we want
# 2. Rename the columns
# 3. Update the age column

(jb
[uniq_cols]
.rename(columns=lambda c: c.replace('.', '_'))
.assign(age=lambda df_:df_.age
                          .str.slice(0,2) # or .str[0:2]
                          .astype(float)
                          .astype('Int64'))
)

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python2_version_most,python3_version_most,several_projects,team_size,use_python_most,years_of_coding
0,30,,Just me,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",,,1–2 years
1,21,Yes,"More than 5,000",India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",2-7 people,Software prototyping,3–5 years
2,30,No,"More than 5,000",United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3–5 years
3,,,,,,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11+ years,,Python 3_8,"Yes, I work on many different projects",,Web development,11+ years
4,21,,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1–2 years,,Python 3_8,"Yes, I work on one main and several side projects",,Web development,Less than 1 year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,21,No,2–10,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6–10 years,,Python 3_6,"Yes, I work on many different projects",,Data analysis,1–2 years
54458,,No,,,,,,,Yes,,Both for work and personal,,,3–5 years,,Python 3_7,,,Web development,1–2 years
54459,21,,Just me,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3–5 years,,Python 3_7,"Yes, I work on many different projects",2-7 people,Web development,6–10 years
54460,30,Yes,51–500,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6–10 years,,Python 3_7,"Yes, I work on many different projects",,Data analysis,3–5 years


## 21.2 More Column Cleanup

- Convert the ``are_you_datascientist`` column to a boolean columns

In [13]:
(jb
[uniq_cols]
.rename(columns=lambda c: c.replace('.', '_'))
.assign(age=lambda df_:df_.age
                          .str.slice(0,2) # or .str[0:2]
                          .astype(float)
                          .astype('Int64'),
        are_you_datascientist=lambda df_: df_.are_you_datascientist
                                             .replace({'Yes': True,
                                                       'No': False,
                                                       np.nan: False})
)
.are_you_datascientist
)

0        False
1         True
2        False
3        False
4        False
         ...  
54457    False
54458    False
54459    False
54460     True
54461    False
Name: are_you_datascientist, Length: 54462, dtype: object

- Look at ``company_size``

In [14]:
(jb
[uniq_cols]
.rename(columns=lambda c: c.replace('.', '_'))
.assign(age=lambda df_:df_.age
                          .str.slice(0,2) # or .str[0:2]
                          .astype(float)
                          .astype('Int64'),
        are_you_datascientist=lambda df_: df_.are_you_datascientist
                                             .replace({'Yes': True,
                                                       'No': False,
                                                       np.nan: False})
)
.company_size
.value_counts(dropna=False)
)

NaN                35037
51–500              4608
More than 5,000     3635
11–50               3507
2–10                2558
1,001–5,000         1934
Just me             1492
501–1,000           1165
Not sure             526
Name: company_size, dtype: int64

In [15]:
jb2 = (jb
 [uniq_cols]
 .rename(columns=lambda c: c.replace('.', '_'))
 .assign(age=lambda df_:df_.age.str.slice(0,2).astype(float)
             .astype('Int64'),
         are_you_datascientist=lambda df_:df_.are_you_datascientist
             .replace({'Yes': True, 'No': False, np.nan: False}),
         company_size=lambda df_:df_.company_size.replace({
             'Just me': 1, 'Not sure': np.nan, 
             'More than 5,000': 5000, '2–10': 2, '11–50':11,
             '51–500': 51, '501–1,000':501,
             '1,001–5,000':1001}).astype('Int64'),
         country_live=lambda df_:df_.country_live.astype('category'),
         employment_status=lambda df_:df_.employment_status
              .fillna('Other').astype('category'),
         is_python_main=lambda df_:df_.is_python_main
              .astype('category'),
         team_size=lambda df_:df_.team_size
             .str.split(r'-', n=1, expand=True)
             .iloc[:,0].replace('More than 40 people', 41)
             .where(df_.company_size!=1, 1).astype(float),
         years_of_coding=lambda df_:df_.years_of_coding
             .replace('Less than 1 year', .5).str.extract(r'(\d+)')
             .astype(float),
         python_years=lambda df_:df_.python_years
             .replace('Less than 1 year', .5).str.extract(r'(\d+)')
             .astype(float),
         python3_ver=lambda df_:df_.python3_version_most
              .str.replace('_', '.').str.extract(r'(\d\.\d)')
              .astype(float),
         use_python_most=lambda df_:df_.use_python_most
              .fillna('Unknown')
        )
    .drop(columns=['python2_version_most'])
)


In [16]:
jb2

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
0,30,False,1,,Partially employed by a company / organization,Conference / User Group,Weekly,PyCharm Community Edition,Yes,Work as an external consultant or trainer,For work,"No, it has all the features I need",3.0,3.0,Python 3_7,"Yes, I work on many different projects",1.0,Unknown,1.0,3.7
1,21,True,5000,India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2.0,Software prototyping,3.0,3.6
2,30,False,5000,United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",,DevOps / System administration / Writing autom...,3.0,3.6
3,,False,,,Other,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,,Both for work and personal,Yes – Please list:,10.0,11.0,Python 3_8,"Yes, I work on many different projects",1.0,Web development,11.0,3.8
4,21,False,,Italy,Student,Search engines,Daily,VS Code,Yes,Work on your own project(s) independently,"For personal, educational or side projects","No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",1.0,Web development,,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54457,21,False,2,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",,Data analysis,1.0,3.6
54458,,False,,,Other,,,,Yes,,Both for work and personal,,,3.0,Python 3_7,,1.0,Web development,1.0,3.7
54459,21,False,1,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",1.0,Web development,6.0,3.7
54460,30,True,51,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",,Data analysis,3.0,3.7


- Taking a closer look at ``team_size`` 

In [20]:
(jb2
.query('team_size.isna()') # filter team_size == na
.employment_status
.value_counts(dropna=False)
)

Fully employed by a company / organization                                                        5279
Working student                                                                                    696
Partially employed by a company / organization                                                     482
Self-employed (a person earning income directly from one's own business, trade, or profession)     430
Freelancer (a person pursuing a profession without a long-term commitment to any one employer)       0
Other                                                                                                0
Retired                                                                                              0
Student                                                                                              0
Name: employment_status, dtype: int64

- Call ``.assign`` to use machine learning to predict the missing values for that column by leveraging CatBoost library (Category Boosting)
- A nice feature of this library is that it will accept missing values and also string values
- CatBoost doesn't like native pandas type ('Int64', 'category') so we will make a function ``prep_for_ml``

In [22]:
import catboost as cb 
import numpy as np

In [23]:
def prep_for_ml(df):
    # remove pandas types
    return (df
    .assign(**{col:df[col].astype(float)
              for col in df.select_dtypes('number')},
            **{col:df[col].astype(str).fillna('')
              for col in df.select_dtypes(['object', 'category'])}))

In [24]:
def predict_col(df, col):
    df = prep_for_ml(df)
    missing = df.query(f'~{col}.isna()')
    cat_idx = [i for i,typ in enumerate(df.drop(columns=[col]).dtypes)
               if str(typ) == 'object']
    X = (missing
         .drop(columns=[col])
         .values
        )
    y = missing[col]
    model = cb.CatBoostRegressor(iterations=20, cat_features=cat_idx)
    model.fit(X,y, cat_features=cat_idx)
    pred = model.predict(df.drop(columns=[col]))
    return df[col].where(~df[col].isna(), pred)


In [26]:
jb2 = (jb
  [uniq_cols]
  .rename(columns=lambda c: c.replace('.', '_'))
  .assign(age=lambda df_:df_.age.str.slice(0,2).astype(float)
             .astype('Int64'),
         are_you_datascientist=lambda df_:df_.are_you_datascientist
             .replace({'Yes': True, 'No': False, np.nan: False}),
         company_size=lambda df_:df_.company_size.replace({
             'Just me': 1, 'Not sure': np.nan, 
             'More than 5,000': 5000, '2–10': 2, '11–50':11,
             '51–500': 51, '501–1,000':501,
             '1,001–5,000':1001}).astype('Int64'),
         country_live=lambda df_:df_.country_live.astype('category'),
         employment_status=lambda df_:df_.employment_status
              .fillna('Other').astype('category'),
         is_python_main=lambda df_:df_.is_python_main
              .astype('category'),
         team_size=lambda df_:df_.team_size
             .str.split(r'-', n=1, expand=True)
             .iloc[:,0].replace('More than 40 people', 41)
             .where(df_.company_size!=1, 1).astype(float),
         years_of_coding=lambda df_:df_.years_of_coding
             .replace('Less than 1 year', .5).str.extract(r'(\d+)')
             .astype(float),
         python_years=lambda df_:df_.python_years
             .replace('Less than 1 year', .5).str.extract(r'(\d+)')
             .astype(float),
         python3_ver=lambda df_:df_.python3_version_most
              .str.replace('_', '.').str.extract(r'(\d\.\d)')
              .astype(float),
         use_python_most=lambda df_:df_.use_python_most
              .fillna('Unknown')
        )
  .assign(team_size=lambda df_:predict_col(df_, 'team_size')
          .astype(int))
  .drop(columns=['python2_version_most'])
  .dropna()
)

Learning rate set to 0.5
0:	learn: 2.9695218	total: 26.8ms	remaining: 509ms
1:	learn: 2.8766539	total: 52.7ms	remaining: 475ms
2:	learn: 2.8387189	total: 81.6ms	remaining: 462ms
3:	learn: 2.8028751	total: 110ms	remaining: 440ms
4:	learn: 2.7899957	total: 136ms	remaining: 408ms
5:	learn: 2.7749439	total: 166ms	remaining: 386ms
6:	learn: 2.7719128	total: 198ms	remaining: 368ms
7:	learn: 2.7649792	total: 229ms	remaining: 344ms
8:	learn: 2.7649588	total: 256ms	remaining: 313ms
9:	learn: 2.7630617	total: 286ms	remaining: 286ms
10:	learn: 2.7625779	total: 315ms	remaining: 258ms
11:	learn: 2.7515902	total: 344ms	remaining: 229ms
12:	learn: 2.7513459	total: 370ms	remaining: 199ms
13:	learn: 2.7445634	total: 401ms	remaining: 172ms
14:	learn: 2.7443257	total: 431ms	remaining: 144ms
15:	learn: 2.7423142	total: 467ms	remaining: 117ms
16:	learn: 2.7419143	total: 507ms	remaining: 89.5ms
17:	learn: 2.7399387	total: 547ms	remaining: 60.8ms
18:	learn: 2.7384296	total: 581ms	remaining: 30.6ms
19:	learn:

In [27]:
jb2

Unnamed: 0,age,are_you_datascientist,company_size,country_live,employment_status,first_learn_about_main_ide,how_often_use_main_ide,ide_main,is_python_main,job_team,main_purposes,missing_features_main_ide,nps_main_ide,python_years,python3_version_most,several_projects,team_size,use_python_most,years_of_coding,python3_ver
1,21,True,5000,India,Fully employed by a company / organization,School / University,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",8.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",2,Software prototyping,3.0,3.6
2,30,False,5000,United States,Fully employed by a company / organization,Friend / Colleague,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_6,"Yes, I work on one main and several side projects",4,DevOps / System administration / Writing autom...,3.0,3.6
10,21,False,51,Other country,Fully employed by a company / organization,School / University,Daily,IntelliJ IDEA,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,1.0,Python 3_8,"Yes, I work on one main and several side projects",2,Web development,1.0,3.8
11,21,True,51,United States,Fully employed by a company / organization,Online learning platform / Online course,Daily,PyCharm Community Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",9.0,3.0,Python 3_9,"Yes, I work on many different projects",2,Data analysis,3.0,3.9
13,30,True,5000,Belgium,Fully employed by a company / organization,Social network,Daily,VS Code,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_7,"Yes, I work on many different projects",2,Data analysis,3.0,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54456,30,False,1001,Turkey,Fully employed by a company / organization,Friend / Colleague,Daily,PyCharm Community Edition,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",9.0,1.0,Python 3_6,"Yes, I work on many different projects",5,Machine learning,6.0,3.6
54457,21,False,2,Russian Federation,Fully employed by a company / organization,School / University,Daily,Vim,Yes,Work on your own project(s) independently,Both for work and personal,"No, it has all the features I need",10.0,6.0,Python 3_6,"Yes, I work on many different projects",2,Data analysis,1.0,3.6
54459,21,False,1,Russian Federation,Self-employed (a person earning income directl...,Friend / Colleague,Daily,PyCharm Professional Edition,Yes,Work in a team,Both for work and personal,"No, it has all the features I need",10.0,3.0,Python 3_7,"Yes, I work on many different projects",1,Web development,6.0,3.7
54460,30,True,51,Spain,Fully employed by a company / organization,Search engines,Daily,Other,Yes,Work on your own project(s) independently,Both for work and personal,Yes – Please list:,3.0,6.0,Python 3_7,"Yes, I work on many different projects",4,Data analysis,3.0,3.7
