### Import

In [307]:
import pandas as pd

### Preview data

In [308]:
raw_df = pd.read_csv('./data/raw_data.csv')
raw_df.head()

Unnamed: 0,company,company_size,job_title,level,domain,yoe_total,yoe_at_company,base,stock,bonus,total_compensation,location
0,Logitech,7250,Software Engineer,I4,Testing (SDET),10 yrs,5 yrs,190K,10K,,"$200,000",San Francisco Bay Area
1,Logitech,7250,Software Engineer,I2,ML / AI,4 yrs,3 yrs,126K,,7K,"$133,000","Vancouver, WA"
2,Logitech,7250,Software Engineer,I3,Testing (SDET),11+ yrs,11+ yrs,120K,5K,12K,"$137,000","San Francisco, CA"
3,Logitech,7250,Software Engineer,hidden,hidden,0-1 yrs,0-1 yrs,90K,,,"$90,000","Newark, CA"
4,Logitech,7250,Software Engineer,I4,Production,8 yrs,8 yrs,100K,10K,,"$110,000","Hsin-chu, TP, Taiwan"


### Data shape

In [309]:
shape = raw_df.shape
print(f'Raw data shape: {shape}')

Raw data shape: (2040, 12)


### Col meaning

In [310]:
col_meaning_df = pd.read_csv('./data/schema.csv')
col_meaning_df

Unnamed: 0,Query API,Meaning
0,company,Company
1,company_size,Company Size
2,job_title,Job Title
3,level,Level
4,domain,Domain
5,yoe_total,Years of Experience
6,yoe_at_company,Years of Experience at Company
7,base,Base Salary (USD)
8,stock,Stock (USD)
9,bonus,Bonus (USD)


### Check for duplicates

In [311]:
raw_df.drop_duplicates(inplace=True)
shape = raw_df.shape
print(f'Raw data shape after dropping duplicates: {shape}')

Raw data shape after dropping duplicates: (2038, 12)


### Types

In [312]:
dtypes = raw_df.dtypes
dtypes

company               object
company_size          object
job_title             object
level                 object
domain                object
yoe_total             object
yoe_at_company        object
base                  object
stock                 object
bonus                 object
total_compensation    object
location              object
dtype: object

In [313]:
# convert the company_size column from object (e.g. 2,000) to int (e.g. 2000)
raw_df['company_size'] = raw_df['company_size'].str.replace(',', '').astype(int)
raw_df['company_size'].head()

0    7250
1    7250
2    7250
3    7250
4    7250
Name: company_size, dtype: int64

In [314]:
# convert the base, stock, bonus columns from object (e.g. 80K) to int (e.g. 80000), skip the NaN values
# K -> 1000, M -> 1000000
raw_df['base'] = raw_df['base'].str.replace('K', '').str.replace('M', '000').astype(float) * 1000
raw_df['stock'] = raw_df['stock'].str.replace('K', '').str.replace('M', '000').astype(float) * 1000
raw_df['bonus'] = raw_df['bonus'].str.replace('K', '').str.replace('M', '000').astype(float) * 1000
raw_df[['base', 'stock', 'bonus']].head()

Unnamed: 0,base,stock,bonus
0,190000.0,10000.0,
1,126000.0,,7000.0
2,120000.0,5000.0,12000.0
3,90000.0,,
4,100000.0,10000.0,


In [315]:
# convert the total_compensation column from object (e.g. $240,000) to int (e.g. 240000)
raw_df['total_compensation'] = raw_df['total_compensation'].str.replace('$', '').str.replace(',', '').astype(float)
raw_df['total_compensation'].head()

0    200000.0
1    133000.0
2    137000.0
3     90000.0
4    110000.0
Name: total_compensation, dtype: float64

In [316]:
dtypes = raw_df.dtypes
dtypes

company                object
company_size            int64
job_title              object
level                  object
domain                 object
yoe_total              object
yoe_at_company         object
base                  float64
stock                 float64
bonus                 float64
total_compensation    float64
location               object
dtype: object

### Deal with missing values

In [317]:
# Check for null values
print('Number of null values in each column:')
raw_df.isnull().sum()

Number of null values in each column:


company                 0
company_size            0
job_title               0
level                   0
domain                  1
yoe_total               0
yoe_at_company          0
base                    0
stock                 619
bonus                 940
total_compensation      0
location                0
dtype: int64

In [318]:
def missing_ratio(s):
    return (s.isna().mean() * 100).round(1)

print('Missing values ratio:')
raw_df.apply(missing_ratio)

Missing values ratio:


company                0.0
company_size           0.0
job_title              0.0
level                  0.0
domain                 0.0
yoe_total              0.0
yoe_at_company         0.0
base                   0.0
stock                 30.4
bonus                 46.1
total_compensation     0.0
location               0.0
dtype: float64

In [319]:
# Drop columns with more than 50% null values
raw_df.dropna(thresh=0.5*raw_df.shape[0], axis=1, inplace=True)
raw_df.shape

(2038, 12)

In [320]:
# Fill all the null values in domain column with 'Others'
raw_df['domain'].fillna('Others', inplace=True)
raw_df['domain'].isnull().sum()

0

In [321]:
# Fill all the NaN values in company_size column with the median value
raw_df['company_size'].fillna(raw_df['company_size'].median(), inplace=True)
raw_df['company_size'].isnull().sum()

0

In [322]:
# Fill all the NaN values in base, stock, bonus columns with 0
raw_df['base'].fillna(0, inplace=True)
raw_df['stock'].fillna(0, inplace=True)
raw_df['bonus'].fillna(0, inplace=True)
raw_df[['base', 'stock', 'bonus']].isnull().sum()

base     0
stock    0
bonus    0
dtype: int64

In [323]:
# Fill all the NaN values in total_compensation column with the sum of base, stock, bonus columns
raw_df['total_compensation'].fillna(raw_df['base'] + raw_df['stock'] + raw_df['bonus'], inplace=True)
raw_df['total_compensation'].isnull().sum()

0

In [324]:
# Re-check for null values
print('Number of null values in each column:')
raw_df.isnull().sum()

Number of null values in each column:


company               0
company_size          0
job_title             0
level                 0
domain                0
yoe_total             0
yoe_at_company        0
base                  0
stock                 0
bonus                 0
total_compensation    0
location              0
dtype: int64

In [325]:
raw_df.shape

(2038, 12)

### Is the collected data reasonable?

In [326]:
# total_compensation must be greater than or equal to base, stock, bonus
rows_with_wrong_total_compensation = raw_df[raw_df['total_compensation'] < raw_df['base'] + raw_df['stock'] + raw_df['bonus']]
print(f'Number of rows with wrong total_compensation: {rows_with_wrong_total_compensation.shape[0]}')
print(f'Ratio: {rows_with_wrong_total_compensation.shape[0] / raw_df.shape[0] * 100:.2f}%')
rows_with_wrong_total_compensation.head()

Number of rows with wrong total_compensation: 294
Ratio: 14.43%


Unnamed: 0,company,company_size,job_title,level,domain,yoe_total,yoe_at_company,base,stock,bonus,total_compensation,location
5,Logitech,7250,Software Engineer,I1,ML / AI,2 yrs,0 yrs,123100.0,0.0,0.0,123097.0,"New York, NY"
8,Logitech,7250,Software Engineer,I2,Web Development (Front-End),4 yrs,2 yrs,51500.0,0.0,4800.0,56250.0,"Cork, CK, Ireland"
9,Microsoft,182268,Software Engineer,60,Full Stack,2 yrs,2 yrs,24000.0,7200.0,2400.0,33576.0,"Noida, UP, India"
10,Microsoft,182268,Software Engineer,60,API Development (Back-End),4 yrs,2 yrs,25200.0,4000.0,3600.0,32786.0,"Hyderabad, TS, India"
11,Microsoft,182268,Software Engineer,Principal SDE,Full Stack,13 yrs,11 yrs,73400.0,38300.0,11700.0,123364.0,"Bengaluru, KA, India"


In [327]:
# Drop rows with wrong total_compensation
raw_df.drop(rows_with_wrong_total_compensation.index, inplace=True)
raw_df.shape

(1744, 12)

### Save the preprocessed data

In [328]:
raw_df.to_csv('./data/cleaned_data.csv', index=False)