In [124]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## 1. Define problem

- **Type of Problem:** Regression
- **Objective:** Predict the total compensation of an employee based on various job-related features.
- **Features:**
    1. `company`: The company the employee works for (categorical).
    2. `company_size`: The size of the company (numerical).
    3. `job_title`: The job title of the employee (categorical).
    4. `level`: The job level of the employee (categorical).
    5. `domain`: The domain of the company or job (categorical).
    6. `yoe_total`: Years of total work experience (numerical).
    7. `yoe_at_company`: Years of work experience at the current company (numerical).
    8. `base`: Base salary (numerical).
    9. `stock`: Stock-related compensation (numerical).
    10. `bonus`: Bonus amount (numerical).
    11. `total_compensation`: **The target variable** - total compensation (numerical).
    12. `location`: Where the position is based (categorical).

- **Example Questions:**
    - What is the expected total compensation for an employee with a certain job title, at a specific company, with a given level of experience?
    - How much does the total compensation vary based on job level or company size?

- **Potential Use Cases:**
    - Helping HR departments and employees understand the factors influencing compensation.
    - Guiding salary negotiations by providing estimates based on relevant features.

## Why do we treat this as a regression problem?

We frame this as a <font color='#F3E5AB'>regression problem</font> because the goal is to predict a continuous value: the total compensation of an employee. The reasons below explain why regression is an appropriate choice:
 1. **Continuous Target Variable:** Total compensation is a continuous variable, not falling into fixed categories. When predicting an exact amount, regression is commonly used.
 2. **Prediction of Specific Quantities:** In this context, we are interested in predicting a specific quantity, such as the exact income that an employee might have in a given scenario.
 3. **Relationship between Features and Target:** Features such as job level, experience, domain, and base salary can significantly influence total compensation. This relationship can be well captured by regression models.
 4. **Model Evaluation:** With regression models, we can use metrics like <font color='#F3E5AB'>Mean Squared Error (MSE)</font> or <font color='#F3E5AB'>R-squared</font> to evaluate prediction performance and measure the difference between predicted and actual values.
 5. **Convenient for Model Interpretation:** Regression models provide a convenient way to interpret the impact of each feature on the target variable. This can be valuable in understanding why the model makes specific predictions.

## 2. Prepare Data

In [125]:
# Read the cleaned dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./data/cleaned_data.csv')
df.head(n=5)

Unnamed: 0,company,company_size,job_title,level,domain,yoe_total,yoe_at_company,base,stock,bonus,total_compensation,location
0,Logitech,7250,Software Engineer,I4,Testing (SDET),10,5,190000,10000,0,200000,San Francisco Bay Area
1,Logitech,7250,Software Engineer,I2,ML / AI,4,3,126000,0,7000,133000,"Vancouver, WA"
2,Logitech,7250,Software Engineer,I3,Testing (SDET),11,11,120000,5000,12000,137000,"San Francisco, CA"
3,Logitech,7250,Software Engineer,I4,Production,8,8,100000,10000,0,110000,"Hsin-chu, TP, Taiwan"
4,Logitech,7250,Software Engineer,I4,Android,13,1,185000,15000,18500,218500,"San Francisco, CA"


In [126]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1384 entries, 0 to 1383
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   company             1384 non-null   object
 1   company_size        1384 non-null   int64 
 2   job_title           1384 non-null   object
 3   level               1384 non-null   object
 4   domain              1384 non-null   object
 5   yoe_total           1384 non-null   int64 
 6   yoe_at_company      1384 non-null   int64 
 7   base                1384 non-null   int64 
 8   stock               1384 non-null   int64 
 9   bonus               1384 non-null   int64 
 10  total_compensation  1384 non-null   int64 
 11  location            1384 non-null   object
dtypes: int64(7), object(5)
memory usage: 129.9+ KB


**Data cleaning**

 1. Handling missing values

In [127]:
# Count nulls per column, then keep only the labels of columns that have any.
missing_values = df.isna().sum()
columns_with_missing_values = missing_values.loc[missing_values.gt(0)].index
print("Columns with missing values:", columns_with_missing_values, sep="\n")

Columns with missing values:
Index([], dtype='object')


If any columns contain missing values, the cell below imputes them (mean for numeric columns, mode for categorical columns).

In [128]:
# Only impute when the previous check actually reported columns with nulls.
if len(columns_with_missing_values) > 0:
    # Numeric columns: fill gaps with each column's mean.
    numeric_columns = df.select_dtypes(include='number').columns
    numeric_fill = df[numeric_columns].mean()
    df[numeric_columns] = df[numeric_columns].fillna(numeric_fill)

    # Non-numeric columns: fill gaps with each column's most frequent value
    # (first row of `mode()` holds the top mode per column).
    categorical_columns = df.select_dtypes(exclude='number').columns
    categorical_fill = df[categorical_columns].mode().iloc[0]
    df[categorical_columns] = df[categorical_columns].fillna(categorical_fill)

2. Outlier Detection:

In [129]:
# Numerical columns that are screened for outliers in the following cells.
numerical_features = [
    'company_size',
    'yoe_total',
    'yoe_at_company',
    'base',
    'stock',
    'bonus',
    'total_compensation',
]

In [130]:
# Absolute z-score of every numerical value: |x - column mean| / column std.
z_scores = (
    df[numerical_features]
    .sub(df[numerical_features].mean())
    .div(df[numerical_features].std())
    .abs()
)

In [131]:
# A row counts as an outlier when at least one numerical feature lies more
# than `outlier_threshold` standard deviations away from its column mean.
outlier_threshold = 3
outliers = z_scores.gt(outlier_threshold).any(axis='columns')
print("Number of outliers:", outliers.sum())

Number of outliers: 106


In [132]:
# Remove every row flagged as an outlier.
#
# The previous version dropped the rows in place and then assigned column
# means to `df.loc[outliers, ...]`. Those rows were already gone, so the
# assignment selected nothing and only upcast the numerical columns to float.
# It also raised on a second run, because the mask was longer than the
# shortened index.
if outliers.any():
    # Align the mask with the current index so the cell can be re-run safely:
    # labels that were already removed are ignored, unknown labels count as
    # "not an outlier".
    outlier_mask = outliers.reindex(df.index, fill_value=False)
    df = df.loc[~outlier_mask].copy()

    # Keep the numerical columns as float64. This used to be an accidental
    # side effect of the dead assignment; it is now explicit so downstream
    # cells see the same dtypes as before.
    df[numerical_features] = df[numerical_features].astype('float64')

In [133]:
# Re-inspect row count and dtypes after the outlier-handling cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1278 entries, 0 to 1383
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   company             1278 non-null   object 
 1   company_size        1278 non-null   float64
 2   job_title           1278 non-null   object 
 3   level               1278 non-null   object 
 4   domain              1278 non-null   object 
 5   yoe_total           1278 non-null   float64
 6   yoe_at_company      1278 non-null   float64
 7   base                1278 non-null   float64
 8   stock               1278 non-null   float64
 9   bonus               1278 non-null   float64
 10  total_compensation  1278 non-null   float64
 11  location            1278 non-null   object 
dtypes: float64(7), object(5)
memory usage: 129.8+ KB


3. Consistency Check:

In [134]:
# Look for negative entries in columns that must never be below zero.
negative_values_columns = [
    'company_size',
    'yoe_total',
    'yoe_at_company',
    'base',
    'stock',
    'bonus',
    'total_compensation',
]
# Per-column flag: True when the column holds at least one negative value.
negative_values = (df[negative_values_columns] < 0).any()
print("Columns with negative values:", negative_values.loc[negative_values].index, sep="\n")

Columns with negative values:
Index([], dtype='object')


In [135]:
# Floor negative entries at zero, but only when at least one column was flagged.
if negative_values.any():
    clipped = df[negative_values_columns].clip(lower=0)
    df[negative_values_columns] = clipped

In [136]:
# TODO: further consistency checks could be added here, e.g. flag rows where base + stock + bonus is smaller than total_compensation.

## 3. Feature Engineering:

1. Categorical Encoding:

In [137]:
# Integer-encode the categorical columns with label encoding.
#
# Each column gets its own fitted encoder, stored in `label_encoders`, so every
# mapping can later be inverted or applied to new data. Previously a single
# encoder was refit for each column, so only the mapping of the last column
# (`domain`) survived.
columns_to_encode = ['company', 'job_title', 'level', 'domain']
label_encoders = {}
for column in columns_to_encode:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# Backward compatibility: `label_encoder` still ends up fitted on `domain`,
# exactly as it did when one encoder was reused for all four columns.
label_encoder = label_encoders['domain']

# Display the resulting DataFrame after label encoding
print(df.head())

   company  company_size  job_title  level  domain  yoe_total  yoe_at_company  \
0       11        7250.0         24     57     350       10.0             5.0   
1       11        7250.0         24     55     200        4.0             3.0   
2       11        7250.0         24     56     350       11.0            11.0   
3       11        7250.0         24     57     266        8.0             8.0   
4       11        7250.0         24     57      31       13.0             1.0   

       base    stock    bonus  total_compensation                location  
0  190000.0  10000.0      0.0            200000.0  San Francisco Bay Area  
1  126000.0      0.0   7000.0            133000.0           Vancouver, WA  
2  120000.0   5000.0  12000.0            137000.0       San Francisco, CA  
3  100000.0  10000.0      0.0            110000.0    Hsin-chu, TP, Taiwan  
4  185000.0  15000.0  18500.0            218500.0       San Francisco, CA  


In [138]:
# NOTE(review): it is still undecided whether `location` should be encoded at all.
# One-hot encode `location`: build the indicator columns first, then swap them
# in for the original text column.
locations = df['location']
location_encoded = pd.get_dummies(locations, prefix='location')
df = pd.concat([df.drop(columns=['location']), location_encoded], axis=1)
df.head()

Unnamed: 0,company,company_size,job_title,level,domain,yoe_total,yoe_at_company,base,stock,bonus,...,"location_Vancouver, BC, Canada","location_Vancouver, WA","location_Warsaw, MZ, Poland",location_Washington DC,"location_Washington, DC","location_Waterloo, ON, Canada","location_Westborough, MA","location_Wroclaw, DS, Poland","location_Zurich, ZH, Switzerland",location_hidden
0,11,7250.0,24,57,350,10.0,5.0,190000.0,10000.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,11,7250.0,24,55,200,4.0,3.0,126000.0,0.0,7000.0,...,False,True,False,False,False,False,False,False,False,False
2,11,7250.0,24,56,350,11.0,11.0,120000.0,5000.0,12000.0,...,False,False,False,False,False,False,False,False,False,False
3,11,7250.0,24,57,266,8.0,8.0,100000.0,10000.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,11,7250.0,24,57,31,13.0,1.0,185000.0,15000.0,18500.0,...,False,False,False,False,False,False,False,False,False,False


2. Creating New Features:

In [139]:
# Optional feature: bonus as a fraction of base salary, added only to give the
# model extra information during training.
#
# A zero `base` would make the plain division produce inf (or NaN for 0 / 0),
# and non-finite values break model fitting later on. Such rows get a ratio of
# 0.0 instead; rows with a non-zero base keep the plain bonus / base value.
bonus_ratio = df['bonus'] / df['base']
df['bonus_to_base_ratio'] = bonus_ratio.mask(df['base'] == 0, 0.0)
df.head()

Unnamed: 0,company,company_size,job_title,level,domain,yoe_total,yoe_at_company,base,stock,bonus,...,"location_Vancouver, WA","location_Warsaw, MZ, Poland",location_Washington DC,"location_Washington, DC","location_Waterloo, ON, Canada","location_Westborough, MA","location_Wroclaw, DS, Poland","location_Zurich, ZH, Switzerland",location_hidden,bonus_to_base_ratio
0,11,7250.0,24,57,350,10.0,5.0,190000.0,10000.0,0.0,...,False,False,False,False,False,False,False,False,False,0.0
1,11,7250.0,24,55,200,4.0,3.0,126000.0,0.0,7000.0,...,True,False,False,False,False,False,False,False,False,0.055556
2,11,7250.0,24,56,350,11.0,11.0,120000.0,5000.0,12000.0,...,False,False,False,False,False,False,False,False,False,0.1
3,11,7250.0,24,57,266,8.0,8.0,100000.0,10000.0,0.0,...,False,False,False,False,False,False,False,False,False,0.0
4,11,7250.0,24,57,31,13.0,1.0,185000.0,15000.0,18500.0,...,False,False,False,False,False,False,False,False,False,0.1
