In [114]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [115]:
df = pd.read_csv('dataset.csv')

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Employee_ID       10000 non-null  int64 
 1   Name              10000 non-null  object
 2   Age               10000 non-null  int64 
 3   Gender            10000 non-null  object
 4   Department        10000 non-null  object
 5   Job_Title         10000 non-null  object
 6   Experience_Years  10000 non-null  int64 
 7   Education_Level   10000 non-null  object
 8   Location          10000 non-null  object
 9   Salary            10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [117]:
# Age minus years of experience, i.e. the age at which the employee started
# working. NOTE(review): despite the column name this is an age, not a
# calendar year — consider renaming. It is also an exact linear combination
# of Age and Experience_Years, so it adds no new information for the linear
# models fitted later.
df['Work_start_year'] = df['Age'] - df['Experience_Years']

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Employee_ID       10000 non-null  int64 
 1   Name              10000 non-null  object
 2   Age               10000 non-null  int64 
 3   Gender            10000 non-null  object
 4   Department        10000 non-null  object
 5   Job_Title         10000 non-null  object
 6   Experience_Years  10000 non-null  int64 
 7   Education_Level   10000 non-null  object
 8   Location          10000 non-null  object
 9   Salary            10000 non-null  int64 
 10  Work_start_year   10000 non-null  int64 
dtypes: int64(5), object(6)
memory usage: 859.5+ KB


In [119]:
class Preprocessing:
    """Chainable preprocessing pipeline for a tabular regression frame.

    The pipeline works on a private copy of the frame it is given. Every
    step mutates that copy and returns ``self`` so steps can be chained:
    ``Preprocessing(df).fillMissingValues().encode().scale().dataset()``.

    Parameters:
        df: Input ``pandas.DataFrame``; it is copied, never modified.
        target: Name of the column that ``scale`` must leave untouched.
            Defaults to ``'Salary'``. Pass ``None`` to scale every
            selected numeric column.
    """

    # Text columns with at most this many distinct values are one-hot
    # encoded; columns with more are label encoded.
    MAX_ONE_HOT_CATEGORIES = 5

    def __init__(self, df, target='Salary'):
        self.df = df.copy()
        self.target = target
        # Most recently fitted label encoder (kept for backward compatibility).
        self.encoder = LabelEncoder()
        # Column name -> fitted LabelEncoder, so every mapping stays available.
        self.encoders = {}
        self.scaler = MinMaxScaler()

    @staticmethod
    def _is_text(series):
        """Return True when ``series`` has an object or string dtype."""
        return (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )

    def fillMissingValues(self):
        """Impute nulls: mode for text columns, mean for all other columns.

        Columns without nulls are left alone. A text column that is entirely
        null has no mode and is skipped rather than raising.

        Returns:
            self, to allow chaining.
        """
        for col in self.df.columns:
            column = self.df[col]
            if not column.isnull().any():
                continue
            if self._is_text(column):
                modes = column.mode()
                if modes.empty:
                    # Nothing to impute from.
                    continue
                fill_value = modes.iloc[0]
            else:
                fill_value = column.mean()
            # Assign the result back instead of calling fillna(inplace=True)
            # on the selected column: the in-place form acts on a temporary
            # object and does not update the frame under Copy-on-Write.
            self.df[col] = column.fillna(fill_value)
        return self

    def encode(self):
        """Encode text columns as numbers.

        Columns with at most ``MAX_ONE_HOT_CATEGORIES`` distinct values are
        replaced by integer one-hot columns named ``<col>_<value>`` appended
        at the end of the frame. Other text columns are label encoded in
        place; each fitted encoder is stored in ``self.encoders[col]`` and
        ``self.encoder`` refers to the last one fitted.

        Returns:
            self, to allow chaining.
        """
        # Snapshot the column list: the frame is rebuilt inside the loop.
        for col in list(self.df.columns):
            column = self.df[col]
            if not self._is_text(column):
                continue
            if column.nunique() <= self.MAX_ONE_HOT_CATEGORIES:
                dummies = pd.get_dummies(column, prefix=col, dtype=int)
                self.df = pd.concat([self.df.drop(columns=[col]), dummies], axis=1)
            else:
                # A fresh encoder per column; re-fitting one shared encoder
                # would discard the mapping of every earlier column.
                encoder = LabelEncoder()
                self.df[col] = encoder.fit_transform(column)
                self.encoders[col] = encoder
                self.encoder = encoder
        return self

    def scale(self):
        """Min-max scale the float64/int64/int32 columns except the target.

        The scaler is fitted on every row currently in the frame, so when
        this runs before a train/test split the held-out rows contribute to
        the fitted minimum and maximum.

        Returns:
            self, to allow chaining.
        """
        num_cols = self.df.select_dtypes(include=['float64', 'int64', 'int32']).columns
        if self.target is not None:
            # errors='ignore': the target may be absent or of another dtype.
            num_cols = num_cols.drop(self.target, errors='ignore')
        if len(num_cols) > 0:
            self.df[num_cols] = self.scaler.fit_transform(self.df[num_cols])
        return self

    def dataset(self):
        """Return the processed working frame (not a copy)."""
        return self.df
    
# Run every preprocessing step on the loaded frame, then rebind `df` to the
# processed result. Each step returns the pipeline object itself, so calling
# them one after another is equivalent to a single chained expression.
preprocessing = Preprocessing(df)
preprocessing.fillMissingValues()
preprocessing.encode()
preprocessing.scale()
df = preprocessing.dataset()

In [120]:
df.head(10)

Unnamed: 0,Employee_ID,Name,Age,Department,Experience_Years,Salary,Work_start_year,Gender_Female,Gender_Male,Job_Title_Analyst,...,Job_Title_Intern,Job_Title_Manager,Education_Level_Bachelor,Education_Level_Master,Education_Level_PhD,Location_Austin,Location_Chicago,Location_New_York,Location_San_Francisco,Location_Seattle
0,0.0,0.690382,0.076923,0.0,0.027027,90000,0.428571,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0001,0.477754,0.897436,1.0,0.891892,195000,0.428571,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0002,0.12253,0.0,0.0,0.027027,35000,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0003,0.693423,0.230769,0.2,0.243243,75000,0.142857,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0004,0.761427,0.102564,0.4,0.054054,70000,0.428571,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0005,0.698186,0.358974,0.2,0.216216,125000,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,0.0006,0.628965,0.051282,0.6,0.054054,60000,0.142857,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.0007,0.347421,0.666667,0.6,0.648649,145000,0.428571,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.0008,0.972028,0.564103,0.6,0.432432,135000,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,0.0009,0.956623,0.051282,0.6,0.0,70000,0.428571,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [121]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Train/Test split

In [122]:
from sklearn.model_selection import train_test_split

# The target is the Salary column; every other column is used as a feature.
# NOTE(review): the features therefore include Employee_ID and Name, which
# look like row identifiers rather than predictive signals — confirm they
# should stay in the feature matrix.
y = df['Salary']
x = df.drop(columns=['Salary'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [123]:
from sklearn.metrics import r2_score, mean_absolute_error

# Linear Regression

In [124]:
lr = LinearRegression()

In [125]:
lr.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [126]:
y_pred = lr.predict(x_test)

In [127]:
lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

In [128]:
lr_score

0.9917496299983761

In [129]:
lr_mae

3339.4308232477065

# Lasso

In [130]:
lasso = Lasso()

In [131]:
lasso.fit(x_train, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [132]:
y_pred = lasso.predict(x_test)

In [133]:
lasso_score = r2_score(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)

In [134]:
lasso_score

0.9917442094766824

In [135]:
lasso_mae

3341.386938384162

# Ridge

In [136]:
ridge = Ridge()

In [137]:
ridge.fit(x_train, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [138]:
y_pred = ridge.predict(x_test)

In [139]:
ridge_score = r2_score(y_test, y_pred)
ridge_mae = mean_absolute_error(y_test, y_pred)

In [140]:
ridge_score

0.9917493802341038

In [141]:
ridge_mae

3340.5390839093657

# Decision Tree

In [142]:
dt = DecisionTreeRegressor(random_state=42)

In [143]:
dt.fit(x_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [144]:
y_pred = dt.predict(x_test)

In [145]:
dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [146]:
dt_score

0.9820772969707586

In [147]:
dt_mae

4645.0

# Random Forest

In [148]:
rf = RandomForestRegressor(random_state=42)

In [149]:
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [150]:
y_pred = rf.predict(x_test)

In [151]:
rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [152]:
rf_score

0.9903858344886

In [153]:
rf_mae

3598.725

# SVM

In [154]:
# Support vector regressor with a linear kernel.
# NOTE(review): the target (Salary) is left unscaled by the preprocessing
# step while C=1.0 and the default epsilon are small relative to salary
# magnitudes; this is presumably why the R² recorded for this model is far
# below the others — consider scaling the target or tuning C/epsilon.
cvm = SVR(kernel='linear', C=1.0)

In [155]:
cvm.fit(x_train, y_train)

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [156]:
y_pred = cvm.predict(x_test)

In [157]:
cvm_score = r2_score(y_test, y_pred)
cvm_mae = mean_absolute_error(y_test, y_pred)

In [158]:
cvm_score

0.1496860589910256

In [159]:
cvm_mae

35410.9847289772

# KNN

In [160]:
knn = KNeighborsRegressor(n_neighbors=5)

In [161]:
knn.fit(x_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [162]:
y_pred = knn.predict(x_test)

In [163]:
knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [164]:
knn_score

0.9808584108269567

In [165]:
knn_mae

5044.0

# Tabulate

In [166]:
from tabulate import tabulate

In [177]:
# One row per model: [display name, R² on the test split, MAE on the test split].
result = [
    ['Linear Regression', lr_score, lr_mae],
    ['Lasso', lasso_score, lasso_mae],
    ['Ridge', ridge_score, ridge_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['CVM', cvm_score, cvm_mae],
    ['KNN', knn_score, knn_mae],
]

headers = ['Algorithm', 'r2_score', 'mean_absolute_error']

# The best model is the row with the highest R². The lambda parameter is
# named `row` so it does not shadow the feature matrix `x`.
best_model = max(result, key=lambda row: row[1])

# ANSI escape codes: bright green on, attributes reset.
green = "\033[92m"
reset = "\033[0m"

# Build a separate list for display so the colour codes never end up inside
# `result` or `best_model`; those keep the plain model names for later use.
display_rows = [
    [green + row[0] + reset, *row[1:]] if row is best_model else list(row)
    for row in result
]

table = tabulate(display_rows, headers=headers, tablefmt='grid', floatfmt='.4f')

In [178]:
print(table)

+-------------------+------------+-----------------------+
| Algorithm         |   r2_score |   mean_absolute_error |
| [92mLinear Regression[0m |     0.9917 |             3339.4308 |
+-------------------+------------+-----------------------+
| Lasso             |     0.9917 |             3341.3869 |
+-------------------+------------+-----------------------+
| Ridge             |     0.9917 |             3340.5391 |
+-------------------+------------+-----------------------+
| Decision Tree     |     0.9821 |             4645.0000 |
+-------------------+------------+-----------------------+
| Random Forest     |     0.9904 |             3598.7250 |
+-------------------+------------+-----------------------+
| CVM               |     0.1497 |            35410.9847 |
+-------------------+------------+-----------------------+
| KNN               |     0.9809 |             5044.0000 |
+-------------------+------------+-----------------------+
