In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Employee_ID       10000 non-null  int64 
 1   Name              10000 non-null  object
 2   Age               10000 non-null  int64 
 3   Gender            10000 non-null  object
 4   Department        10000 non-null  object
 5   Job_Title         10000 non-null  object
 6   Experience_Years  10000 non-null  int64 
 7   Education_Level   10000 non-null  object
 8   Location          10000 non-null  object
 9   Salary            10000 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [4]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [5]:
class Preprocessing:
    def __init__(self, df):
        self.df = df.copy()
        self.encoder = LabelEncoder()
        self.scaler = MinMaxScaler()

    def fillMissingValues(self):
        for col in self.df.columns:
            if self.df[col].isnull().any():
                if self.df[col].dtype == "object":
                    self.df[col].fillna(self.df[col].mode()[0], inplace=True)
                else:
                    self.df[col].fillna(self.df[col].mean(), inplace=True)
        return self
    
    def encode(self):
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                if self.df[col].nunique() <= 5:
                    dummies = pd.get_dummies(self.df[col], prefix=col, dtype=int)
                    self.df = pd.concat([self.df.drop(columns=[col]), dummies], axis=1)
                else:
                    self.df[col] = self.encoder.fit_transform(self.df[col])
        return self
    
    def scale(self):
        num_cols = self.df.select_dtypes(include=['float64', 'int64', 'int32']).columns.drop('Salary')
        self.df[num_cols] = self.scaler.fit_transform(self.df[num_cols])
        return self
    
    def dataset(self):
        return self.df
    
preprocessing = Preprocessing(df)
df = preprocessing.fillMissingValues().encode().scale().dataset()

In [6]:
df.head(15)

Unnamed: 0,Employee_ID,Name,Age,Department,Experience_Years,Salary,Gender_Female,Gender_Male,Job_Title_Analyst,Job_Title_Engineer,...,Job_Title_Intern,Job_Title_Manager,Education_Level_Bachelor,Education_Level_Master,Education_Level_PhD,Location_Austin,Location_Chicago,Location_New_York,Location_San_Francisco,Location_Seattle
0,0.0,0.690382,0.076923,0.0,0.027027,90000,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0001,0.477754,0.897436,1.0,0.891892,195000,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0002,0.12253,0.0,0.0,0.027027,35000,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0003,0.693423,0.230769,0.2,0.243243,75000,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0004,0.761427,0.102564,0.4,0.054054,70000,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0005,0.698186,0.358974,0.2,0.216216,125000,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,0.0006,0.628965,0.051282,0.6,0.054054,60000,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.0007,0.347421,0.666667,0.6,0.648649,145000,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.0008,0.972028,0.564103,0.6,0.432432,135000,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,0.0009,0.956623,0.051282,0.6,0.0,70000,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Train/Test split

In [8]:
from sklearn.model_selection import train_test_split

x = df.drop('Salary', axis=1)
y = df['Salary']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.metrics import r2_score, mean_absolute_error

# LinearRegression

In [10]:
lr = LinearRegression()

In [11]:
lr.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:
y_pred = lr.predict(x_test)

In [13]:
lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

In [14]:
lr_score

0.9917496299983761

In [15]:
lr_mae

3339.4308232477033

# Lasso

In [16]:
lasso = Lasso()

In [17]:
lasso.fit(x_train, y_train)

  model = cd_fast.enet_coordinate_descent(


0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [18]:
y_pred = lasso.predict(x_test)

In [19]:
lasso_score = r2_score(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)

In [20]:
lasso_score

0.9917088214207405

In [21]:
lasso_mae

3350.8230626719605

# Ridge

In [22]:
ridge = Ridge()

In [23]:
ridge.fit(x_train, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [24]:
y_pred = ridge.predict(x_test)

In [25]:
ridge_score = r2_score(y_test, y_pred)
ridge_mae = mean_absolute_error(y_test, y_pred)

In [26]:
ridge_score

0.9917013959651528

In [27]:
ridge_mae

3354.1639852722988

# Decision Tree

In [28]:
dt = DecisionTreeRegressor(random_state=42)

In [29]:
dt.fit(x_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [30]:
y_pred = dt.predict(x_test)

In [31]:
dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [32]:
dt_score

0.9819942665794915

In [33]:
dt_mae

4695.0

# Random Forest

In [34]:
rf = RandomForestRegressor(random_state=42)

In [35]:
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
y_pred = rf.predict(x_test)

In [37]:
rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [38]:
rf_score

0.9904111878185733

In [39]:
rf_mae

3590.7

# CVM

In [40]:
cvm = SVR(kernel='linear', C=1.0)

In [41]:
cvm.fit(x_train, y_train)

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [42]:
y_pred = cvm.predict(x_test)

In [43]:
cvm_score = r2_score(y_test, y_pred)
cvm_mae = mean_absolute_error(y_test, y_pred)

In [44]:
cvm_score

0.14223093196892167

In [45]:
cvm_mae

35601.30159249471

# KNN

In [46]:
knn = KNeighborsRegressor(n_neighbors=5)

In [47]:
knn.fit(x_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [48]:
y_pred = knn.predict(x_test)

In [49]:
knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [50]:
knn_score

0.9807765665841361

In [51]:
knn_mae

5063.5

# Tabulate

In [52]:
from tabulate import tabulate

In [53]:
result = [
    ['Linear Regression', lr_score, lr_mae],
    ['Lasso', lasso_score, lasso_mae],
    ['Ridge', ridge_score, ridge_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['CVM', cvm_score, cvm_mae],
    ['KNN', knn_score, knn_mae],
]

headers = ['Algorithm', 'r2_score', 'mean_absolute_error']

best_model = max(result, key=lambda x: x[1])

green = "\033[92m"
reset = "\033[0m"

for row in result:
    if row == best_model:
        row[0] = green + row[0] + reset

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.4f')

In [54]:
print(table)

+-------------------+------------+-----------------------+
| Algorithm         |   r2_score |   mean_absolute_error |
| [92mLinear Regression[0m |     0.9917 |             3339.4308 |
+-------------------+------------+-----------------------+
| Lasso             |     0.9917 |             3350.8231 |
+-------------------+------------+-----------------------+
| Ridge             |     0.9917 |             3354.1640 |
+-------------------+------------+-----------------------+
| Decision Tree     |     0.9820 |             4695.0000 |
+-------------------+------------+-----------------------+
| Random Forest     |     0.9904 |             3590.7000 |
+-------------------+------------+-----------------------+
| CVM               |     0.1422 |            35601.3016 |
+-------------------+------------+-----------------------+
| KNN               |     0.9808 |             5063.5000 |
+-------------------+------------+-----------------------+
