In [700]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [701]:
df = pd.read_csv('dataset.csv')

In [702]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1470 non-null   int64  
 1   Attrition                 1470 non-null   object 
 2   BusinessTravel            1469 non-null   object 
 3   DailyRate                 1470 non-null   int64  
 4   Department                1468 non-null   object 
 5   DistanceFromHome          1470 non-null   int64  
 6   Education                 1469 non-null   float64
 7   EducationField            1470 non-null   object 
 8   EmployeeCount             1469 non-null   float64
 9   EmployeeNumber            1470 non-null   int64  
 10  EnvironmentSatisfaction   1469 non-null   float64
 11  Gender                    1469 non-null   object 
 12  HourlyRate                1468 non-null   float64
 13  JobInvolvement            1470 non-null   int64  
 14  JobLevel

In [703]:
def fillMissingValues(df):
    for col in df.columns:
        if df[col].isnull().any():
            if df[col].dtype == 'object':
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna(df[col].mean(), inplace=True)
    return df

In [704]:
def Encoder(df):
    encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].nunique() <= 3:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
                df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
            else:
                df[col] = encoder.fit_transform(df[col])
    return df

In [705]:
def Scaler(df):
    scaler = MinMaxScaler()
    num_col = df.select_dtypes(include=['float64', 'int64']).columns.drop('MonthlyIncome')
    df[num_col] = scaler.fit_transform(df[num_col])
    return df

In [706]:
df = fillMissingValues(df)
df = Encoder(df)
df = Scaler(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [707]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1470 non-null   float64
 1   DailyRate                          1470 non-null   float64
 2   DistanceFromHome                   1470 non-null   float64
 3   Education                          1470 non-null   float64
 4   EducationField                     1470 non-null   float64
 5   EmployeeCount                      1470 non-null   float64
 6   EmployeeNumber                     1470 non-null   float64
 7   EnvironmentSatisfaction            1470 non-null   float64
 8   HourlyRate                         1470 non-null   float64
 9   JobInvolvement                     1470 non-null   float64
 10  JobLevel                           1470 non-null   float64
 11  JobRole                            1470 non-null   float

In [708]:
df.head(20)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,Department_Research_&_Development,Department_Sales,Gender_Female,Gender_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0.547619,0.71582,0.0,0.25,0.2,0.0,0.0,0.333333,0.914286,0.666667,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.738095,0.1267,0.25,0.0,0.2,0.0,0.000484,0.666667,0.442857,0.333333,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.452381,0.909807,0.035714,0.25,0.8,0.0,0.001451,1.0,0.885714,0.333333,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.357143,0.923407,0.071429,0.75,0.2,0.0,0.001935,1.0,0.371429,0.666667,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.214286,0.350036,0.035714,0.0,0.6,0.0,0.002903,0.0,0.142857,0.666667,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
5,0.333333,0.646385,0.035714,0.25,0.2,0.0,0.003387,1.0,0.7,0.666667,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
6,0.97619,0.874732,0.071429,0.5,0.6,0.0,0.004354,0.666667,0.728571,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7,0.285714,0.899069,0.821429,0.0,0.2,0.0,0.004838,1.0,0.528571,0.666667,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
8,0.47619,0.081603,0.785714,0.5,0.2,0.0,0.005322,1.0,0.2,0.333333,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
9,0.428571,0.856836,0.928571,0.5,0.6,0.0,0.005806,0.666667,0.914286,0.666667,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


# Train/Test split

In [709]:
from sklearn.model_selection import train_test_split

x = df.drop('MonthlyIncome', axis=1)
y = df['MonthlyIncome']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [710]:
from sklearn.metrics import r2_score, mean_absolute_error

# LinearRegression

In [711]:
lr = LinearRegression()

In [712]:
lr.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [713]:
y_pred = lr.predict(x_test)

In [714]:
lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

In [715]:
lr_score

0.89565173057562

In [716]:
lr_mae

1165.508960570721

# Decision Tree

In [717]:
dt = DecisionTreeRegressor(random_state=42)

In [718]:
dt.fit(x_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [719]:
y_pred = dt.predict(x_test)

In [720]:
dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [721]:
dt_score

0.8842922482630299

In [722]:
dt_mae

1122.171827831337

# Random Forest

In [723]:
rf = RandomForestRegressor(random_state=42)

In [724]:
rf.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [725]:
y_pred = rf.predict(x_test)

In [726]:
rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [727]:
rf_score

0.9294338393465874

In [728]:
rf_mae

919.8511486025903

# CVM

In [729]:
cvm = SVR(kernel='linear', C=50.0)

In [730]:
cvm.fit(x_train, y_train)

0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,50.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [731]:
y_pred = cvm.predict(x_test)

In [732]:
cvm_score = r2_score(y_test, y_pred)
cvm_mae = mean_absolute_error(y_test, y_pred)

In [733]:
cvm_score

0.7162108665854499

In [734]:
cvm_mae

1648.864571550321

# KNN

In [735]:
knn = KNeighborsRegressor(n_neighbors=5)

In [736]:
knn.fit(x_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [737]:
y_pred = knn.predict(x_test)

In [738]:
knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [739]:
knn_score

0.42826901942690065

In [740]:
knn_mae

2526.1649305120823

# Tabulate

In [741]:
from tabulate import tabulate

In [742]:
result = [
    ['Linear Regression', lr_score, lr_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['CVM', cvm_score, cvm_mae],
    ['KNN', knn_score, knn_mae],
]

headers = ['Algorithm', 'r2_score', 'mean_absolute_error']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.2f')

In [743]:
print(table)

+-------------------+------------+-----------------------+
| Algorithm         |   r2_score |   mean_absolute_error |
| Linear Regression |       0.90 |               1165.51 |
+-------------------+------------+-----------------------+
| Decision Tree     |       0.88 |               1122.17 |
+-------------------+------------+-----------------------+
| Random Forest     |       0.93 |                919.85 |
+-------------------+------------+-----------------------+
| CVM               |       0.72 |               1648.86 |
+-------------------+------------+-----------------------+
| KNN               |       0.43 |               2526.16 |
+-------------------+------------+-----------------------+
