In [83]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [84]:
df = pd.read_csv('dataset.csv')

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1470 non-null   int64  
 1   Attrition                 1470 non-null   object 
 2   BusinessTravel            1469 non-null   object 
 3   DailyRate                 1470 non-null   int64  
 4   Department                1468 non-null   object 
 5   DistanceFromHome          1470 non-null   int64  
 6   Education                 1469 non-null   float64
 7   EducationField            1470 non-null   object 
 8   EmployeeCount             1469 non-null   float64
 9   EmployeeNumber            1470 non-null   int64  
 10  EnvironmentSatisfaction   1469 non-null   float64
 11  Gender                    1469 non-null   object 
 12  HourlyRate                1468 non-null   float64
 13  JobInvolvement            1470 non-null   int64  
 14  JobLevel

In [86]:
class DataPreprocessor():
    def __init__(self, df):
        self.df = df.copy()
        self.encoder = LabelEncoder()
        self.scaler = MinMaxScaler()

    def fillMissingValues(self):
        for col in self.df.columns:
            if self.df[col].isnull().any():
                if self.df[col].dtype == 'object':
                    self.df[col].fillna(self.df[col].mode()[0], inplace=True)
                else:
                    self.df[col].fillna(self.df[col].mean(), inplace=True)
        return self
    
    def encode(self):
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                if self.df[col].nunique() <= 5:
                    dummies = pd.get_dummies(self.df[col], prefix=col, dtype=int)
                    self.df = pd.concat([self.df.drop(columns=[col]), dummies], axis=1)
                else:
                    self.df[col] = self.encoder.fit_transform(self.df[col])
        return self
    
    def scale(self):
        num_col = self.df.select_dtypes(include=['float64', 'int64']).columns.drop('MonthlyIncome')
        self.df[num_col] = self.scaler.fit_transform(self.df[num_col])
        return self

In [87]:
data_preprocessor = DataPreprocessor(df)
data_preprocessor.fillMissingValues().encode().scale()
df = data_preprocessor.df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)


In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                1470 non-null   float64
 1   DailyRate                          1470 non-null   float64
 2   DistanceFromHome                   1470 non-null   float64
 3   Education                          1470 non-null   float64
 4   EducationField                     1470 non-null   float64
 5   EmployeeCount                      1470 non-null   float64
 6   EmployeeNumber                     1470 non-null   float64
 7   EnvironmentSatisfaction            1470 non-null   float64
 8   HourlyRate                         1470 non-null   float64
 9   JobInvolvement                     1470 non-null   float64
 10  JobLevel                           1470 non-null   float64
 11  JobRole                            1470 non-null   float

In [89]:
df.head(10)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,Department_Research_&_Development,Department_Sales,Gender_Female,Gender_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0.547619,0.71582,0.0,0.25,0.2,0.0,0.0,0.333333,0.914286,0.666667,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.738095,0.1267,0.25,0.0,0.2,0.0,0.000484,0.666667,0.442857,0.333333,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.452381,0.909807,0.035714,0.25,0.8,0.0,0.001451,1.0,0.885714,0.333333,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.357143,0.923407,0.071429,0.75,0.2,0.0,0.001935,1.0,0.371429,0.666667,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.214286,0.350036,0.035714,0.0,0.6,0.0,0.002903,0.0,0.142857,0.666667,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
5,0.333333,0.646385,0.035714,0.25,0.2,0.0,0.003387,1.0,0.7,0.666667,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
6,0.97619,0.874732,0.071429,0.5,0.6,0.0,0.004354,0.666667,0.728571,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7,0.285714,0.899069,0.821429,0.0,0.2,0.0,0.004838,1.0,0.528571,0.666667,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
8,0.47619,0.081603,0.785714,0.5,0.2,0.0,0.005322,1.0,0.2,0.333333,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
9,0.428571,0.856836,0.928571,0.5,0.6,0.0,0.005806,0.666667,0.914286,0.666667,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


# Import algorithms

In [90]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Import metrics

In [91]:
from sklearn.metrics import r2_score, mean_absolute_error

# Train/Test split

In [92]:
from sklearn.model_selection import train_test_split

x = df.drop('MonthlyIncome', axis=1)
y = df['MonthlyIncome']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Linear Regression

In [93]:
lr = LinearRegression()

lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

In [94]:
lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

In [95]:
lr_score

0.89565173057562

In [96]:
lr_mae

1165.508960570721

# Decision Tree

In [97]:
dt = DecisionTreeRegressor(random_state=42)

dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

In [98]:
dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [99]:
dt_score

0.8842922482630299

In [100]:
dt_mae

1122.171827831337

# Random Forest

In [101]:
rf = RandomForestRegressor(random_state=42)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

In [102]:
rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [103]:
rf_score

0.9294338393465874

In [104]:
rf_mae

919.8511486025903

# SVM

In [105]:
svc = SVR(kernel='linear', C=0.5)

svc.fit(x_train, y_train)

y_pred = svc.predict(x_test)

In [106]:
svc_score = r2_score(y_test, y_pred)
svc_mae = mean_absolute_error(y_test, y_pred)

In [107]:
svc_score

-0.2040860579495889

In [108]:
svc_mae

3336.6535369453945

# KNN

In [109]:
knn = KNeighborsRegressor(n_neighbors=3)

knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

In [110]:
knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [111]:
knn_score

0.30915807785708327

In [112]:
knn_mae

2692.069121581829

# Tabulate

In [113]:
from tabulate import tabulate

In [114]:
result = [
    ['Linear Regression', lr_score, lr_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['SVM', svc_score, svc_mae],
    ['KNN', knn_score, knn_mae],
]

headers = ['Algorithm', 'r2_score', 'mean absolute error']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.2f')

In [115]:
print(table)

+-------------------+------------+-----------------------+
| Algorithm         |   r2_score |   mean absolute error |
| Linear Regression |       0.90 |               1165.51 |
+-------------------+------------+-----------------------+
| Decision Tree     |       0.88 |               1122.17 |
+-------------------+------------+-----------------------+
| Random Forest     |       0.93 |                919.85 |
+-------------------+------------+-----------------------+
| SVM               |      -0.20 |               3336.65 |
+-------------------+------------+-----------------------+
| KNN               |       0.31 |               2692.07 |
+-------------------+------------+-----------------------+
