# HR dataset Machine Learning Modeling

This part is for modeling machine learning algorithms in dataset.<br>
Dataset is splited in 2 parts from data analysis part.<br>
- one part is with encoding categorical features in pandas.
- second part is without categorical features and we want to solve it with sklearn library.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

In [2]:
# load data sets
encoded_train = pd.read_csv('dataset/train_hr_data_cleaned_v1.csv')
encoded_test = pd.read_csv('dataset/test_hr_data_cleaned_v1.csv')

nonen_train = pd.read_csv('dataset/train_hr_data_cleaned_v2.csv')
nonen_test = pd.read_csv('dataset/test_hr_data_cleaned_v2.csv')

In [3]:
encoded_train = encoded_train.drop('enrollee_id', axis=1)
nonen_train = nonen_train.drop('enrollee_id', axis=1)

In [4]:
display(encoded_train.head(1))
display(nonen_train.head(1))

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,77,0.776,0,1,0,0,0,2,1,2,1,47,0.0


Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0


# Modeling V1 Dataset

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [6]:
x_v1 = encoded_train.drop('target', axis=1)
y_v1 = encoded_train['target']

In [7]:
scale = StandardScaler()
x_v1_scale = scale.fit_transform(x_v1)

In [8]:
xtrain_v1, xtest_v1, ytrain_v1, ytest_v1 = train_test_split(x_v1_scale, y_v1, test_size=0.2, random_state=42)

In [9]:
reg_model = LogisticRegression()
tree_model = DecisionTreeClassifier()
forest_model = RandomForestClassifier()

In [10]:
# function for predict and caclute mse of models
def modeling(model, xtrain, ytrain, xtest, ytest):
    model.fit(xtrain, ytrain)
    train_pred = model.predict(xtrain)
    test_pred = model.predict(xtest)
    train_error = np.sqrt(mean_squared_error(ytrain, train_pred))
    test_error = np.sqrt(mean_squared_error(ytest, test_pred))
    return train_error, test_error

In [11]:
# tre = train error , tee = test error
reg_tre, reg_tee = modeling(reg_model, xtrain_v1, ytrain_v1, xtest_v1, ytest_v1)
tree_tre, tree_tee = modeling(tree_model, xtrain_v1, ytrain_v1, xtest_v1, ytest_v1)
forest_tre, forest_tee = modeling(forest_model, xtrain_v1, ytrain_v1, xtest_v1, ytest_v1)

# Modeling V2 Dataset

In [12]:
x_v2 = nonen_train.drop('target', axis=1)
y_v2 = nonen_train['target']

num_data = x_v2.select_dtypes(exclude='object')
cat_data = x_v2.select_dtypes(include='object')

num_data_list = list(num_data)
cat_data_list = list(cat_data)

In [13]:
transformer = ColumnTransformer([
                                ('numbers', StandardScaler(), num_data_list),
                                ('categories', OneHotEncoder(), cat_data_list)
])

In [14]:
x_v2_scale = transformer.fit_transform(x_v2)

In [15]:
xtrain_v2, xtest_v2, ytrain_v2, ytest_v2 = train_test_split(x_v2_scale, y_v2, test_size=0.2, random_state=42)

In [16]:
reg_model_2 = LogisticRegression()
tree_model_2 = DecisionTreeClassifier()
forest_model_2 = RandomForestClassifier()

In [17]:
reg_tre_2, reg_tee_2 = modeling(reg_model_2, xtrain_v2, ytrain_v2, xtest_v2, ytest_v2)
tree_tre_2, tree_tee_2 = modeling(tree_model_2, xtrain_v2, ytrain_v2, xtest_v2, ytest_v2)
forest_tre_2, forest_tee_2 = modeling(forest_model_2, xtrain_v2, ytrain_v2, xtest_v2, ytest_v2)

In [18]:
compare_df = pd.DataFrame({
                            'Models':['regression', 'decision tree', 'random forest',
                                     'regression_2', 'decision tree_2', 'random forest_2'],
                            'train error':[reg_tre, tree_tre, forest_tre,
                                          reg_tre_2, tree_tre_2, forest_tre_2],
                            'test error':[reg_tee, tree_tee, forest_tee,
                                         reg_tee_2, tree_tee_2, forest_tee_2],
})

In [19]:
#cm = sns.set_palette('Set2')
cm = sns.light_palette('green', as_cmap=True)
compare_df.style.background_gradient(cmap=cm).highlight_min(axis=0, color='yellow')

Unnamed: 0,Models,train error,test error
0,regression,0.474061,0.476026
1,decision tree,0.147426,0.545257
2,random forest,0.147426,0.492188
3,regression_2,0.457496,0.462127
4,decision tree_2,0.029485,0.548436
5,random forest_2,0.029485,0.479363


### not finished yet