# Import Libraries

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load Data

In [2]:
# Folder that holds train.csv and test.csv. The default is the location the
# notebook was originally written against; set the BURNOUT_DATA_DIR environment
# variable to run it against a copy stored elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get(
    "BURNOUT_DATA_DIR",
    "/Users/chohan/Desktop/ML Problem/Datasets/d4c937c412c011eb/dataset",
))

# Train_Data carries the 'Burn Rate' target; Test_Data has the same columns
# without it and is only scored, never fitted on.
Train_Data = pd.read_csv(DATA_DIR / "train.csv")
Test_Data = pd.read_csv(DATA_DIR / "test.csv")


# Data Analysis

In [3]:
# Preview the first rows of the training frame to see the columns and the raw value formats.
Train_Data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52


In [4]:
# Remove every row that has at least one missing value (the preview above shows
# a blank 'Resource Allocation' entry, for example). The cleaned frame replaces
# the raw one under the same name, so X and Y built from it contain no NaNs.
Train_Data = Train_Data.dropna(axis=0, how="any")

In [5]:
# Show each column's dtype; 'object' columns hold strings and must be encoded before modelling.
Train_Data.dtypes

Employee ID              object
Date of Joining          object
Gender                   object
Company Type             object
WFH Setup Available      object
Designation             float64
Resource Allocation     float64
Mental Fatigue Score    float64
Burn Rate               float64
dtype: object

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
Train_Data.describe()

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
count,18590.0,18590.0,18590.0,18590.0
mean,2.179398,4.486552,5.732173,0.452444
std,1.133148,2.044848,1.920547,0.197848
min,0.0,1.0,0.0,0.0
25%,1.0,3.0,4.6,0.32
50%,2.0,4.0,5.9,0.45
75%,3.0,6.0,7.1,0.59
max,5.0,10.0,10.0,1.0


In [7]:
# List the column names in order; the positional iloc slices further down depend on this order.
Train_Data.columns

Index(['Employee ID', 'Date of Joining', 'Gender', 'Company Type',
       'WFH Setup Available', 'Designation', 'Resource Allocation',
       'Mental Fatigue Score', 'Burn Rate'],
      dtype='object')

# Data PreProcessing

In [8]:
# Feature matrix: the first eight columns, i.e. every column except the target.
# NOTE(review): positions 0 and 1 are 'Employee ID' and 'Date of Joining'; both
# are label-encoded below and then fed to the models as ordinary numeric
# features. Selecting the real predictors by name would be safer than slicing
# by position.
X = Train_Data.iloc[:, :8].values

# Target vector: the last column ('Burn Rate').
Y = Train_Data.iloc[:, -1].values

In [9]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Replace the five string columns of X with integer codes, in place.
# LabelEncoder numbers the sorted unique values 0..k-1, which gives
# Female/Male -> 0/1, Product/Service -> 0/1 and No/Yes -> 0/1.
# Column positions in X:
#   0: Employee ID   1: Date of Joining   2: Gender
#   3: Company Type  4: WFH Setup Available
# NOTE(review): the code assigned to an Employee ID is only its rank among the
# IDs and presumably carries no signal; consider dropping that column instead
# of encoding it.
labelencoder = LabelEncoder()
for column in range(5):
    X[:, column] = labelencoder.fit_transform(X[:, column])



In [10]:
# Sanity check: features, target and the source frame must all have the same
# number of rows. Prints "<row count> <shape>" for each of them in turn.
for table in (X, Y, Train_Data):
    print(len(table), np.shape(table))

18590 (18590, 8)
18590 (18590,)
18590 (18590, 9)


# Split Data into Train and Test Sets

In [11]:
# Hold out 20% of the rows for evaluation. Rows are shuffled before splitting
# and the fixed seed makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=0
)

# Report the resulting shapes: features first, then targets.
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(14872, 8) (3718, 8)
(14872,) (3718,)


# Design A Model (Linear Regression)

In [12]:
# Baseline regressor: ordinary least squares with scikit-learn's default settings.
model = LinearRegression()


# Train The Model

In [13]:
# Fit the baseline on the training split only; fit() returns the estimator,
# which the notebook displays as the cell output.
model.fit(x_train, y_train)

LinearRegression()

# Predict The Value

In [14]:
# Predicted burn rate for every row of the held-out split.
y_pred = model.predict(x_test)

In [15]:
# Display the held-out predictions (NumPy abbreviates long arrays with "...").
# NOTE(review): the linear model can produce values below 0, which are not valid burn rates.
y_pred

array([ 0.59823647,  0.64214309,  0.49478647, ...,  0.23490316,
        0.32989528, -0.02460305])

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

# Despite the "Accuracy" labels, all three figures are R^2 (coefficient of
# determination) scaled to percent: a regressor's score() returns R^2.
# NOTE(review): the "Model_Accuracy" figure is computed on all of X, which
# contains the training rows as well as the held-out rows, so only
# "Testing_Accuracy" measures performance on unseen data.
for label, r2_value in (
    ("Training_Accuracy:", model.score(x_train, y_train)),
    ("Testing_Accuracy:", model.score(x_test, y_test)),
    ("Model_Accuracy:", r2_score(Y, model.predict(X))),
):
    print(label, r2_value * 100)

Training_Accuracy: 92.10881238865802
Testing_Accuracy: 91.88950001851262
Model_Accuracy: 92.06626265268272


# Test Model On New Data

In [17]:
# Preview the unlabelled frame: same columns as the training data, minus 'Burn Rate'.
Test_Data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score
0,fffe31003300390039003000,2008-12-10,Female,Service,No,2.0,5.0,7.7
1,fffe31003300310037003800,2008-08-14,Female,Product,Yes,1.0,2.0,5.2
2,fffe33003400380035003900,2008-11-13,Male,Product,Yes,1.0,3.0,5.9
3,fffe3100370039003200,2008-02-07,Female,Service,No,3.0,6.0,4.6
4,fffe32003600390036003700,2008-07-17,Female,Product,No,2.0,5.0,6.4


In [18]:
# Feature matrix for the unlabelled rows: the first eight columns, in the same
# positional order as the training matrix X.
# NOTE(review): unlike Train_Data, Test_Data is never passed through dropna();
# confirm it contains no missing values before relying on these predictions.
x = Test_Data.iloc[:, :8].values

# Echo the array as the cell output.
x

array([['fffe31003300390039003000', '2008-12-10', 'Female', ..., 2.0,
        5.0, 7.7],
       ['fffe31003300310037003800', '2008-08-14', 'Female', ..., 1.0,
        2.0, 5.2],
       ['fffe33003400380035003900', '2008-11-13', 'Male', ..., 1.0, 3.0,
        5.9],
       ...,
       ['fffe31003800340039003000', '2008-02-12', 'Male', ..., 4.0, 7.0,
        9.6],
       ['fffe32003600380031003800', '2008-02-06', 'Male', ..., 3.0, 6.0,
        6.7],
       ['fffe32003100390037003800', '2008-08-05', 'Female', ..., 2.0,
        2.0, 2.0]], dtype=object)

In [19]:
# Integer-encode the five string columns of the test matrix in place
# (0: Employee ID, 1: Date of Joining, 2: Gender, 3: Company Type,
# 4: WFH Setup Available).
# NOTE(review): the encoder is re-fitted on the test values rather than reusing
# the mapping learned from the training data. A code therefore only means the
# same thing in both sets when both contain exactly the same set of values,
# which presumably does not hold for column 0 (Employee ID).
labelencoder = LabelEncoder()
for column in range(5):
    x[:, column] = labelencoder.fit_transform(x[:, column])

In [20]:
# Score the unlabelled rows with the fitted linear model.
test_pred = model.predict(x)

In [21]:
# Print every predicted burn rate, one value per line.
# NOTE(review): this emits one line per test row and floods the notebook;
# a short slice such as test_pred[:10] would be enough for a visual check.
for predicted_rate in test_pred:
    print(predicted_rate)


0.6172928989141535
0.3386211043363437
0.42390066005486654
0.41184995663282914
0.5201004067770996
0.5275042642717584
0.32750903044352475
0.6810836871729677
0.601465977125855
0.46084344121985904
0.42593239198139893
0.3620997788657722
0.3939900379753689
0.5972345739173512
0.6133752580019585
0.5589932887700173
0.5936573073500779
0.774033493606326
0.2580151465481542
0.4222356886100234
0.37143093481858375
0.16691518044698012
0.46764475878192185
0.6319785467909451
0.5398364803790356
0.4040768909907791
0.4782978198855335
0.34335636346262743
0.34821975845574704
0.3423566213961736
0.3961534093860109
0.862242208971518
0.3702875638721737
-0.0678158958780519
0.1386713159340761
0.6403847020023419
0.3493169046957993
0.3899331450343796
0.41887899156910874
0.3227122851423101
0.6012644386655526
0.24463083962371818
0.8403701756652924
0.7089293088565258
0.312332636070018
0.23909368761146838
0.6350802402547098
0.30799004655097356
0.35815711077771545
0.5486676361110501
0.3383836435582608
0.3643083972422213


0.3002007223265724
0.6194045632994094
0.37097222540562064
0.6698607984458325
0.5152842356366145
0.34719716180589283
0.32046483882951177
0.7837117248951173
0.5067121787678652
-0.009495845270067055
0.5898950056075396
0.5924134889454264
0.5272740266309712
0.48925045237980574
0.40094151578814363
0.3493956378998761
0.4927840533697995
0.33117892477045946
0.4596683884490957
0.5880900714552448
0.5029315374102614
0.0897442265057058
0.8685937506786456
0.3509976538721393
0.5307922307325694
0.6374508678321538
0.13991378892284548
0.38102791905852956
0.34004365082496185
0.4549194698919576
0.3990737792910877
0.4736061833634015
0.41600334834400604
0.41401127117658415
0.4827822958313304
0.4750921503127559
0.5734275577049759
0.4536228772598427
0.7109442772554826
0.4713223429271543
0.537746184863028
0.878879384337572
0.5955442228303198
0.36792441156853095
0.45747688159910443
0.312158887746853
0.3923529487134087
0.7576503505290679
0.586418236489912
0.22502335043160465
0.18029404366737817
0.616460447651974

0.6063499466744076
0.6585674454985186
0.21523827551260083
0.668168252069552
0.5172070217164428
0.6555529762094332
0.4265238526154345
0.3598027127468242
0.7724265329034326
0.09326859183037825
0.4980202915497132
0.6073443614259679
0.7297404139353625
0.29318210605266
0.45316936159540716
0.5854107303040947
0.48931688957258773
0.5781941143885398
0.6621936168928233
0.38913830708730407
0.6365540050667244
0.00822783705024245
0.5588663035744983
0.4020400940753251
0.2865031462122754
0.4137949817823406
0.21451367731240106
0.38574999602894616
0.6609783306654211
0.7459515752313375
0.5999068926853393
0.40737708301854386
0.5588402053128292
0.33826115593137845
0.5531337682857997
0.42766125109668135
0.7633760311674074
0.6099440266437726
0.33276946543069236
0.3665997387067543
0.3567151146514998
0.06292650888173545
0.4520282712510338
0.7850304590312134
0.31967179162870774
0.6593125605070644
0.5749409279241258
0.28116495072802433
0.5958396528561212
0.1744170516370082
0.5819112436294123
0.37332127313057045

0.28204810155801013
0.3647912192115454
0.7708323837776523
0.45783992669236656
0.45185534179399695
0.4239598567908876
0.37353448037357
0.24721198919736204
0.601262252933515
0.5297399606889166
0.5663224855108586
0.2787080959146335
0.4999947771949645
0.37625029612102506
0.5240997912053373
0.39122238547476546
0.44014033499094624
0.5165041866409714
0.6695584442689888
0.06671152782124887
0.22677260704543395
0.12443055866226943
0.23383676101043221
0.3942768257864705
0.23779892831084576
0.4329654587144279
0.48715066775145727
0.5424323102373905
0.6668608516550303
0.4416865437676664
0.7184129515107216
0.2619885174505266
0.6592482805196986
0.48103969644567396
0.201344372655766
0.4980343821597309
0.44355239021610976
0.2781685483206099
0.2604569693442032
0.6395370944314311
0.49040659900418887
0.3443682228498386
0.42682541697233206
0.48702504384993967
0.5038019248952195
0.3672566662251308
0.8964784551136746
0.5421695811982679
0.46027720515186116
0.7294841312062414
0.380225850979779
0.527563189756196

0.5810642229247291
0.22049926778502843
0.839309770392904
0.3285051531807467
0.6065308827220546
0.6642612159480499
0.34328614593029266
0.45041408921184395
0.4474195720110457
0.48515678264721523
0.5667285595766827
0.47704998270057447
0.7058989595081182
0.5943070808229507
0.36465925665219506
0.5571695023682783
0.4381087791319916
0.6671916159835727
0.6072976502152481
0.6471711637903113
0.5217792108662302
0.39109840440092997
0.12316234749680563
0.12121466714508222
0.5992096113750458
0.6941873503067089
0.5258097599139533
0.8980149786749703
0.659352294288561
0.45869882220634356
0.4948389608141542
0.7542657716902212
0.6582891737600993
0.6877895886125227
0.46501721154199654
0.6856319551375851
0.34967180400788345
0.19871129592329423
0.6324078442456609
0.6916880875494118
0.5006466389074591
0.8703774046111057
0.07989584655393817
0.1689716547120802
0.10216422886988288
0.7203424686321556
0.6561257242454355
-0.0061313157445612165
0.6283064437900479
0.6415394234939313
0.39177823590929867
-0.0682760971

# Save Data Into DataFrame

In [22]:
# Pair each employee with the linear model's predicted burn rate.
# Burn Rate lies between 0 and 1 in the training data, but a linear model can
# extrapolate outside that interval (negative values appear in the raw
# predictions printed above), so clamp to the valid range before saving.
new_data = pd.DataFrame({
    "Employee ID": Test_Data["Employee ID"],
    "Burn Rate": np.clip(test_pred, 0.0, 1.0),
})

In [23]:
# Write only the two result columns. Without index=False pandas would also
# write the row index as an extra, unnamed first column.
new_data.to_csv("Result.csv", index=False)

# Model Designing (RandomForest Regressor)

In [24]:
from sklearn.ensemble import RandomForestRegressor

# The forest is grown from random bootstrap samples, so an unseeded model gives
# different scores and predictions on every run. Use the same seed as the
# train/test split to keep the notebook reproducible.
model1 = RandomForestRegressor(random_state=0)

# Fit on the training split only; fit() returns the estimator, which the
# notebook displays as the cell output.
model1.fit(x_train, y_train)

RandomForestRegressor()

In [25]:
# Random-forest predictions for the held-out split.
new_predict = model1.predict(x_test)

In [26]:
# Display the random-forest predictions for the held-out rows (abbreviated by NumPy).
new_predict

array([0.6162, 0.6608, 0.4832, ..., 0.2428, 0.3062, 0.0168])

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

# Same three figures as for the linear model: R^2 scaled to percent, labelled
# "Accuracy" in the output.
# NOTE(review): "Model_Accuracy" is computed on all of X, which includes the
# rows the forest was trained on; compare the two models on
# "Testing_Accuracy" only.
for label, r2_value in (
    ("Training_Accuracy:", model1.score(x_train, y_train)),
    ("Testing_Accuracy:", model1.score(x_test, y_test)),
    ("Model_Accuracy:", r2_score(Y, model1.predict(X))),
):
    print(label, r2_value * 100)

Training_Accuracy: 98.90923891519951
Testing_Accuracy: 91.99119033193335
Model_Accuracy: 97.56612923357862


In [28]:
# Score the unlabelled rows with the fitted random forest.
new_test_pred = model1.predict(x)

In [29]:
# Pair each employee with the random forest's predicted burn rate.
# This rebinds new_data, replacing the linear-model frame built earlier.
new_data = pd.DataFrame({
    "Employee ID": Test_Data["Employee ID"],
    "Burn Rate": new_test_pred,
})

# Write only the two result columns. Without index=False pandas would also
# write the row index as an extra, unnamed first column.
new_data.to_csv("new_Result.csv", index=False)