In [2]:
import os
import numpy as np
import pandas as pd
import warnings 

warnings.filterwarnings('ignore')

In [5]:
# Load the training split, parsing the joining date as a datetime column on read.
train_df = pd.read_csv(
    'train.csv',
    parse_dates=['Date of Joining'],
)
train_df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.20
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52
...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,2008-12-30,Female,Service,No,1.0,3.0,,0.41
22746,fffe33003000350031003800,2008-01-19,Female,Product,Yes,3.0,6.0,6.7,0.59
22747,fffe390032003000,2008-11-05,Male,Service,Yes,3.0,7.0,,0.72
22748,fffe33003300320036003900,2008-01-10,Female,Service,No,2.0,5.0,5.9,0.52


In [6]:
# Load the test split, parsing the joining date as a datetime column on read.
test_df = pd.read_csv(
    'test.csv',
    parse_dates=['Date of Joining'],
)
test_df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score
0,fffe31003300390039003000,2008-12-10,Female,Service,No,2.0,5.0,7.7
1,fffe31003300310037003800,2008-08-14,Female,Product,Yes,1.0,2.0,5.2
2,fffe33003400380035003900,2008-11-13,Male,Product,Yes,1.0,3.0,5.9
3,fffe3100370039003200,2008-02-07,Female,Service,No,3.0,6.0,4.6
4,fffe32003600390036003700,2008-07-17,Female,Product,No,2.0,5.0,6.4
...,...,...,...,...,...,...,...,...
12245,fffe3900310034003700,2008-10-02,Female,Service,Yes,1.0,2.0,6.1
12246,fffe32003600330034003000,2008-03-31,Female,Product,Yes,2.0,4.0,5.9
12247,fffe31003800340039003000,2008-02-12,Male,Service,No,4.0,7.0,9.6
12248,fffe32003600380031003800,2008-02-06,Male,Service,No,3.0,6.0,6.7


In [7]:
# Stack train and test into one frame; `is_train` (1 = train row, 0 = test row)
# records where each row came from so the two parts can be separated again.
# The original row labels are kept, so index values repeat across the two parts.
df = pd.concat(
    [
        train_df.assign(is_train=1),
        test_df.assign(is_train=0),
    ]
)
df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,is_train
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2.0,3.0,3.8,0.16,1
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1.0,2.0,5.0,0.36,1
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2.0,,5.8,0.49,1
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1.0,1.0,2.6,0.20,1
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3.0,7.0,6.9,0.52,1
...,...,...,...,...,...,...,...,...,...,...
12245,fffe3900310034003700,2008-10-02,Female,Service,Yes,1.0,2.0,6.1,,0
12246,fffe32003600330034003000,2008-03-31,Female,Product,Yes,2.0,4.0,5.9,,0
12247,fffe31003800340039003000,2008-02-12,Male,Service,No,4.0,7.0,9.6,,0
12248,fffe32003600380031003800,2008-02-06,Male,Service,No,3.0,6.0,6.7,,0


In [8]:
# Missing-value count per column of the combined frame.
df.isnull().sum()

Employee ID                 0
Date of Joining             0
Gender                      0
Company Type                0
WFH Setup Available         0
Designation                 0
Resource Allocation      1381
Mental Fatigue Score     2117
Burn Rate               13374
is_train                    0
dtype: int64

In [10]:
# Look at the resource-allocation column (it contains NaNs at this point).
df.loc[:, "Resource Allocation"]

0        3.0
1        2.0
2        NaN
3        1.0
4        7.0
        ... 
12245    2.0
12246    4.0
12247    7.0
12248    6.0
12249    2.0
Name: Resource Allocation, Length: 35000, dtype: float64

In [11]:
# Fill missing resource-allocation values with the column mean.
# NOTE(review): the mean is computed over the combined train + test rows.
r_col = "Resource Allocation"
r_mean = df[r_col].mean()
df[r_col] = df[r_col].fillna(value=r_mean)

In [13]:
# Name the mental-fatigue column once and look at its values.
m_col = "Mental Fatigue Score"
df.loc[:, m_col]

0        3.8
1        5.0
2        5.8
3        2.6
4        6.9
        ... 
12245    6.1
12246    5.9
12247    9.6
12248    6.7
12249    2.0
Name: Mental Fatigue Score, Length: 35000, dtype: float64

In [14]:
# Fill missing mental-fatigue scores with the column mean.
# NOTE(review): the mean is computed over the combined train + test rows.
m_mean = df[m_col].mean()
df[m_col] = df[m_col].fillna(value=m_mean)

In [16]:
# Show the dtype of every column in the combined frame.
df.dtypes

Employee ID                     object
Date of Joining         datetime64[ns]
Gender                          object
Company Type                    object
WFH Setup Available             object
Designation                    float64
Resource Allocation            float64
Mental Fatigue Score           float64
Burn Rate                      float64
is_train                         int64
dtype: object

In [18]:
# Categorical features are the object-dtype columns other than the row identifier.
# The identifier is excluded by label rather than by position, so the selection
# does not depend on column order; Index.drop raises KeyError if the label is absent.
cat_cols = df.columns[df.dtypes == 'object'].drop("Employee ID")
cat_cols

Index(['Gender', 'Company Type', 'WFH Setup Available'], dtype='object')

In [19]:
# Cast every selected column to the pandas categorical dtype in a single call.
df = df.astype({name: 'category' for name in cat_cols})

In [20]:
# Print each categorical column's name next to its number of distinct values.
for name, n_levels in df[cat_cols].nunique().items():
    print(name, n_levels)

Gender 2
Company Type 2
WFH Setup Available 2


In [21]:
# Replace each categorical column with its integer category codes.
df = df.assign(**{name: df[name].cat.codes for name in cat_cols})

df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,is_train
0,fffe32003000360033003200,2008-09-30,0,1,0,2.0,3.000000,3.8,0.16,1
1,fffe3700360033003500,2008-11-30,1,1,1,1.0,2.000000,5.0,0.36,1
2,fffe31003300320037003900,2008-03-10,0,0,1,2.0,4.473185,5.8,0.49,1
3,fffe32003400380032003900,2008-11-03,1,1,1,1.0,1.000000,2.6,0.20,1
4,fffe31003900340031003600,2008-07-24,0,1,0,3.0,7.000000,6.9,0.52,1
...,...,...,...,...,...,...,...,...,...,...
12245,fffe3900310034003700,2008-10-02,0,1,1,1.0,2.000000,6.1,,0
12246,fffe32003600330034003000,2008-03-31,0,0,1,2.0,4.000000,5.9,,0
12247,fffe31003800340039003000,2008-02-12,1,1,0,4.0,7.000000,9.6,,0
12248,fffe32003600380031003800,2008-02-06,1,1,0,3.0,6.000000,6.7,,0


In [22]:
# Break the joining date into numeric year / month / day columns, then remove
# the original date column.
doj = "Date of Joining"
joined_on = pd.to_datetime(df[doj])  # convert once and reuse for all three parts

df = df.assign(
    **{
        "year of joining": joined_on.dt.year,
        "month of joining": joined_on.dt.month,
        "day of joining": joined_on.dt.day,
    }
).drop(columns=["Date of Joining"])

df

Unnamed: 0,Employee ID,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,is_train,year of joining,month of joining,day of joining
0,fffe32003000360033003200,0,1,0,2.0,3.000000,3.8,0.16,1,2008,9,30
1,fffe3700360033003500,1,1,1,1.0,2.000000,5.0,0.36,1,2008,11,30
2,fffe31003300320037003900,0,0,1,2.0,4.473185,5.8,0.49,1,2008,3,10
3,fffe32003400380032003900,1,1,1,1.0,1.000000,2.6,0.20,1,2008,11,3
4,fffe31003900340031003600,0,1,0,3.0,7.000000,6.9,0.52,1,2008,7,24
...,...,...,...,...,...,...,...,...,...,...,...,...
12245,fffe3900310034003700,0,1,1,1.0,2.000000,6.1,,0,2008,10,2
12246,fffe32003600330034003000,0,0,1,2.0,4.000000,5.9,,0,2008,3,31
12247,fffe31003800340039003000,1,1,0,4.0,7.000000,9.6,,0,2008,2,12
12248,fffe32003600380031003800,1,1,0,3.0,6.000000,6.7,,0,2008,2,6


In [23]:
# Missing-value count per column after the imputation and date-feature steps.
df.isnull().sum()

Employee ID                 0
Gender                      0
Company Type                0
WFH Setup Available         0
Designation                 0
Resource Allocation         0
Mental Fatigue Score        0
Burn Rate               13374
is_train                    0
year of joining             0
month of joining            0
day of joining              0
dtype: int64

In [24]:
# Separate the combined frame back into train and test rows using the is_train flag.
train_df = df.loc[df["is_train"].eq(1)]
test_df = df.loc[df["is_train"].eq(0)]


In [26]:
# Target column, and the model inputs: every column except the row identifier,
# the train/test flag and the target. All three are excluded by label rather than
# by position, so the result does not depend on column order; Index.drop raises
# KeyError if any of the labels is absent.
ycol = "Burn Rate"
in_cols = train_df.columns.drop(["Employee ID", "is_train", ycol])
in_cols

Index(['Gender', 'Company Type', 'WFH Setup Available', 'Designation',
       'Resource Allocation', 'Mental Fatigue Score', 'year of joining',
       'month of joining', 'day of joining'],
      dtype='object')

In [1]:
# r2_score
import numpy as np
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier

class ModelEvalPredReg:
    """Cross-validate and apply an averaged ensemble of regression models.

    Every model must expose ``fit(X, y)`` and ``predict(X)``. Per-model
    predictions are combined with a plain mean, or with a weighted mean when
    ``weights`` is supplied.

    Args:
        train_df: frame holding the input columns and the target column.
        test_df: frame holding the input columns to predict on.
        in_cols: labels of the columns used as model inputs.
        target: label of the target column in ``train_df``.
    """

    def __init__(self, train_df, test_df, in_cols, target):
        self.train_df = train_df
        self.test_df = test_df
        self.in_cols = in_cols
        self.target = target

    @staticmethod
    def _as_model_list(models):
        """Materialise ``models`` as a list and reject an empty ensemble."""
        models = list(models)
        if not models:
            raise ValueError("at least one model is required")
        return models

    @staticmethod
    def _blend(preds, weights=None):
        """Combine per-model prediction arrays: plain mean, or weighted mean if weights are given."""
        if weights is None:
            return np.mean(preds, axis=0)
        return np.average(preds, axis=0, weights=weights)

    def eval_model(self, models, weights=None, show=True, shuf=False,
                   n_splits=5, random_state=None):
        """Score the blended ensemble with K-fold cross-validation on ``train_df``.

        In every fold each model is refitted on the training part and predicts
        the held-out part; the blended prediction is scored with RMSE.

        Args:
            models: iterable of estimators with ``fit`` / ``predict``.
            weights: optional per-model weights for the blend (same order as
                ``models``); ``None`` means a plain mean.
            show: when True, print each fold's RMSE and the average.
            shuf: forwarded to ``KFold(shuffle=...)``.
            n_splits: number of folds.
            random_state: seed for the shuffled split; ignored when ``shuf``
                is False.

        Returns:
            Mean RMSE over the folds.

        Raises:
            ValueError: if ``models`` is empty.
        """
        models = self._as_model_list(models)
        # KFold rejects a random_state when shuffle is False, so only forward it when shuffling.
        kf = KFold(n_splits=n_splits, shuffle=shuf,
                   random_state=random_state if shuf else None)
        features = self.train_df[self.in_cols]
        labels = self.train_df[self.target]

        scores = []
        for fit_idx, val_idx in kf.split(self.train_df):
            fold_preds = []
            for model in models:
                model.fit(features.iloc[fit_idx], labels.iloc[fit_idx])
                fold_preds.append(model.predict(features.iloc[val_idx]))
            rmse = sqrt(mean_squared_error(labels.iloc[val_idx],
                                           self._blend(fold_preds, weights)))
            scores.append(rmse)
            if show:
                print(rmse)

        mean_score = np.mean(scores)
        if show:
            print(f"Average score in {n_splits}-fold CV:", mean_score)
        return mean_score

    def predict(self, models, weights=None):
        """Fit every model on all of ``train_df`` and blend their predictions for ``test_df``.

        Args:
            models: iterable of estimators with ``fit`` / ``predict``.
            weights: optional per-model weights (same order as ``models``), so
                a weighted blend scored with ``eval_model`` can be reproduced
                here; ``None`` means a plain mean.

        Returns:
            Array of blended predictions, one per row of ``test_df``.

        Raises:
            ValueError: if ``models`` is empty.
        """
        models = self._as_model_list(models)
        train_features = self.train_df[self.in_cols]
        train_labels = self.train_df[self.target]
        test_features = self.test_df[self.in_cols]

        preds = []
        for model in models:
            model.fit(train_features, train_labels)
            preds.append(model.predict(test_features))
        return self._blend(preds, weights)