In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42)

In [3]:
train = pd.read_csv('data/train.csv')

In [4]:
train.head()

Unnamed: 0,ID,quarter,department,day,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,TRAIN_0000,Quarter1,sweing,Thursday,0.8,26.16,1108,0.27315,98,0,0,0,59,0.94073
1,TRAIN_0001,Quarter1,finishing,Thursday,0.75,3.94,1190,0.03704,0,0,0,0,8,0.8865
2,TRAIN_0002,Quarter1,sweing,Thursday,0.8,11.41,968,0.1412,50,0,0,0,30,0.80057
3,TRAIN_0003,Quarter1,sweing,Thursday,0.8,11.41,968,0.1412,50,0,0,0,30,0.80057
4,TRAIN_0004,Quarter1,sweing,Thursday,0.8,25.9,1170,0.07407,50,0,0,0,56,0.80038


In [5]:
print(train.isna().sum())

ID                       0
quarter                  0
department               0
day                      0
targeted_productivity    0
smv                      0
wip                      0
over_time                0
incentive                0
idle_time                0
idle_men                 0
no_of_style_change       0
no_of_workers            0
actual_productivity      0
dtype: int64


In [6]:
# def NMAE(true, pred):
#     mae = np.mean(np.abs(true-pred))
#     score = mae / np.mean(np.abs(true))
#     return score


In [6]:
test = pd.read_csv('data/test.csv')
test = test.drop(columns=['ID'])

In [7]:
X = train.drop(columns=['ID', 'actual_productivity'])
y = train['actual_productivity']

## pre-processing

In [8]:
train_wip_mean = np.mean(X['wip'])
X = X.replace({'wip':np.nan}, train_wip_mean)
test = test.replace({'wip':np.nan}, train_wip_mean)

qual_col = ['quarter', 'department', 'day']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(X[i])
    X[i] = le.transform(X[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
            
    test[i] = le.transform(test[i])
    
print('Done.')

Done.


In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()

vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif

Unnamed: 0,VIF,features
0,2.368792,quarter
1,20.198441,department
2,3.151749,day
3,7.723758,targeted_productivity
4,17.644401,smv
5,1.765679,wip
6,6.975149,over_time
7,1.084036,incentive
8,1.481304,idle_time
9,1.544001,idle_men


In [10]:
X.drop(['department', 'smv', 'no_of_workers'], axis=1, inplace=True)
test.drop(['department', 'smv', 'no_of_workers'], axis=1, inplace=True)

## Regression Model fit

In [11]:
def NMAE(y, pred):
    mae = mean_absolute_error(y, pred)
    score = mae / np.mean(np.abs(y))
    return score

In [12]:
cat = CatBoostRegressor().fit(X, y)
print('Done.')

Learning rate set to 0.042123
0:	learn: 0.1727190	total: 172ms	remaining: 2m 51s
1:	learn: 0.1712204	total: 176ms	remaining: 1m 27s
2:	learn: 0.1692236	total: 180ms	remaining: 59.7s
3:	learn: 0.1679367	total: 181ms	remaining: 45.1s
4:	learn: 0.1665221	total: 183ms	remaining: 36.5s
5:	learn: 0.1651040	total: 185ms	remaining: 30.7s
6:	learn: 0.1638831	total: 187ms	remaining: 26.5s
7:	learn: 0.1624551	total: 189ms	remaining: 23.4s
8:	learn: 0.1610933	total: 191ms	remaining: 21s
9:	learn: 0.1598603	total: 193ms	remaining: 19.1s
10:	learn: 0.1585132	total: 196ms	remaining: 17.6s
11:	learn: 0.1575849	total: 198ms	remaining: 16.3s
12:	learn: 0.1564698	total: 200ms	remaining: 15.2s
13:	learn: 0.1556492	total: 202ms	remaining: 14.2s
14:	learn: 0.1546960	total: 204ms	remaining: 13.4s
15:	learn: 0.1538179	total: 205ms	remaining: 12.6s
16:	learn: 0.1530258	total: 207ms	remaining: 12s
17:	learn: 0.1522151	total: 208ms	remaining: 11.4s
18:	learn: 0.1514072	total: 210ms	remaining: 10.8s
19:	learn: 0.

In [13]:
preds = cat.predict(test)
print('Done.')

Done.


In [14]:
error = NMAE(y, preds)
error

ValueError: Found input variables with inconsistent numbers of samples: [1197, 818]

## Submit

In [15]:
submit = pd.read_csv('data/sample_submission.csv')

submit

Unnamed: 0,ID,actual_productivity
0,TEST_0000,0.0
1,TEST_0001,0.0
2,TEST_0002,0.0
3,TEST_0003,0.0
4,TEST_0004,0.0
...,...,...
813,TEST_0813,0.0
814,TEST_0814,0.0
815,TEST_0815,0.0
816,TEST_0816,0.0


In [16]:
submit['actual_productivity'] = preds
submit

Unnamed: 0,ID,actual_productivity
0,TEST_0000,0.353045
1,TEST_0001,0.559936
2,TEST_0002,0.395836
3,TEST_0003,0.353045
4,TEST_0004,0.711077
...,...,...
813,TEST_0813,0.609931
814,TEST_0814,0.828910
815,TEST_0815,0.424428
816,TEST_0816,0.326178


In [17]:
submit.to_csv('result/vif.csv', index=False)