In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42)

In [3]:
train = pd.read_csv('data/train.csv')

In [4]:
train.head()

Unnamed: 0,ID,quarter,department,day,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,TRAIN_0000,Quarter1,sweing,Thursday,0.8,26.16,1108,0.27315,98,0,0,0,59,0.94073
1,TRAIN_0001,Quarter1,finishing,Thursday,0.75,3.94,1190,0.03704,0,0,0,0,8,0.8865
2,TRAIN_0002,Quarter1,sweing,Thursday,0.8,11.41,968,0.1412,50,0,0,0,30,0.80057
3,TRAIN_0003,Quarter1,sweing,Thursday,0.8,11.41,968,0.1412,50,0,0,0,30,0.80057
4,TRAIN_0004,Quarter1,sweing,Thursday,0.8,25.9,1170,0.07407,50,0,0,0,56,0.80038


In [6]:
# def NMAE(true, pred):
#     mae = np.mean(np.abs(true-pred))
#     score = mae / np.mean(np.abs(true))
#     return score


In [5]:
X = train.drop(columns=['ID', 'actual_productivity'])
y = train['actual_productivity']

In [5]:
test = pd.read_csv('data/test.csv')
test = test.drop(columns=['ID'])

In [6]:
from pycaret.regression import *
# from sklearn.impute import SimpleImputer

# xp_reg = setup(data=train, target='actual_productivity', session_id=123,
#                normalize=True, transformation=True, transform_target=True,
#                combine_rare_levels=True, rare_level_threshold=0.05,
#                remove_multicollinearity=True, multicollinearity_threshold=0.95,
#                bin_numeric_features = ['C
reg = setup(data=train, target='actual_productivity', session_id=123, train_size=0.8,
            normalize=True, transformation=True, fold=5, fold_shuffle=True, ignore_features=['ID'],
           numeric_features=['targeted_productivity', 'smv', 'wip', 'over_time', 'incentive',
                             'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers'])

Unnamed: 0,Description,Value
0,session_id,123
1,Target,actual_productivity
2,Original Data,"(1197, 14)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(957, 20)"


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

In [7]:
print(train.isna().sum())

ID                       0
quarter                  0
department               0
day                      0
targeted_productivity    0
smv                      0
wip                      0
over_time                0
incentive                0
idle_time                0
idle_men                 0
no_of_style_change       0
no_of_workers            0
actual_productivity      0
dtype: int64


## pre-processing

In [8]:
train_wip_mean = np.mean(X['wip'])
X = X.replace({'wip':np.nan}, train_wip_mean)
test = test.replace({'wip':np.nan}, train_wip_mean)

qual_col = ['quarter', 'department', 'day']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(X[i])
    X[i] = le.transform(X[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
            
    test[i] = le.transform(test[i])
    
print('Done.')

Done.


## Regression Model fit

In [16]:
# def NMAE(y, pred):
#     mae = mean_absolute_error(y, pred)
#     score = mae / np.mean(np.abs(y))
#     return score

In [9]:
cat = CatBoostRegressor().fit(X, y)
print('Done.')

Learning rate set to 0.042123
0:	learn: 0.1719301	total: 157ms	remaining: 2m 37s
1:	learn: 0.1694979	total: 164ms	remaining: 1m 21s
2:	learn: 0.1670158	total: 170ms	remaining: 56.4s
3:	learn: 0.1649026	total: 174ms	remaining: 43.4s
4:	learn: 0.1629229	total: 177ms	remaining: 35.3s
5:	learn: 0.1611102	total: 180ms	remaining: 29.9s
6:	learn: 0.1592600	total: 183ms	remaining: 26s
7:	learn: 0.1571218	total: 187ms	remaining: 23.2s
8:	learn: 0.1554740	total: 191ms	remaining: 21s
9:	learn: 0.1537710	total: 194ms	remaining: 19.2s
10:	learn: 0.1523797	total: 197ms	remaining: 17.7s
11:	learn: 0.1509320	total: 201ms	remaining: 16.6s
12:	learn: 0.1498347	total: 204ms	remaining: 15.5s
13:	learn: 0.1486758	total: 205ms	remaining: 14.5s
14:	learn: 0.1477943	total: 207ms	remaining: 13.6s
15:	learn: 0.1468121	total: 208ms	remaining: 12.8s
16:	learn: 0.1457206	total: 210ms	remaining: 12.1s
17:	learn: 0.1444843	total: 211ms	remaining: 11.5s
18:	learn: 0.1434892	total: 213ms	remaining: 11s
19:	learn: 0.14

In [11]:
preds = cat.predict(test)
print('Done.')

Done.


In [17]:
# error = NMAE(y, preds)
# error

ValueError: Found input variables with inconsistent numbers of samples: [1197, 818]

## Submit

In [12]:
submit = pd.read_csv('data/sample_submission.csv')

submit

Unnamed: 0,ID,actual_productivity
0,TEST_0000,0.0
1,TEST_0001,0.0
2,TEST_0002,0.0
3,TEST_0003,0.0
4,TEST_0004,0.0
...,...,...
813,TEST_0813,0.0
814,TEST_0814,0.0
815,TEST_0815,0.0
816,TEST_0816,0.0


In [13]:
submit['actual_productivity'] = preds

In [14]:
submit.to_csv('result/catboost.csv', index=False)