In [96]:
import numpy as np
import pandas as pd

In [97]:
# Load the raw training baskets; later cells read the 'userid', 'date' and 'itemids' columns.
df = pd.read_csv(filepath_or_buffer='data/train100k.csv')

In [98]:
def to_quarters(df):
    """Map each row's calendar month to a quarter label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``month`` column holding month numbers.

    Returns
    -------
    numpy.ndarray
        String array aligned with the rows of ``df``: ``'Q1'`` for months
        1-3, ``'Q2'`` for 4-6, ``'Q3'`` for 7-9, ``'Q4'`` for 10-12.
        Months outside 1-12 map to ``'0'``.
    """
    # Pull the column out once instead of re-reading it for every comparison.
    month = df['month'].values
    # One boolean mask per quarter; `first` is the quarter's opening month.
    condlist = [
        (first <= month) & (month <= first + 2)
        for first in (1, 4, 7, 10)
    ]
    choicelist = ['Q1', 'Q2', 'Q3', 'Q4']
    # The fallback must be a string: with np.select's implicit integer
    # default (0), recent NumPy releases cannot promote str and int to a
    # common dtype and raise TypeError. '0' is the value older NumPy
    # produced for unmatched rows, so results are unchanged.
    return np.select(condlist, choicelist, default='0')

In [99]:
# Derive calendar features from the parsed date, replace the raw item list
# with its length (basket_size), then downcast every column to a compact dtype.
# NOTE: this cell rebinds `df` and drops 'itemids', so it can only be executed
# once per load; re-run the read_csv cell before executing it again.
df = (
    df.assign(
        # `infer_datetime_format=True` was removed: it is deprecated since
        # pandas 2.0, where it has no effect and only emits a warning.
        date=lambda x: pd.to_datetime(x['date']),
        day=lambda x: x['date'].dt.day,
        month=lambda x: x['date'].dt.month,
        year=lambda x: x['date'].dt.year,
        # Relies on the 'month' column assigned just above.
        quarter=to_quarters,
        # Number of whitespace-separated tokens in 'itemids'.
        basket_size=lambda x: x['itemids'].str.split().str.len(),
    )
    .drop(columns='itemids')
    .astype({
        'userid': 'int32',
        'basket_size': 'int16',
        'day': 'int8',
        'month': 'int8',
        'year': 'int16',
        'quarter': 'category',
    })
)

In [100]:
# Show the transformed frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,userid,date,day,month,year,quarter,basket_size
0,7226385,2019-01-22,22,1,2019,Q1,4
1,7226385,2019-02-12,12,2,2019,Q1,3
2,7226385,2019-03-11,11,3,2019,Q1,4
3,7226385,2019-04-03,3,4,2019,Q2,5
4,7226385,2019-05-23,23,5,2019,Q2,4
...,...,...,...,...,...,...,...
1711872,1542695,2019-06-19,19,6,2019,Q2,6
1711873,1542695,2019-07-15,15,7,2019,Q3,7
1711874,1542695,2019-10-04,4,10,2019,Q4,4
1711875,1542695,2019-11-19,19,11,2019,Q4,5


In [101]:
# Total deep memory footprint first, then the per-column breakdown as the cell output.
print(df.memory_usage(deep=True).sum(), 'bytes')
df.memory_usage(deep=True)

30814842 bytes


Index               128
userid          6847508
date           13695016
day             1711877
month           1712273
year            1712001
quarter         1712285
basket_size     3423754
dtype: int64

In [102]:
from sklearn.model_selection import train_test_split

# Feature columns handed to the model, and the single target column.
dataset = ['userid', 'day', 'month', 'year', 'quarter']
label = ['basket_size']

X = df.loc[:, dataset]
# Flatten the one-column target frame into a 1-D array.
y = df[label].to_numpy().ravel()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Sanity check: feature and target partitions must have matching row counts.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(1198313, 4) (513564, 4)
(1198313,) (513564,)


In [None]:
from flaml import AutoML

# Search configuration passed to AutoML.fit as keyword arguments.
# The original cell kept a disabled 'max_iter': 2 entry here; add max_iter=2
# to re-enable it (presumably to cap the search for a quick trial run).
settings = dict(
    time_budget=1200,
    task='regression',
    estimator_list=['lgbm', 'catboost', 'rf'],
    metric='r2',
)

automl = AutoML()
automl.fit(X_train, y_train, **settings)

In [None]:
from flaml.ml import sklearn_metric_loss_score

# Predict on the held-out rows and show predictions next to the ground truth.
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)

# The 'r2' score is printed as one minus the returned loss value.
print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
# The remaining metrics are printed exactly as returned.
for metric_name in ('mse', 'mae', 'rmse'):
    print(metric_name, '=', sklearn_metric_loss_score(metric_name, y_pred, y_test))

Predicted labels [4.86432688 5.96904438 4.60511815 ... 4.43818137 5.8993146  5.16513062]
True labels [ 5  4  2 ...  3 11  8]
r2 = 0.012534598354599913
mse = 13.11121822946392
mae = 2.6355840397290184
rmse = 3.620941621935366
