In [31]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from sklearn.model_selection import train_test_split

In [32]:
N_CUSTOMERS_REDUCED = 5000
USE_REDUCED = False
cat_vars =  ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68'] 

In [33]:
if USE_REDUCED:
    df = pd.read_csv('train_data_reduced.csv')
    df_lab = pd.read_csv('train_labels_reduced.csv')
    df = df.drop('Unnamed: 0', axis=1)
    df_lab = df_lab.drop('Unnamed: 0', axis=1)
else:
    df = pd.read_csv('/dev/shm/train_data.csv')
    df_lab = pd.read_csv('/dev/shm/train_labels.csv')
    
# TODO let's load this l8r
# df_test = pd.read_csv('/dev/shm/test_data.csv')

### Dataset reduction (fast prototyping!)

In [34]:
# This makes no sense if we're already operating
# on a reduced dataframe.
if not USE_REDUCED:
    unique_customers = df.customer_ID.unique()
    customer_selection = np.random.choice(unique_customers, size=N_CUSTOMERS_REDUCED, replace=False)

    df = df[df.customer_ID.isin(customer_selection)]
    df_lab = df_lab[df_lab.customer_ID.isin(customer_selection)]

    df.to_csv('train_data_reduced.csv')
    df_lab.to_csv('train_labels_reduced.csv')

## Data Exploration

In [35]:
print(f'Rows per customer: {df.shape[0] / df.customer_ID.unique().shape[0]}',)

Rows per customer: 12.0614


In [32]:
df_with_labels = df.set_index('customer_ID').join(df_lab.set_index('customer_ID'))

In [41]:
def preprocess(dataframe):
    cat_dtypes = dataframe.dtypes[cat_vars]
    dataframe[cat_vars] = dataframe[cat_vars].where(~dataframe[cat_vars].isna(), 0)

    dataframe = dataframe.astype({k: np.int32 for k in cat_dtypes[cat_dtypes == np.float64].keys()})
    #df_no_nan = df.replace(np.nan, "")
    #df_wide = dataframe.groupby('customer_ID').sum().unstack()

    df_reset = dataframe.reset_index()
    df_reset['group_index'] = dataframe.groupby('customer_ID').cumcount()
    
    df_reindex = df_reset.set_index(['customer_ID', 'group_index'])
    # Drop Duplicates (todo: why?)
    df_reindex = df_reindex[~df_reindex.index.duplicated(keep='last')]
    
    df_unstacked = df_reindex.unstack()

    df_unstacked = df_unstacked.drop('index', level=0, axis=1)
    df_unstacked = df_unstacked.drop('S_2', level=0, axis=1)

    df_unstacked.columns = [' '.join(map(str, col)).strip() for col in df_unstacked.columns.values]

    # Convert categorical features in wide dataframe to integers
    cat_vars_wide = list(filter(lambda x: any([y in x for y in cat_vars]), df_unstacked.columns))
    cat_dtypes = df_unstacked.dtypes[cat_vars_wide]

    df_unstacked = df_unstacked.astype({k: 'Int32' for k in cat_dtypes[cat_dtypes==np.float64].keys()})
    df_unstacked = df_unstacked.replace(np.nan, -1)
    cat_dtypes = df_unstacked.dtypes[cat_vars_wide]
    df_unstacked = df_unstacked.astype({k: np.int32 for k in cat_dtypes[cat_dtypes=='Int32'].keys()})
    
    # Additionally convert cateorical features that are strings to integers, so we have only numbers (needed for tabnet only)
    string_cols = df_unstacked.select_dtypes(include='object').columns
    df_unstacked = df_unstacked.astype({k: 'category' for k in string_cols})

    cols = df_unstacked.select_dtypes(include='category').columns
    n_cols = len(cols)
    for col in cols:
        df_unstacked[col] = df_unstacked[col].cat.codes

    preprocessed_np = df_unstacked.to_numpy()
    categorical_indices = [df_unstacked.columns.get_loc(x) for x in cat_vars_wide]
    
    return preprocessed_np, categorical_indices

### Tabnet

In [42]:
preprocessed_np, cat_idxs = preprocess(df)



In [43]:
X_train, X_test, y_train, y_test = train_test_split(preprocessed_np, df_lab['target'].to_numpy(), test_size=0.2)

clf = TabNetClassifier(  cat_idxs=cat_idxs)
clf.fit(
  X_train, y_train,
  eval_set=[(X_test, y_test)],
)

Device used : cuda
epoch 0  | loss: 0.74838 | val_0_auc: 0.41148 |  0:00:00s
epoch 1  | loss: 0.69521 | val_0_auc: 0.29339 |  0:00:00s
epoch 2  | loss: 0.69192 | val_0_auc: 0.56803 |  0:00:00s
epoch 3  | loss: 0.64759 | val_0_auc: 0.58634 |  0:00:00s
epoch 4  | loss: 0.6227  | val_0_auc: 0.73753 |  0:00:01s
epoch 5  | loss: 0.60789 | val_0_auc: 0.72162 |  0:00:01s
epoch 6  | loss: 0.6129  | val_0_auc: 0.50368 |  0:00:01s
epoch 7  | loss: 0.59161 | val_0_auc: 0.6154  |  0:00:01s
epoch 8  | loss: 0.57301 | val_0_auc: 0.71577 |  0:00:02s
epoch 9  | loss: 0.57227 | val_0_auc: 0.80863 |  0:00:02s
epoch 10 | loss: 0.56114 | val_0_auc: 0.82944 |  0:00:02s
epoch 11 | loss: 0.51929 | val_0_auc: 0.81396 |  0:00:02s
epoch 12 | loss: 0.50586 | val_0_auc: 0.56155 |  0:00:03s
epoch 13 | loss: 0.49417 | val_0_auc: 0.80093 |  0:00:03s
epoch 14 | loss: 0.48739 | val_0_auc: 0.82248 |  0:00:03s
epoch 15 | loss: 0.46188 | val_0_auc: 0.81117 |  0:00:03s
epoch 16 | loss: 0.46358 | val_0_auc: 0.80633 |  0:00

In [44]:
clf.predict_proba(X_test)[:,1]

array([0.223386  , 0.08319002, 0.0385108 , 0.09176685, 0.04919345,
       0.19923244, 0.23161966, 0.04861271, 0.08664663, 0.23657268,
       0.22641315, 0.2699132 , 0.19233683, 0.3650706 , 0.11135985,
       0.25183254, 0.14150916, 0.06177987, 0.21560884, 0.07888074,
       0.2732608 , 0.24046154, 0.0535337 , 0.05116056, 0.05334001,
       0.53874266, 0.04206464, 0.16930881, 0.05068742, 0.08305321,
       0.21142544, 0.503217  , 0.05161233, 0.25095698, 0.05718221,
       0.26443997, 0.04033526, 0.48509678, 0.05179032, 0.09558702,
       0.21836355, 0.04797493, 0.04967065, 0.15230083, 0.05678205,
       0.21082124, 0.23460396, 0.04779342, 0.11015258, 0.05022397,
       0.08631528, 0.27816197, 0.46679914, 0.04916732, 0.03982076,
       0.17741868, 0.04339772, 0.18255341, 0.20011835, 0.05034601,
       0.0502039 , 0.04956937, 0.04907627, 0.2552596 , 0.04829131,
       0.253768  , 0.08640939, 0.11726848, 0.08803324, 0.08342315,
       0.04964197, 0.05333201, 0.09571667, 0.07543675, 0.12723

# Prediction

In [46]:
df_test = pd.read_csv('/dev/shm/test_data.csv')

In [47]:
np_preprocessed, _ = preprocess(df_test)

In [49]:
preds = model.predict_proba(df_test_preprocessed)[:,1]

NameError: name 'model' is not defined

In [48]:
preds

NameError: name 'preds' is not defined

In [23]:
df_preds = df_test_preprocessed.assign(p=preds)

In [24]:
a=df_preds['p'].reset_index()

In [25]:
a.groupby('customer_ID').mean().rename(columns={"p":"prediction"}).to_csv('preds_wideframe_tabnet.csv')

In [3]:
import pyarrow as pa