In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import warnings
# setting ignore as a parameter and further adding category
warnings.filterwarnings(action='ignore', category=FutureWarning) 
warnings.filterwarnings(action='ignore', category=UserWarning) 

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from tqdm.auto import tqdm

In [8]:
import mlflow
mlflow.autolog()  

2023/05/24 18:27:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [9]:
!ls ../data/input

[34mmlruns[m[m                test.csv
sample_submission.csv train.csv


## Read raw data

In [10]:
!ls ..

LICENSE        [34mdata[m[m           poetry.lock
README.md      [34mnotebooks[m[m      pyproject.toml


In [11]:
train = pd.read_csv('../data/input/train.csv')
test = pd.read_csv('../data/input/test.csv')
sample_submission = pd.read_csv('../data/input/sample_submission.csv')

In [12]:
train.shape
train.head()

(600000, 25)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [13]:
train.dtypes
train.dtypes.value_counts()

id          int64
bin_0     float64
bin_1     float64
bin_2     float64
bin_3      object
bin_4      object
nom_0      object
nom_1      object
nom_2      object
nom_3      object
nom_4      object
nom_5      object
nom_6      object
nom_7      object
nom_8      object
nom_9      object
ord_0     float64
ord_1      object
ord_2      object
ord_3      object
ord_4      object
ord_5      object
day       float64
month     float64
target      int64
dtype: object

object     17
float64     6
int64       2
Name: count, dtype: int64

In [14]:
train.target.value_counts()
train.target.value_counts(normalize=True)

target
0    487677
1    112323
Name: count, dtype: int64

target
0    0.812795
1    0.187205
Name: proportion, dtype: float64

### Concatenate train and test data

In [15]:
test['target'] = -1

In [16]:
data = pd.concat([train, test])
data.shape
data.head()

(1000000, 25)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


## Select feature columns

- We are interested in generating the embedings of categoricla features -- all are categorical features in the data

In [17]:
features = [column for column in data.columns if column not in ['id', 'target']]
features
target = 'target'
len(features)

['bin_0',
 'bin_1',
 'bin_2',
 'bin_3',
 'bin_4',
 'nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9',
 'ord_0',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_5',
 'day',
 'month']

23

## LabelEncode categorical columns

In [18]:
for feat in features:
    label_encoder = preprocessing.LabelEncoder()
    data[feat] = label_encoder.fit_transform(data[feat].fillna('-1').astype(str).values)

In [19]:
data.shape
data.isna().sum().sum()

data.head()

(1000000, 25)

0

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,1,1,1,1,1,3,5,4,6,...,28,3,1,4,3,21,57,6,6,0
1,1,2,2,1,1,2,3,4,1,0,...,2113,3,3,6,5,24,151,7,10,0
2,2,1,2,1,1,1,3,0,4,1,...,0,3,0,3,14,16,106,5,12,0
3,3,0,1,1,1,1,3,1,4,4,...,2168,1,5,5,1,3,0,3,6,0
4,4,1,0,1,2,1,3,6,4,3,...,1748,3,3,2,8,3,51,5,4,0


### Number of unique levels in each feature

In [20]:
data.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,1,1,1,1,1,3,5,4,6,...,28,3,1,4,3,21,57,6,6,0
1,1,2,2,1,1,2,3,4,1,0,...,2113,3,3,6,5,24,151,7,10,0
2,2,1,2,1,1,1,3,0,4,1,...,0,3,0,3,14,16,106,5,12,0
3,3,0,1,1,1,1,3,1,4,4,...,2168,1,5,5,1,3,0,3,6,0
4,4,1,0,1,2,1,3,6,4,3,...,1748,3,3,2,8,3,51,5,4,0


In [21]:
# feature_num_unique_levels = [data[feat].nunique() for feat in features]
# len(feature_num_unique_levels)
# feature_num_unique_levels

print(f"feature: num_unique_levels \n-------------------------")
for feature in features:
    
    print(f'{feature}: {data[feature].nunique()}')

feature: num_unique_levels 
-------------------------
bin_0: 3
bin_1: 3
bin_2: 3
bin_3: 3
bin_4: 3
nom_0: 4
nom_1: 7
nom_2: 7
nom_3: 7
nom_4: 5
nom_5: 1221
nom_6: 1521
nom_7: 223
nom_8: 223
nom_9: 2219
ord_0: 4
ord_1: 6
ord_2: 7
ord_3: 16
ord_4: 27
ord_5: 191
day: 8
month: 13


## Split train and test back

In [22]:
train = data.loc[data.target!=-1, :]
test = data.loc[data.target==-1, :]

train.shape, test.shape

((600000, 25), (400000, 25))

In [23]:
train.head()
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,1,1,1,1,1,3,5,4,6,...,28,3,1,4,3,21,57,6,6,0
1,1,2,2,1,1,2,3,4,1,0,...,2113,3,3,6,5,24,151,7,10,0
2,2,1,2,1,1,1,3,0,4,1,...,0,3,0,3,14,16,106,5,12,0
3,3,0,1,1,1,1,3,1,4,4,...,2168,1,5,5,1,3,0,3,6,0
4,4,1,0,1,2,1,3,6,4,3,...,1748,3,3,2,8,3,51,5,4,0


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,600000,1,1,1,1,2,1,2,1,4,...,2197,3,5,1,6,21,147,3,12,-1
1,600001,1,1,1,1,2,3,1,5,6,...,1108,1,5,2,14,14,0,2,11,-1
2,600002,1,1,1,1,2,1,1,1,6,...,813,1,2,6,9,14,13,2,9,-1
3,600003,2,1,1,1,1,3,2,1,3,...,997,1,2,4,13,2,1,1,9,-1
4,600004,1,1,2,1,2,3,1,0,4,...,372,1,1,5,15,10,15,3,6,-1


## Split into train and validation

In [24]:
X = train.drop(columns=['id', 'target'])
y = train['target']

In [25]:
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_X.shape,  len(train_y)
val_X.shape, len(val_y)

((480000, 23), 480000)

((120000, 23), 120000)

## Feature selection

In [26]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [27]:
select_best = SelectKBest(chi2, k=15)
rf_model = RandomForestClassifier(oob_score=True)

In [28]:
pipe = Pipeline([('select_best', select_best), ('clf', rf_model)])

## Hyper-param tuning

In [29]:
hyper_params_space = {
    'select_best__k': [10, 15, 20],
    'clf__max_depth': [7, 10],
    'clf__n_estimators': [100, 150, 200, 225]
}

In [30]:
import os
n_cpu = os.cpu_count()

n_jobs = n_cpu - 2
print(f"n_cpu: {n_cpu}, n_jobs: {n_jobs}")

n_cpu: 10, n_jobs: 8


In [31]:
clf = GridSearchCV(pipe, hyper_params_space, cv=3, n_jobs=n_jobs)

In [32]:
clf.fit(train_X, train_y)

2023/05/24 18:31:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '90bbffd49ed34113876092d4faeb713f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/05/24 18:39:02 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.


In [35]:
clf.best_params_

{'clf__max_depth': 10, 'clf__n_estimators': 100, 'select_best__k': 10}

In [36]:
clf.refit
clf.get_params()

True

{'cv': 3,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('select_best',
   SelectKBest(k=15, score_func=<function chi2 at 0x288a99ea0>)),
  ('clf', RandomForestClassifier(oob_score=True))],
 'estimator__verbose': False,
 'estimator__select_best': SelectKBest(k=15, score_func=<function chi2 at 0x288a99ea0>),
 'estimator__clf': RandomForestClassifier(oob_score=True),
 'estimator__select_best__k': 15,
 'estimator__select_best__score_func': <function sklearn.feature_selection._univariate_selection.chi2(X, y)>,
 'estimator__clf__bootstrap': True,
 'estimator__clf__ccp_alpha': 0.0,
 'estimator__clf__class_weight': None,
 'estimator__clf__criterion': 'gini',
 'estimator__clf__max_depth': None,
 'estimator__clf__max_features': 'sqrt',
 'estimator__clf__max_leaf_nodes': None,
 'estimator__clf__max_samples': None,
 'estimator__clf__min_impurity_decrease': 0.0,
 'estimator__clf__min_samples_leaf': 1,
 'estimator__clf__min_samples_split': 2,
 'estimator__clf__min_weight_fr

In [49]:
def get_eval_metrics(model, test_X, test_y):
    # X = test_data.drop(columns=[target_column, 'id'])
    X = test_X
    y = test_y

    prediction_probs = model.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y, prediction_probs, pos_label=1)

    auc = metrics.auc(fpr, tpr)

    return auc

    

In [50]:
auc = get_eval_metrics(clf, val_X, val_y)
auc  # 0.7060452735035473

0.7064317548651491

In [65]:
pipe.fit(train_X, train_y)

In [67]:
pipe.feature_names_in_

array(['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8',
       'nom_9', 'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5',
       'day', 'month'], dtype=object)

In [68]:
dir(pipe)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_is_fitted__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_can_inverse_transform',
 '_can_transform',
 '_check_feature_names',
 '_check_fit_params',
 '_check_n_features',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_param_names',
 '_get_params',
 '_get_tags',
 '_iter',
 '_log_message',
 '_more_tags',
 '_replace_estimator',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_set_params',
 '_sk_visual_block_',
 '_validate_data',
 '_validate_names',
 '_validate_params',
 '_validate_steps',
 

In [70]:
pipe.get_params()

{'memory': None,
 'steps': [('select_best',
   SelectKBest(k=15, score_func=<function chi2 at 0x28e2ec280>)),
  ('clf', RandomForestClassifier(oob_score=True))],
 'verbose': False,
 'select_best': SelectKBest(k=15, score_func=<function chi2 at 0x28e2ec280>),
 'clf': RandomForestClassifier(oob_score=True),
 'select_best__k': 15,
 'select_best__score_func': <function sklearn.feature_selection._univariate_selection.chi2(X, y)>,
 'clf__bootstrap': True,
 'clf__ccp_alpha': 0.0,
 'clf__class_weight': None,
 'clf__criterion': 'gini',
 'clf__max_depth': None,
 'clf__max_features': 'sqrt',
 'clf__max_leaf_nodes': None,
 'clf__max_samples': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__n_estimators': 100,
 'clf__n_jobs': None,
 'clf__oob_score': True,
 'clf__random_state': None,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [72]:
prediction_probs = pipe.predict_proba(val_X)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(val_y, prediction_probs, pos_label=1)

auc = metrics.auc(fpr, tpr)
auc

0.7039908487141491

## Model training

### With base RF model 

- no hyper-param tuning

In [94]:
rf_model = RandomForestClassifier(oob_score=True)
rf_model.fit(train_X, train_y)

In [97]:
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': True,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [98]:
oob_accuracy = rf_model.oob_score_
oob_error = 1 - rf_model.oob_score_
print(f"oob_accuracy: {oob_accuracy: 0.3f}")

oob_accuracy:  0.814


In [99]:
X_tmp = train_X.head(6)
y_tmp = train_y[:6]

X_tmp
y_tmp

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
126647,1,1,1,2,1,3,0,1,5,4,...,81,501,2,3,3,4,13,134,6,12
369096,1,0,1,1,2,3,2,5,1,1,...,217,706,1,3,3,14,1,34,5,4
190780,1,1,2,2,2,3,6,3,6,4,...,33,1589,3,2,3,14,5,164,7,11
451372,1,1,2,2,2,3,1,4,0,4,...,144,2059,3,5,1,8,16,66,2,11
68645,1,1,1,1,2,3,5,3,5,4,...,17,1044,1,3,4,5,13,159,7,6
574690,1,2,2,2,1,3,6,4,5,1,...,82,217,3,1,3,8,2,97,5,6


126647    0
369096    0
190780    1
451372    0
68645     0
574690    0
Name: target, dtype: int64

In [100]:
from sklearn import metrics

In [102]:
auc = get_eval_metrics(model=rf_model, test_data=val_df, target_column='target')
auc  # 0.72

0.7211612310673776

## <font color='red'> Todo </font>
1. Include learning rate scheudler  <font color='green'> **Done** </font>
2. Write prediction code, generate submission file, and submit  <font color='green'> **Done** </font>
3. Train RF model and see the performance on raw-data
4. Integrate `mlflow` to track experiments
4. Train RF model on the generated embeddings from NN model and see the performance
5. Keep pushing code to github

## Prediction

In [203]:
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,600000,1,1,1,1,2,1,2,1,4,...,2197,3,5,1,6,21,147,3,12,-1
1,600001,1,1,1,1,2,3,1,5,6,...,1108,1,5,2,14,14,0,2,11,-1
2,600002,1,1,1,1,2,1,1,1,6,...,813,1,2,6,9,14,13,2,9,-1
3,600003,2,1,1,1,1,3,2,1,3,...,997,1,2,4,13,2,1,1,9,-1
4,600004,1,1,2,1,2,3,1,0,4,...,372,1,1,5,15,10,15,3,6,-1


### Generate submission file 

In [197]:
submission_df = pd.DataFrame(columns=['id', 'target'])
submission_df
submission_df['id'] = test.id
submission_df['target'] = predictions

submission_df.shape
submission_df.head()

Unnamed: 0,id,target


(400000, 2)

Unnamed: 0,id,target
0,600000,0.177918
1,600001,0.291743
2,600002,0.130553
3,600003,0.062774
4,600004,0.128792


In [198]:
submission_df.to_csv('../data/output/catg_enc_submission.csv', index=False)

## Upload submission file to Kaggle