In [1]:
!kaggle competitions download -c playground-series-s4e8

playground-series-s4e8.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
import zipfile
file_name = "playground-series-s4e8.zip"
output_dir = "input"
zip_file = zipfile.ZipFile(file_name)
zip_file.extractall(path=output_dir)

------------

In [3]:
!pip install -q autogluon==1.1.1

  You can safely remove it manually.


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import metrics
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
CONFIG = {
    "n_folds" : 5,
    "fold" : 0,
    "seed" : 42,
    "drop_cols" : ["id"],
    "target" : "class",
    "presets" : "optimize_for_deployment",
}

In [6]:
train_path = 'input/train.csv'
test_path = 'input/test.csv'
sub_path = 'input/sample_submission.csv'

In [7]:
df_train = pd.read_csv(train_path)
df_train.shape

(3116945, 22)

In [8]:
# class is the binary target (either e or p)
df_train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [9]:
df_train.columns

Index(['id', 'class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')

In [10]:
dict_class2label = {"e":0, "p":1}
dict_label2class = {0:"e", 1:"p",False:"e", True:"p"}
df_train[CONFIG["target"]] = df_train[CONFIG["target"]].apply(lambda x: dict_class2label[x])
df_train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,0,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,1,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,0,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,0,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,0,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [11]:
kf = KFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG["seed"])

for fold, ( _, val_) in enumerate(kf.split(df_train, df_train[CONFIG["target"]])):
      df_train.loc[val_ , "kfold"] = int(fold)

In [12]:
def prepare_datasets(df, fold, drop_columns=[]):
    _df_train = df[df.kfold != fold].reset_index(drop=True)
    _df_valid = df[df.kfold == fold].reset_index(drop=True)

    # drop "id" column
    _df_train = _df_train.drop(columns=drop_columns)
    _df_valid = _df_valid.drop(columns=drop_columns)

    train_data = TabularDataset(_df_train)
    valid_data = TabularDataset(_df_valid)
    return train_data, valid_data

In [13]:
train_data, valid_data = prepare_datasets(
    df_train, CONFIG["fold"], drop_columns=CONFIG["drop_cols"]
)

In [14]:
train_data.shape

(2493556, 22)

In [15]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html
# https://auto.gluon.ai/0.3.0/tutorials/tabular_prediction/tabular-custom-metric.html
ag_mcc_scorer = make_scorer(name='mcc',
                                 score_func=metrics.matthews_corrcoef,
                                 optimum=1,
                                 greater_is_better=True)

In [16]:
# predictor = TabularPredictor(
#     label=CONFIG["target"],
#     problem_type="binary",
#     path="predictor",
# )

In [17]:
# %%time
# predictor.fit(
#     train_data,
#     tuning_data=valid_data,
#     save_space=True,
#     presets=CONFIG["presets"],
#     use_bag_holdout=True,
#     ag_args_fit={'num_gpus': 1},
# )

In [18]:
# predictor.leaderboard(valid_data, extra_metrics=[ag_mcc_scorer], silent=True)

In [19]:
# #predictor.leaderboard(valid_data, extra_metrics=[ag_mcc_scorer], silent=True)
# predictor.leaderboard()

In [24]:
predictor = TabularPredictor.load("predictor",require_py_version_match=False)

Found 2 mismatches between original and current metadata:


In [25]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.992473,accuracy,78.919607,2461.855178,0.028786,8.970034,2,True,6
1,LightGBMXT,0.992363,accuracy,17.849845,180.977263,17.849845,180.977263,1,True,1
2,LightGBMLarge,0.992262,accuracy,46.967245,433.327905,46.967245,433.327905,1,True,5
3,NeuralNetTorch,0.992077,accuracy,4.241214,1448.539958,4.241214,1448.539958,1,True,4
4,RandomForestGini,0.991995,accuracy,4.916825,246.639827,4.916825,246.639827,1,True,2
5,ExtraTreesEntr,0.991719,accuracy,4.915693,143.400192,4.915693,143.400192,1,True,3


In [26]:
df_test = pd.read_csv(test_path)
df_test = df_test.drop(columns=CONFIG["drop_cols"])
test_data = TabularDataset(df_test)

In [27]:
test_data.shape

(2077964, 20)

In [29]:
import sklearn
print(sklearn.__version__)

1.4.0


In [28]:
df_test["pred_label"] = predictor.predict(test_data)
df_test["pred_class"] = df_test["pred_label"].apply(lambda x: dict_label2class[x])

df_test.to_csv("preds.csv", index=False)
display(df_test[["pred_label", "pred_class"]])

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


AttributeError: 'DecisionTreeClassifier' object has no attribute 'monotonic_cst'

In [None]:
df_sub = pd.read_csv(sub_path)
df_sub[ CONFIG["target"] ] = df_test["pred_class"]
df_sub.to_csv("submission.csv", index=False)