In [1]:
import pathlib

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import ensemble ,compose, impute, metrics, linear_model, pipeline, preprocessing

In [2]:
INPUT_DIR = pathlib.Path("/kaggle/input/kaust-academy-ai-week-november-2022")
WORKING_DIR = pathlib.Path("/kaggle/working")

In [3]:
!ls -lh $INPUT_DIR

total 1.2M
-rw-r--r-- 1 nobody nogroup  59K Nov 27 05:24 sample_submission.csv
-rw-r--r-- 1 nobody nogroup 364K Nov 27 05:24 test.csv
-rw-r--r-- 1 nobody nogroup 787K Nov 27 05:24 train.csv


In [4]:
# Q1: load the training data
train_df = pd.read_csv(INPUT_DIR / "train.csv", index_col=0)
train_df.head(0)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1


In [5]:
train_df["ShoppingMall"].nunique()

1115

In [6]:
train_features = train_df.drop("Transported", axis=1)
train_target = train_df["Transported"]

In [7]:
boolean_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
)

categorical_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    preprocessing.OneHotEncoder(),
)

numeric_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="mean")
)

data_preprocessing = compose.make_column_transformer(
    (boolean_preprocessing, ["CryoSleep", "VIP"]),
    (categorical_preprocessing, ["HomePlanet", "Destination"]),
    (numeric_preprocessing, compose.make_column_selector(dtype_include=np.float64)),
    remainder = "drop",
)


In [8]:
ensemble.GradientBoostingClassifier?

In [9]:
_seed = 42
_hyperparameters = {
    "learning_rate": 0.08,
    "n_estimators": 180,
    "subsample": 0.8,
    "min_weight_fraction_leaf": 0.008,
    "tol": 0.00008,
    "min_samples_split": 8,
    #"max_features": "log2"
}

estimator = ensemble.GradientBoostingClassifier(**_hyperparameters)

ml_pipeline = pipeline.make_pipeline(
    data_preprocessing,
    estimator
)
_ = ml_pipeline.fit(train_features, train_target)

In [10]:
# Q3: load the testing features
test_features = pd.read_csv(INPUT_DIR / "test.csv", index_col=0)

In [11]:
predictions = ml_pipeline.predict(test_features)

In [12]:
# Q4: load the sample submission file!
sample_submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv", index_col=0)

In [13]:
_ = (pd.DataFrame({"Transported": predictions}, index=sample_submission_df.index)
       .to_csv(WORKING_DIR / "submission.csv"))  

In [14]:
!cat $WORKING_DIR/submission.csv | head

PassengerId,Transported
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
0027_01,True
0029_01,True
0032_01,True
0032_02,True


In [15]:
_train_predictions = ml_pipeline.predict(train_features)
_report = metrics.classification_report(
    _train_predictions,
    train_target
)
print(_report)

              precision    recall  f1-score   support

       False       0.77      0.83      0.80      4022
        True       0.84      0.79      0.82      4671

    accuracy                           0.81      8693
   macro avg       0.81      0.81      0.81      8693
weighted avg       0.81      0.81      0.81      8693

