## Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score

from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

## Data Loading

In [2]:
basedir = r"C:/Pro/Cours/A5 - IPP/DataCamp/Individual_Ramp_Challenge/Datacamp-Challenge-Volcanic-events-prediction-from-tephras/data/"

train = pd.read_csv(basedir + 'train_imputed.csv')
test = pd.read_csv(basedir + 'test_imputed.csv')
groups = pd.read_csv(basedir + 'train.csv')["SampleID"].astype("category").cat.codes.to_numpy()

In [3]:
X_train = train.drop("Event", axis=1)
y_train = train["Event"]

X_test = test.drop("Event", axis=1)
y_test = test["Event"]

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [4]:
majors = ['SiO2_normalized', 'TiO2_normalized', 'Al2O3_normalized',
          'FeOT_normalized',
          # 'FeO_normalized', 'Fe2O3_normalized', 'Fe2O3T_normalized',
          'MnO_normalized', 'MgO_normalized', 'CaO_normalized',
          'Na2O_normalized', 'K2O_normalized',
          # 'P2O5_normalized','Cl_normalized'
          ]
traces = ['Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Cs', 'Ba', 'La',
          'Ce', 'Pr', 'Nd', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy',
          'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'Pb',
          'Th', 'U']

label = "Event"

In [5]:

# get column index of majors and traces

majors_idx = [X_train.columns.get_loc(c) for c in majors if c in X_train]
traces_idx = [X_train.columns.get_loc(c) for c in traces if c in X_train]

In [6]:
majors_idx

[]

In [7]:
traces_idx

[]

## K-Nearest Neighbors

In [8]:
"""
class Classifier(BaseEstimator):
    def __init__(self):
        self.transformer = Pipeline(
            steps=[
                ("imputer_majors", IterativeImputer(random_state=0, estimator=BayesianRidge(), max_iter=10)),
                ("imputer_traces", IterativeImputer(random_state=0, estimator=SVR(kernel='linear'), max_iter=10)),
                ("scaler", MinMaxScaler()),
            ]
        )
        self.model = KNeighborsClassifier(n_neighbors=5)
        self.pipe = make_pipeline(self.transformer, self.model)

    def fit(self, X, y):
        self.pipe.fit(X, y)

    def predict(self, X):
        return self.pipe.predict(X)

    def predict_proba(self, X):
        return self.pipe.predict_proba(X)
"""

'\nclass Classifier(BaseEstimator):\n    def __init__(self):\n        self.transformer = Pipeline(\n            steps=[\n                ("imputer_majors", IterativeImputer(random_state=0, estimator=BayesianRidge(), max_iter=10)),\n                ("imputer_traces", IterativeImputer(random_state=0, estimator=SVR(kernel=\'linear\'), max_iter=10)),\n                ("scaler", MinMaxScaler()),\n            ]\n        )\n        self.model = KNeighborsClassifier(n_neighbors=5)\n        self.pipe = make_pipeline(self.transformer, self.model)\n\n    def fit(self, X, y):\n        self.pipe.fit(X, y)\n\n    def predict(self, X):\n        return self.pipe.predict(X)\n\n    def predict_proba(self, X):\n        return self.pipe.predict_proba(X)\n'

In [9]:
"""
class Classifier(BaseEstimator):
    def __init__(self):
        self.transformer = Pipeline([
            ("impute_scale", ColumnTransformer([
                ("imputer_majors", IterativeImputer(random_state=0, estimator=BayesianRidge(), max_iter=10), majors),
                ("imputer_traces", IterativeImputer(random_state=0, estimator=SVR(kernel='linear'), max_iter=10), traces),
            ], remainder='passthrough')),
            ("scaler", MinMaxScaler()),
        ])
        self.model = KNeighborsClassifier(n_neighbors=5)
        self.pipe = make_pipeline(self.transformer, self.model)

    def fit(self, X, y):
        self.pipe.fit(X, y)

    def predict(self, X):
        return self.pipe.predict(X)

    def predict_proba(self, X):
        return self.pipe.predict_proba(X)
"""

'\nclass Classifier(BaseEstimator):\n    def __init__(self):\n        self.transformer = Pipeline([\n            ("impute_scale", ColumnTransformer([\n                ("imputer_majors", IterativeImputer(random_state=0, estimator=BayesianRidge(), max_iter=10), majors),\n                ("imputer_traces", IterativeImputer(random_state=0, estimator=SVR(kernel=\'linear\'), max_iter=10), traces),\n            ], remainder=\'passthrough\')),\n            ("scaler", MinMaxScaler()),\n        ])\n        self.model = KNeighborsClassifier(n_neighbors=5)\n        self.pipe = make_pipeline(self.transformer, self.model)\n\n    def fit(self, X, y):\n        self.pipe.fit(X, y)\n\n    def predict(self, X):\n        return self.pipe.predict(X)\n\n    def predict_proba(self, X):\n        return self.pipe.predict_proba(X)\n'

In [10]:
class Classifier(BaseEstimator):
    def __init__(self):
        self.transformer = Pipeline([
            ("impute_scale", ColumnTransformer([
                ("imputer_majors", IterativeImputer(random_state=0, estimator=BayesianRidge(), max_iter=10), majors),
                ("imputer_traces", SimpleImputer(strategy="median"), traces),
            ], remainder='passthrough')),
            ("scaler", StandardScaler()),
        ])
        self.model = HistGradientBoostingClassifier()
        self.pipe = make_pipeline(self.transformer, self.model)

    def fit(self, X, y):
        self.pipe.fit(X, y)

    def predict(self, X):
        return self.pipe.predict(X)

    def predict_proba(self, X):
        return self.pipe.predict_proba(X)

##### grid search

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6220, 35)
(839, 35)
(6220,)
(839,)


In [17]:
params = {'max_iter': [1000,1200,1500],
          'learning_rate': [0.1],
          'max_depth' : [25, 50, 75],
          'l2_regularization': [1.5],
          'scoring': ['f1_micro']
}

gs = GridSearchCV(HistGradientBoostingClassifier(),
                  param_grid=params,
                  cv=StratifiedGroupKFold(n_splits=2),
                  scoring="balanced_accuracy",
                  verbose=1)

gs.fit(X_train, y_train, groups=groups)
print("Score: ", gs.score(X_test, y_test), "\n\n")
print(gs.best_params_)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
Score:  0.8054194396841455 


{'l2_regularization': 1.5, 'learning_rate': 0.1, 'max_depth': 25, 'max_iter': 1500, 'scoring': 'f1_micro'}


In [97]:
clf = Classifier()
clf.fit(X_train, y_train)
sf = StratifiedGroupKFold(n_splits=5)
sf.split(X_train, y_train, groups)
cross_val_score(clf, X_train, y_train, groups=groups, cv=sf, scoring='accuracy').mean()

0.5751250483243812

In [None]:
!ramp-test --submission submissions

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Pro\Cours\A5 - IPP\DataCamp\data_camp_venv\Scripts\ramp-test.exe\__main__.py", line 7, in <module>
  File "C:\Pro\Cours\A5 - IPP\DataCamp\data_camp_venv\Lib\site-packages\rampwf\utils\cli\testing.py", line 117, in start
    main()
  File "C:\Pro\Cours\A5 - IPP\DataCamp\data_camp_venv\Lib\site-packages\click\core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Pro\Cours\A5 - IPP\DataCamp\data_camp_venv\Lib\site-packages\click\core.py", line 1078, in main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "C:\Pro\Cours\A5 - IPP\DataCamp\data_camp_venv\Lib\site-packages\click\core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Pro\Cours\A5 - IPP\DataCamp\data_camp_venv\Lib\sit