In [1]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

import time

In [2]:
# Read the data
csv_train_file = pd.read_csv('./siim-isic-melanoma-classification/train.csv')
csv_test_file = pd.read_csv('./siim-isic-melanoma-classification/test.csv')

# Separate target from predictors
y = csv_train_file.target
csv_train_file.drop(['target', 'benign_malignant', 'diagnosis'], axis=1, inplace=True)

X_train_load = csv_train_file
X_test_load = csv_test_file

In [3]:
print(f'Number of Training Examples = {X_train_load.shape[0]}')
print(f'Number of Test Examples = {X_test_load.shape[0]}\n')

print(f'Training X Shape = {X_train_load.shape}')
print(f'Training y Shape = {y.shape}\n')

print(f'Test X Shape = {X_test_load.shape}')

Number of Training Examples = 33126
Number of Test Examples = 10982

Training X Shape = (33126, 5)
Training y Shape = (33126,)

Test X Shape = (10982, 5)


In [4]:
def concat_df(X_train, X_test):
    # Returns a concatenated df of training and test set
    return pd.concat([X_train, X_test], sort=True).reset_index(drop=True)

def divide_df(all_data):
    # Returns divided dfs of training and test set
    return all_data.loc[:33125], all_data.loc[33126:]

In [5]:
X_all = concat_df(X_train_load, X_test_load)

X_all.shape

(44108, 5)

In [6]:
# Select categorical columns
categorical_cols = [cname for cname in X_all.columns if
                    X_all[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_all.columns if 
                X_all[cname].dtype in ['int64', 'float64']]

# Keep selected columns only - only relevant if we want to discard any columns on X_train (unlikely)
my_cols = categorical_cols + numerical_cols
X_all_card = X_all[my_cols].copy()

X_all_card.shape

(44108, 5)

In [7]:
X_train, X_test = divide_df(X_all_card)

print(f'Number of Training Examples = {X_train.shape[0]}')
print(f'Number of Test Examples = {X_test.shape[0]}')

Number of Training Examples = 33126
Number of Test Examples = 10982


In [8]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
                                ('imputern', SimpleImputer(strategy='constant')),
                                ('scaler', StandardScaler())
                                 ])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
                                ('imputerc', SimpleImputer(strategy='constant')),
                                ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
                                ])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [17]:
def get_score(param_grid):

    score_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(objective='binary:logistic',
                               learning_rate=0.01,
                               tree_method='gpu_hist'
                              ))])
  
    
    search = GridSearchCV(score_pipeline, 
                          param_grid, 
                          #n_jobs=4,
                          cv=5,
                          verbose=3,
                          scoring='roc_auc'
                         )
    
    search.fit(X_train, y)
    
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search

In [15]:
param_grid = {
'preprocessor__num__imputern__strategy': ['constant', 'most_frequent'],
'preprocessor__cat__imputerc__strategy': ['constant', 'most_frequent'],
'model__n_estimators': range(100, 200, 100)
}

start = time.time()
grid = get_score(param_grid)
elapsed = time.time() - start

print("Time taken: ", elapsed)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant, score=0.682, total=  31.3s
[CV] model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.3s remaining:    0.0s


[CV]  model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant, score=0.719, total=  30.4s
[CV] model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


[CV]  model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant, score=0.657, total=  30.3s
[CV] model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant 
[CV]  model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant, score=0.659, total=  30.4s
[CV] model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant 
[CV]  model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=constant, score=0.726, total=  30.0s
[CV] model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=most_frequent 
[CV]  model__n_estimators=100, preprocessor__cat__imputerc__strategy=constant, preprocessor__num__imputern__strategy=most_frequent, score=0.682, total=  30

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 10.2min finished


Best parameter (CV score=0.688):
{'model__n_estimators': 100, 'preprocessor__cat__imputerc__strategy': 'constant', 'preprocessor__num__imputern__strategy': 'constant'}
Time taken:  652.7218263149261


In [18]:
# Build final model based on best parameters

start = time.time()
submission_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(objective ='binary:logistic',
                           random_state=0,
                           n_estimators=100,
                           learning_rate=0.01,
                           tree_method='gpu_hist'
                          ))
    ])

submission_pipeline.fit(X_train, y)
elapsed = time.time() - start
print("Time taken: ", elapsed)

Time taken:  42.81570482254028


In [19]:
preds_test = submission_pipeline.predict(X_test)

In [20]:
# Save test predictions to file
output = pd.DataFrame({'image_name': X_test.image_name,
                       'target': preds_test})
output.to_csv('./cleaned_csvs/csv_only_submission.csv', index=False)