In [1]:
# The Usuals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn stuff
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# category encoders
!pip install category-encoders
import category_encoders as ce



## Data

In [2]:
!chmod 777 ../Data/train.csv
!chmod 777 ../Data/test.csv

# get train and test sets
trainval = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

# Find unique values 
trainval_molecules = trainval['molecule_name'].unique()
test_molecules = test['molecule_name'].unique()

# Train/Val Split
train_molecules, val_molecules = train_test_split(
    trainval_molecules, random_state=32)

train = trainval[trainval['molecule_name'].isin(train_molecules)]
val = trainval[trainval['molecule_name'].isin(val_molecules)]

In [3]:
train.shape, val.shape, test.shape

((3492283, 6), (1165864, 6), (2505542, 5))

# Feature and Target Selection

In [4]:
# features & target
features = ['type']
target = 'scalar_coupling_constant'

# Train
X_train = train[features]
y_train = train[target]

# Val
X_val = val[features]
y_val = val[target]

# Encode

In [5]:
# Encode using the ordinal encoder
encoder = ce.OrdinalEncoder()
X_train = pd.DataFrame(encoder.fit_transform(X_train.values))
X_val = pd.DataFrame(encoder.transform(X_val.values))

# sklearn Model

In [6]:
# metric
groups = X_train[0]

def group_mean_log_mae(y_true, y_pred, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(groups).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [7]:
# Instantiate rf
rf = RandomForestRegressor(n_estimators=100,
                           max_depth=25,
                           random_state=32)

# fit
rf.fit(X_train, y_train)

# make predictions
y_pred = rf.predict(X_val)

# print accuracy
print(group_mean_log_mae(y_val, y_pred))

1.4529616324460817


# sklearn Hyperparameter Optimization

In [9]:
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    RandomForestRegressor(random_state=42)
)

param_distributions = {
    'randomforestregressor__n_estimators': randint(50, 500), 
    'randomforestregressor__max_features': uniform(), 
    'randomforestregressor__min_samples_leaf':  [1, 10, 100]
}

search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_distributions, 
    n_iter=5, 
    cv=2, 
    scoring='neg_mean_squared_error', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train, y_train, groups=groups);

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  1.7min remaining:  4.0min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  3.1min remaining:  3.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  3.9min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.4min finished


In [10]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation RMSLE', np.sqrt(-search.best_score_))

Best hyperparameters {'randomforestregressor__max_features': 0.3648470255325348, 'randomforestregressor__min_samples_leaf': 10, 'randomforestregressor__n_estimators': 80}
Cross-validation RMSLE 8.114835486980716


# Another sklearn Model

In [15]:
# Instantiate rf
rf = RandomForestRegressor(n_estimators=80,
                           max_features=0.36,
                           min_samples_leaf=10,
                           max_depth=25,
                           random_state=32)

# fit
rf.fit(X_train, y_train)

# make predictions
y_pred = rf.predict(X_val)

# print accuracy
print(group_mean_log_mae(y_val, y_pred))

1.452978974603622


# XGBoost Model

In [11]:
from xgboost import XGBRegressor

# instantiate model
model = XGBRegressor(n_estimators=100, max_depth=10, random_state=32)

# fit model
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_val)

# print accuracy
print(group_mean_log_mae(y_val, y_pred))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


1.4529320990612868


# XGBoost Hyperparameter Optimization

In [12]:
from xgboost import XGBRegressor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    XGBRegressor(random_state=42)
)

param_distributions = {
    'xgbregressor__n_estimators': randint(100, 1000), 
    'xgbregressor__max_depth': randint(3, 50)
}

search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_distributions, 
    n_iter=5, 
    cv=2, 
    scoring='neg_mean_squared_error', 
    verbose=10, 
    return_train_score=True, 
    n_jobs=-1
)

search.fit(X_train, y_train, groups=groups);

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  6.2min remaining: 14.5min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  6.3min remaining:  6.3min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  6.4min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  8.5min finished




In [13]:
print('Best hyperparameters', search.best_params_)
print('Cross-validation RMSLE', np.sqrt(-search.best_score_))

Best hyperparameters {'xgbregressor__max_depth': 39, 'xgbregressor__n_estimators': 893}
Cross-validation RMSLE 8.114939925730262


# Another XGBoost Model

In [16]:
from xgboost import XGBRegressor

# instantiate model
model = XGBRegressor(n_estimators=893, max_depth=39, random_state=32)

# fit model
model.fit(X_train, y_train)

# make predictions
y_pred = model.predict(X_val)

# print accuracy
print(group_mean_log_mae(y_val, y_pred))

1.4529622464242213
