In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tools import *

import os
import warnings
warnings.filterwarnings('ignore')

import gc
import joblib

# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit, cross_val_score

In [2]:
# Directory holding cached artifacts (feature pickles, saved models).
# Set the KAGGLE_CACHE_PATH environment variable to point elsewhere; the default
# is the original author's local directory, so existing setups keep working.
CACHE_PATH = os.environ.get('KAGGLE_CACHE_PATH', '/Users/angus/Downloads/kaggle/')


def cpath(x):
    """Return the path of the cache file named `x` inside CACHE_PATH."""
    # os.path.join tolerates a CACHE_PATH with or without a trailing separator.
    return os.path.join(CACHE_PATH, x)

In [3]:
# Load the cached feature frame (pickle under CACHE_PATH) and the structures CSV.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading — only read
# pickle files that come from a trusted source.
df = pd.read_pickle(cpath('df_feats_enc.pkl'))
df_structures = pd.read_csv('../input/structures.csv');

In [4]:
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code — only load
# artifacts you produced yourself. `last_model` is not referenced by any other visible cell.
%time last_model = joblib.load(cpath('latest_model.joblib'))

CPU times: user 2.83 s, sys: 3.4 s, total: 6.23 s
Wall time: 6.37 s


In [18]:
# Replace `df` with the result of get_portion(df, r=0.1) — presumably a ~10% subset of the
# data for faster iteration; confirm against tools.get_portion.
# NOTE(review): the cell rebinds `df` to its own output, so re-running it applies
# get_portion to the already-reduced frame. Reload `df` before running it again.
df = get_portion(df, r=0.1)

# Feature Selection

In [30]:
# Candidate model inputs. Commented-out names are columns deliberately left out
# (identifiers, targets and quantities that are not used as inputs here).
feats = [
#     'molecule_name', 'atom_index_0', 'atom_index_1', 'scalar_coupling_constant',
#     'fc', 'sd', 'pso', 'dso',
#     'mulliken_charge',
#     'dip_x', 'dip_y', 'dip_z',
#     'potential_energy',
#     'XX', 'YX', 'ZX', 'XY','YY', 'ZY', 'XZ', 'YZ', 'ZZ',
#     'atom_0',
    'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1',
    'dist',
#     'n_C', 'n_H', 'n_N',
    'mol_min_dist', 'mol_max_dist', 'mol_mean_dist', 'mol_med_dist', 'mol_kur_dist', 'mol_std_dist',
    'atom_0_min_dist', 'atom_0_max_dist', 'atom_0_mean_dist', 'atom_0_med_dist', 'atom_0_kur_dist', 'atom_0_std_dist',
    'atom_1_min_dist', 'atom_1_max_dist', 'atom_1_mean_dist', 'atom_1_med_dist', 'atom_1_kur_dist', 'atom_1_std_dist',
    'nearby_C', 'nearby_H', 'nearby_N',
    'type',
    'atom_1',
]
# Names of the predicted-constant columns added later as meta-features.
meta_feats = ['fc_pred', 'sd_pred', 'pso_pred', 'dso_pred']
# Categorical columns and bookkeeping columns are excluded from the numeric inputs.
categoricals = ['type', 'atom_1']
info_cols = ['molecule_name', 'atom_index_0', 'atom_index_1', 'scalar_coupling_constant']
_excluded = set(categoricals) | set(info_cols)
# De-duplicate while keeping the declared order. A plain list(set(...)) yields an
# arbitrary order that can differ between kernel sessions, which changes the column
# order seen by the models and makes saved models and results hard to reproduce.
feats = [name for name in dict.fromkeys(feats) if name not in _excluded]

In [29]:
%%time
# NOTE(review): this cell calls constant_model, which this notebook only defines in a later
# cell, and its recorded execution count (29) is lower than that of the feature-selection
# cell above it (30). On a fresh Restart & Run All it fails unless `tools` also exports
# constant_model. It repeats the 'fc' cell further down, minus the X_final prediction.
target = 'fc'
# Inputs plus 'molecule_name', which constant_model uses as the CV grouping key.
X = df.loc[:, feats + ['molecule_name']]
# constant_model reads this module-level `y` as its training target.
y = df.loc[:, [target]]

fc_model = constant_model(X, feats, target)
# Predictions on the same rows the model was fitted on (in-sample).
df[f"{target}_pred"] = fc_model.predict(X[feats])
importances(feats, fc_model)
del fc_model
gc.collect()

TARGET: fc
FEATURES (n = 28):
z_0                 atom_0_mean_dist    atom_0_std_dist
atom_0_min_dist     atom_1_min_dist     atom_1_med_dist
nearby_C            nearby_N            x_0
mol_max_dist        atom_1_max_dist     x_1
atom_1_std_dist     mol_std_dist        mol_min_dist
atom_1_mean_dist    dist                atom_0_med_dist
atom_0_kur_dist     atom_0_max_dist     nearby_H
mol_mean_dist       y_0                 z_1
atom_1_kur_dist     y_1                 mol_med_dist
---------------------------------------------------------------------------
Returning model to generate fc.
0.1818044329818003
CPU times: user 3min 32s, sys: 2.62 s, total: 3min 35s
Wall time: 1min 11s


# Predicting Constants - Out-of-fold Features

In [14]:
def constant_model(X, feats, constant, verbose=True, y=None, random_state=None):
    """Cross-validate and fit a random forest that predicts one coupling constant.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain every column in `feats` plus 'molecule_name', which is used
        only as the grouping key for cross-validation.
    feats : list of str
        Columns of `X` fed to the model.
    constant : str
        Name of the target column; must exist in the notebook-level `df`.
    verbose : bool
        When true, print the target name and the feature list.
    y : array-like, optional
        Target values aligned row-for-row with `X`. When omitted, the target is
        read from `df[constant]` for the rows of `X`, so the function no longer
        depends on whatever the notebook-level `y` currently holds.
    random_state : int or None
        Seed passed to the forest and the splitter; None keeps runs unseeded.

    Returns
    -------
    RandomForestRegressor
        Fitted on ALL rows of `X`. The printed score (mean over folds of
        log(MAE)) is only a diagnostic: predictions made with the returned
        model on `X` itself are in-sample, not out-of-fold.
    """
    assert constant in df.columns
    if y is None:
        # Same frame -> take the column directly; otherwise align on X's index.
        y = df[constant] if X.index.equals(df.index) else df.loc[X.index, constant]
    # scikit-learn expects a 1-D target; a single-column frame would trigger a
    # conversion warning on every fit.
    y = np.ravel(y)
    # Report only the model inputs (X also carries the 'molecule_name' group column).
    feat_cols = X[feats].columns
    if verbose:
        print('TARGET:', constant)
        print(f'FEATURES (n = {len(feat_cols)}):')
        plist(feat_cols)
        print(75*'-')
    print(f'Returning model to generate {constant}.')
    rf = RandomForestRegressor(n_jobs=-1, random_state=random_state)
    # Group by molecule so all rows of one molecule land on the same side of a split.
    gss = GroupShuffleSplit(n_splits=3, random_state=random_state)
    scores = cross_val_score(rf, X[feats], y, scoring='neg_mean_absolute_error',
                             groups=X['molecule_name'], cv=gss, n_jobs=-1)
    rf.fit(X[feats], y)
    # Scores are negated MAE; report the mean of log(MAE) across folds.
    print(np.mean(np.log(-scores)))
    return rf

In [None]:
%%time
X_final = featurize(pd.read_csv('../input/test.csv'), df_structures, categoricals)
X_final = reduce_mem_usage(X_final)
gc.collect()

In [None]:
# Inputs for the meta-feature models; 'molecule_name' rides along as the CV group key.
X = df[feats + ['molecule_name']]

In [None]:
target = 'fc'
y = df.loc[:, [target]]
%time fc_model = constant_model(X, feats, target)
df[f"{target}_pred"] = fc_model.predict(X[feats])
X_final[f"{target}_pred"] = fc_model.predict(X_final[feats])
importances(feats, fc_model)
del fc_model
gc.collect()

In [None]:
target = 'sd'
y = df.loc[:, [target]]
%time sd_model = constant_model(X, feats, target)
# save_model(sd_model, 'sd_model')
# Generate Meta-Feature on Train Data
df[f"{target}_pred"] = sd_model.predict(X[feats])
# Generate Meta-Feature on Submission Data
X_final[f"{target}_pred"] = sd_model.predict(X_final[feats])
del sd_model
gc.collect()

In [None]:
target = 'pso'
y = df.loc[:, [target]]
%time pso_model = constant_model(X, feats, target)
save_model(pso_model, 'pso_model')
# Generate Meta-Feature on Train Data
df[f"{target}_pred"] = pso_model.predict(X[feats])
# Generate Meta-Feature on Submission Data
X_final[f"{target}_pred"] = pso_model.predict(X_final[feats])
del pso_model
gc.collect()

In [None]:
target = 'dso'
y = df.loc[:, [target]]
%time dso_model = constant_model(X, feats, target)
# save_model(dso_model, 'dso_model')
# Generate Meta-Feature on Train Data
df[f"{target}_pred"] = dso_model.predict(X[feats])
# Generate Meta-Feature on Submission Data
X_final[f"{target}_pred"] = dso_model.predict(X_final[feats])
del dso_model
gc.collect()

In [None]:
# True constant columns, and the matching predicted-constant column for each.
meta_orig = ['fc', 'sd', 'pso', 'dso']
meta_feats = [f'{name}_pred' for name in meta_orig]

In [None]:
# Release the meta-model inputs and target; later cells build their own X_ / y_.
del X, y
gc.collect()

## Train Model (Scalar Coupling Constant) using meta-feats to check for overfitting

In [None]:
%%time
# Split Target
final_targ = ['scalar_coupling_constant']
# Reduce Memory
df = reduce_mem_usage(df)
X_ = df.loc[:, feats + meta_feats + ['molecule_name']]
y_ = df.loc[:, final_targ]
gc.collect()

In [None]:
# Hold out whole molecules: rows are grouped by 'molecule_name', and only the first
# of the splitter's three splits is used.
gss = GroupShuffleSplit(n_splits=3)
train_idxs, test_idxs = next(gss.split(X_['molecule_name'], y_, groups=X_['molecule_name']))
X_train, y_train = X_.iloc[train_idxs], y_.iloc[train_idxs]
X_test, y_test = X_.iloc[test_idxs], y_.iloc[test_idxs]
model = RandomForestRegressor(n_jobs=-1)

In [None]:
%%time
# Fit on the training molecules using base features plus the predicted constants.
model.fit(X_train[feats + meta_feats], y_train)
# Score the held-out molecules.
y_pred = model.predict(X_test[feats + meta_feats])
# Report log(MAE) on the hold-out split.
print(np.log(mean_absolute_error(y_test, y_pred)))

In [None]:
%%time
# Retrain on all data to save for later analysis
# NOTE(review): nothing in this cell actually saves the model, and `y_pred` below holds
# in-sample predictions that a later cell reassigns.
model.fit(X_[feats + meta_feats], y_)
y_pred = model.predict(X_[feats + meta_feats])

# Final Model

In [None]:
# Final model: train on the true constants (meta_orig) in place of the predicted ones.
y_ = df[final_targ]
X_ = df[feats + meta_orig]
# X_ already holds exactly these columns in this order, so it is passed as-is.
model.fit(X_, y_)

In [None]:
# The model above was fitted on columns named after the true constants (meta_orig), so
# the predicted-constant columns are renamed to those names before predicting. Feeding
# the *_pred column names directly makes current scikit-learn reject the frame because
# the feature names differ from the ones seen during fit.
y_pred = model.predict(
    df.loc[:, feats + meta_feats].rename(columns=dict(zip(meta_feats, meta_orig)))
)
print(np.log(mean_absolute_error(y_, y_pred))) # See performance on meta features

In [None]:
# Release the training/validation frames. Names are removed with globals().pop so the
# cell does not raise NameError for a name that was never created (`df_orig` is not
# assigned in any visible cell) and can be re-run after the names are already gone.
for _name in ('X_', 'y_', 'X_train', 'X_test', 'y_train', 'y_test', 'df_orig'):
    globals().pop(_name, None)
gc.collect()

# Submission

In [None]:
%%time
X_final_ids = X_final.loc[:, 'id']
X_final = X_final.drop(['id'], axis=1)

# Prepare for Predictions
X_final = X_final.loc[:, feats + meta_feats]
X_final = reduce_mem_usage(X_final)
gc.collect()

In [None]:
# Replace NaN/inf with finite numbers and rebuild the frame (this also resets the index).
X_final = pd.DataFrame(np.nan_to_num(X_final[feats + meta_feats]), columns=feats + meta_feats)
print(X_final.columns)
# The final model was fitted on columns named after the true constants (meta_orig).
# Rename the predicted-constant columns to match, otherwise current scikit-learn
# rejects the frame because its feature names differ from those seen during fit.
pred = model.predict(X_final.rename(columns=dict(zip(meta_feats, meta_orig))))

In [None]:
# Pair ids and predictions by position. pd.concat(axis=1) aligns on index labels, so if
# the id Series carried anything other than a 0..n-1 index it would be matched against
# the wrong predictions (or NaN) without any error.
final_submission = pd.DataFrame({
    'id': np.asarray(X_final_ids),
    'scalar_coupling_constant': pred,
})

In [None]:
from IPython.display import HTML
import base64

def create_download_link(df, title = "Download CSV file", filename = "submission.csv"):
    """Return an IPython HTML anchor that downloads `df` as a CSV file.

    The frame is serialised with a header and without the index, base64-encoded,
    and embedded in the link as a `data:text/csv` URI; `title` is the link text
    and `filename` the suggested download name.
    """
    csv_bytes = df.to_csv(header=True, index=False).encode()
    payload = base64.b64encode(csv_bytes).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(filename=filename, title=title, payload=payload))

# NOTE(review): the commented-out call below refers to `final_submission_xgb`, which is
# not defined in any visible cell.
# create_download_link(final_submission_xgb)
# Write the submission (id, scalar_coupling_constant) to the working directory.
final_submission.to_csv('submission.csv', header=True, index=False)