In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tools import *

import os
import warnings
warnings.filterwarnings('ignore')

import gc
import joblib

from itertools import compress

# Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit, cross_val_score

CACHE_PATH = '/Users/angus/Downloads/kaggle/'
cpath = lambda x: CACHE_PATH + x

In [7]:
from sklearn.feature_selection import SelectFromModel

In [2]:
df = pd.read_pickle(cpath('df_feats_enc'))
df_structures = pd.read_csv('../input/structures.csv');

In [3]:
df = get_portion(df, r=0.25)

In [5]:
feats = [
    'molecule_name', 'atom_index_0', 'atom_index_1', 'scalar_coupling_constant',
#     'fc', 'sd', 'pso', 'dso',
#     'mulliken_charge',
#     'dip_x', 'dip_y', 'dip_z',
#     'potential_energy',
#     'XX', 'YX', 'ZX', 'XY','YY', 'ZY', 'XZ', 'YZ', 'ZZ',
#     'atom_0',
    'x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1',
    'dist',
    'n_C', 'n_H', 'n_N',
    'mol_min_dist', 'mol_max_dist', 'mol_mean_dist', 'mol_med_dist', 'mol_kur_dist', 'mol_std_dist',
    'atom_0_min_dist', 'atom_0_max_dist', 'atom_0_mean_dist', 'atom_0_med_dist', 'atom_0_kur_dist', 'atom_0_std_dist',
    'atom_1_min_dist', 'atom_1_max_dist', 'atom_1_mean_dist', 'atom_1_med_dist', 'atom_1_kur_dist', 'atom_1_std_dist',
    'nearby_C', 'nearby_H', 'nearby_N',
    'type',
    'atom_1',
]
meta_feats = ['fc_pred', 'sd_pred', 'pso_pred', 'dso_pred']
categoricals = ['type', 'atom_1']
info_cols = ['molecule_name', 'atom_index_0', 'atom_index_1', 'scalar_coupling_constant']
feats = list(set(feats) - set(categoricals) - set(info_cols))

# For Each Level-1 Model

In [6]:
%%time
target = 'fc'
X = df.loc[:, feats + ['molecule_name']]
y = df.loc[:, [target]]

fc_model = constant_model(X, y, feats, target)
print(importances(feats, fc_model))

TARGET: fc
FEATURES (n = 32):
mol_max_dist        y_0                 x_1
atom_1_kur_dist     x_0                 atom_0_kur_dist
atom_1_max_dist     atom_1_min_dist     mol_min_dist
mol_std_dist        nearby_H            y_1
z_1                 atom_1_mean_dist    nearby_C
mol_med_dist        nearby_N            dist
n_N                 atom_0_min_dist     atom_1_med_dist
atom_0_mean_dist    n_H                 n_C
atom_1_std_dist     mol_mean_dist       mol_kur_dist
z_0                 atom_0_med_dist     atom_0_std_dist
---------------------------------------------------------------------------
Returning model to generate fc.
0.08722000076402714
                  importance
dist                0.937973
atom_1_min_dist     0.029290
atom_0_min_dist     0.008179
nearby_N            0.007167
nearby_C            0.006579
atom_0_std_dist     0.001321
atom_0_max_dist     0.001117
atom_1_max_dist     0.000952
atom_1_med_dist     0.000945
atom_0_med_dist     0.000650
atom_1_kur_dist     0.0

# Functionalize

In [11]:
def select_features(model, X, y, feats, threshold=0.0001):
    sfm = SelectFromModel(model, threshold=threshold)
    sfm.fit(X[feats], y)
    n_features = sfm.transform(X[feats]).shape[1]
    while 2*n_features > len(feats):
        sfm.threshold += 0.0001
        X_transform = sfm.transform(X[feats])
        n_features = X_transform.shape[1]
        
    print(f'Reduced from {len(feats)} to {n_features}.')
    feats_selected = list(compress(feats, sfm.get_support()))
    print(feats_selected)
    return feats_selected

In [12]:
%%time
rf = RandomForestRegressor(n_jobs=1)
feats_selected = select_features(rf, X, y, feats)

Reduced from 31 to 14.
['atom_1_kur_dist', 'atom_0_kur_dist', 'atom_1_max_dist', 'atom_1_min_dist', 'nearby_C', 'nearby_N', 'dist', 'atom_0_min_dist', 'atom_1_med_dist', 'atom_0_mean_dist', 'n_H', 'atom_0_med_dist', 'atom_0_std_dist', 'atom_0_max_dist']
CPU times: user 5min 21s, sys: 2.72 s, total: 5min 24s
Wall time: 14min 42s


### Assess the performance after feature selection

In [13]:
%%time
target = 'fc'
fc_model_ = constant_model(X[feats_selected + ['molecule_name']], y, feats_selected, target)
print(importances(feats_selected, fc_model_))

TARGET: fc
FEATURES (n = 15):
atom_1_kur_dist     atom_0_kur_dist     atom_1_max_dist
atom_1_min_dist     nearby_C            nearby_N
dist                atom_0_min_dist     atom_1_med_dist
atom_0_mean_dist    n_H                 atom_0_med_dist
---------------------------------------------------------------------------
Returning model to generate fc.
0.03914798675902478
                  importance
dist                0.939144
atom_1_min_dist     0.029687
nearby_C            0.010077
atom_0_min_dist     0.007283
nearby_N            0.003817
atom_0_std_dist     0.001556
atom_1_med_dist     0.001445
atom_0_max_dist     0.001350
atom_1_max_dist     0.001269
atom_1_kur_dist     0.001109
atom_0_med_dist     0.000886
atom_0_kur_dist     0.000863
atom_0_mean_dist    0.000831
n_H                 0.000684
CPU times: user 4min 10s, sys: 2.06 s, total: 4min 12s
Wall time: 1min 26s
