In [1]:
import numpy as np
import pandas as pd
from tools import *
import os
import warnings
warnings.filterwarnings('ignore')
import gc

In [2]:
# Directory holding cached intermediate pickles. The default is the original
# author's local folder; set the KAGGLE_CACHE_PATH environment variable to
# point the notebook at a different location without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, so plain string
# concatenation in cpath() keeps working whatever value is supplied.
CACHE_PATH = os.path.join(
    os.environ.get('KAGGLE_CACHE_PATH', '/Users/angus/Downloads/kaggle/'),
    '',
)


def cpath(x):
    """Return the path of cache entry ``x`` inside ``CACHE_PATH``.

    Parameters
    ----------
    x : str
        File name (relative to the cache directory), e.g. ``'df_feats'``.

    Returns
    -------
    str
        ``CACHE_PATH`` with ``x`` appended.
    """
    return CACHE_PATH + x

In [8]:
# The base table is read from a cached pickle that is not produced in the
# visible cells; the original CSV route was:
#   pd.read_csv('../input/train.csv', index_col='id')
# NOTE(review): unpickling can execute arbitrary code — only load a cache
# file you created yourself.
df_orig = pd.read_pickle(cpath('base_data'))

# Raw input tables consumed by the merge / featurize cells further down.
df_structures = pd.read_csv('../input/structures.csv')
df_scc = pd.read_csv('../input/scalar_coupling_contributions.csv')

In [4]:
# Bring the scalar-coupling contribution columns onto df_orig. A left join
# keeps every df_orig row; rows without a match in df_scc get NaN in the
# added columns.
join_keys = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type']
df_orig = df_orig.merge(df_scc, how='left', on=join_keys)

# The contribution table is no longer needed once merged.
del df_scc

In [5]:
# Mulliken charges are keyed by (molecule_name, atom_index); they are matched
# against atom_index_0 only, so each row receives the charge of its first atom.
df_mull = pd.read_csv('../input/mulliken_charges.csv')
join_keys = ['molecule_name', 'atom_index_0']
df_orig = (
    df_orig
    .merge(
        df_mull,
        how='left',
        left_on=join_keys,
        right_on=['molecule_name', 'atom_index'],
    )
    # The right-hand key duplicates atom_index_0 after the join.
    .drop(columns=['atom_index'])
)
del df_mull

In [6]:
# Per-molecule dipole moments. The columns are renamed so the three
# components land in df_orig as dip_x / dip_y / dip_z.
df_dip = pd.read_csv('../input/dipole_moments.csv')
df_dip.columns = ['molecule_name', 'dip_x', 'dip_y', 'dip_z']

# Left join on the molecule: every row of a molecule gets the same values.
df_orig = df_orig.merge(df_dip, how='left', on=['molecule_name'])
del df_dip

In [7]:
# Per-molecule potential energy, attached to every row of that molecule
# through a left join on molecule_name.
df_pe = pd.read_csv('../input/potential_energy.csv')
df_orig = df_orig.merge(df_pe, how='left', on=['molecule_name'])
del df_pe

In [8]:
# Magnetic shielding tensors are keyed by (molecule_name, atom_index); as with
# the Mulliken charges, only the first atom of each pair (atom_index_0) is
# matched.
df_mag = pd.read_csv('../input/magnetic_shielding_tensors.csv')
join_keys = ['molecule_name', 'atom_index_0']
df_orig = (
    df_orig
    .merge(
        df_mag,
        how='left',
        left_on=join_keys,
        right_on=['molecule_name', 'atom_index'],
    )
    # Drop the right-hand key, which duplicates atom_index_0 after the join.
    .drop(columns=['atom_index'])
)
del df_mag

In [11]:
# Sanity check after the left joins: no cell of the merged frame may be
# missing, i.e. every row found a match in each auxiliary table.
assert not df_orig.isna().values.any()

In [12]:
# Pass both frames through reduce_mem_usage (not defined in this notebook;
# presumably provided by `from tools import *`). Judging by the messages it
# prints, it reports the memory footprint and presumably downcasts column
# dtypes — TODO confirm against tools.py.
df_orig = reduce_mem_usage(df_orig)
df_structures = reduce_mem_usage(df_structures)

Mem. usage decreased to 284.31 Mb (0.0% reduction)
Mem. usage decreased to 51.74 Mb (52.1% reduction)


In [13]:
# `df` becomes a second name for the same DataFrame object as `df_orig`
# (no copy is made), so in-place changes made through one name are visible
# through the other.
df = df_orig
# Force a garbage-collection pass; being the last expression of the cell, the
# integer returned by gc.collect() is what gets displayed.
gc.collect()

503

In [14]:
# Display the column labels after all merges, before feature engineering.
df.columns

Index(['molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'mulliken_charge',
       'dip_x', 'dip_y', 'dip_z', 'potential_energy', 'XX', 'YX', 'ZX', 'XY',
       'YY', 'ZY', 'XZ', 'YZ', 'ZZ'],
      dtype='object')

In [15]:
# Feature engineering through featurize (not defined in this notebook;
# presumably provided by `from tools import *`). %time reports the cost — the
# recorded run took close to six minutes of wall time.
# NOTE(review): `df` is rebound to the result, so re-running this cell alone
# would feed the already-featurized frame back in; rerun from the top instead.
%time df = featurize(df, df_structures)

CPU times: user 7min 13s, sys: 23 s, total: 7min 36s
Wall time: 5min 49s


In [16]:
# Checkpoint the featurized frame in the cache directory, presumably so the
# slow featurize step does not have to be repeated in later sessions.
df.to_pickle(cpath('df_feats'))

In [17]:
# Encode the 'atom_1' and 'type' columns with encode_cols (not defined in this
# notebook; presumably provided by `from tools import *`). The column listing
# recorded further down contains atom_1_* and type_* columns and no plain
# 'type' column, which suggests one-hot encoding — TODO confirm.
%time df = encode_cols(df, ['atom_1', 'type'])

CPU times: user 29.2 s, sys: 3.3 s, total: 32.5 s
Wall time: 19.7 s


In [18]:
# Checkpoint the encoded frame under a separate cache entry so the
# pre-encoding checkpoint ('df_feats') stays available.
df.to_pickle(cpath('df_feats_enc'))

In [19]:
# Display the column labels of the featurized and encoded frame.
df.columns

Index(['molecule_name', 'atom_index_0', 'atom_index_1',
       'scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso', 'mulliken_charge',
       'dip_x', 'dip_y', 'dip_z', 'potential_energy', 'XX', 'YX', 'ZX', 'XY',
       'YY', 'ZY', 'XZ', 'YZ', 'ZZ', 'atom_0', 'x_0', 'y_0', 'z_0', 'x_1',
       'y_1', 'z_1', 'dist', 'n_C', 'n_H', 'n_N', 'mol_min_dist',
       'mol_max_dist', 'mol_mean_dist', 'mol_med_dist', 'mol_kur_dist',
       'mol_std_dist', 'atom_0_min_dist', 'atom_0_max_dist',
       'atom_0_mean_dist', 'atom_0_med_dist', 'atom_0_kur_dist',
       'atom_0_std_dist', 'atom_1_min_dist', 'atom_1_max_dist',
       'atom_1_mean_dist', 'atom_1_med_dist', 'atom_1_kur_dist',
       'atom_1_std_dist', 'nearby_C', 'nearby_H', 'nearby_N', 'atom_1_C',
       'atom_1_H', 'atom_1_N', 'type_1JHC', 'type_1JHN', 'type_2JHC',
       'type_2JHH', 'type_2JHN', 'type_3JHC', 'type_3JHH', 'type_3JHN'],
      dtype='object')

# Submission

In [None]:
%%time
X_final_ids = X_final.loc[:, 'id']
X_final = X_final.drop(['id'], axis=1)

# Prepare for Predictions
X_final = X_final.loc[:, feats + meta_feats]
X_final = reduce_mem_usage(X_final)
gc.collect()

In [None]:
# Column order the model is fed with; evaluated once and reused below.
model_columns = feats + meta_feats

# np.nan_to_num yields a plain array (NaN -> 0, +/-inf -> large finite
# values), so the frame is rebuilt around it with the same column labels and
# a fresh default index.
X_final = pd.DataFrame(np.nan_to_num(X_final[model_columns]), columns=model_columns)
print(X_final.columns)
pred = model.predict(X_final)

In [None]:
# Pair ids and predictions by POSITION (row i of the ids with prediction i).
# The previous pd.concat(..., axis=1) aligned the two pieces on their index
# labels: X_final_ids keeps whatever index X_final had when the ids were set
# aside, while pd.Series(pred) gets a fresh 0..n-1 index. If those differ,
# concat silently produces NaN-padded or shifted rows. Building the frame from
# plain arrays avoids any label alignment and raises a ValueError if the two
# lengths disagree instead of padding with NaN.
final_submission = pd.DataFrame({
    'id': np.asarray(X_final_ids),
    'scalar_coupling_constant': np.asarray(pred),
})

In [None]:
from IPython.display import HTML
import base64

def create_download_link(df, title="Download CSV file", filename="submission.csv"):
    """Build an ``HTML`` anchor that downloads ``df`` as a CSV file.

    The frame is serialised with ``to_csv`` (header kept, index omitted),
    base64-encoded and embedded in a ``data:text/csv`` URI, so the link works
    without writing a file to disk.

    Parameters
    ----------
    df : object exposing ``to_csv(header=..., index=...)``
        Table to offer for download (a pandas DataFrame in this notebook).
    title : str
        Text shown for the link.
    filename : str
        File name suggested to the browser through the ``download`` attribute.

    Returns
    -------
    The result of calling ``HTML`` on the anchor markup.
    """
    # CSV text -> bytes -> base64 bytes -> ASCII string usable inside a URI.
    encoded_csv = base64.b64encode(
        df.to_csv(header=True, index=False).encode()
    ).decode()
    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor_template.format(payload=encoded_csv, title=title, filename=filename))

# Write the submission to the working directory (header row kept, index
# omitted). Alternative for hosted kernels: create_download_link(final_submission)
# renders a download link instead.
# NOTE(review): the call that used to be commented out here referenced
# `final_submission_xgb`, which is not defined in the visible cells.
final_submission.to_csv('submission.csv', header=True, index=False)