[View in Colaboratory](https://colab.research.google.com/github/Hoiy/kaggle-santander-value-prediction-challenge/blob/master/model_magic.ipynb)

In [1]:
import dotenv
import os
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import normalize, minmax_scale
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from tqdm import tqdm

dotenv.load_dotenv('.env')

True

In [2]:
!mkdir prep
!gsutil rsync gs://{os.environ['GCP_BUCKET']}/prep prep

Building synchronization state...
Starting synchronization...
Copying gs://kaggle-195720-santander-value-prediction-challenge/prep/test.csv...
Copying gs://kaggle-195720-santander-value-prediction-challenge/prep/test.csv.gz...
Copying gs://kaggle-195720-santander-value-prediction-challenge/prep/test_log_feats.snappy.parquet...
Copying gs://kaggle-195720-santander-value-prediction-challenge/prep/test_log_stats.snappy.parquet...

==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m -o ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://kaggle-195720-santander-value-prediction-challenge/prep/test_raw_feats.snappy.parquet...
Copying gs://kaggle-195720-santander-value-prediction-challenge/prep/train.csv...
Copying gs://kaggle-195720-santander-value-prediction-challenge/prep/train.csv.gz...
Copying gs://kaggle-195720-santander-value-pred

In [0]:
test_raw_feats = pd.read_parquet('./prep/test_raw_feats.snappy.parquet')
df = test_raw_feats

MAGIC_COLS = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
  '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
  'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
  '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
  'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
  '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
  '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
  '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'
]

In [0]:
def count_vectorize(row_vecs):
  return pd.DataFrame([row_vecs.loc[row].value_counts() for row in row_vecs.index]).fillna(0)

# def cols_count_vectorize(df):
#   return 

# def rows_count_vectorize(df):
#   return cols_count_vectorize(df.T)

def dist(s1, s2):
  return sum(np.abs(s1-s2))

def col_metrics(cols, metric_func):
  return pd.DataFrame([[metric_func(cols[i], cols[j]) for i in cols] for j in cols], columns=cols.columns, index=cols.columns)

def row_metrics(rows, metric_func):
  return col_metrics(rows.T, metric_func)

def cloest_vec(row_vecs, vec, cutoff=0, metric_func=lambda x, y: sum((x-y)**2)):
  return row_vecs.apply(lambda x: metric_func(x, vec), axis=1)

def visualize_col(df, **kwargs):
  col_enc = cols_count_vectorize(df)
  col_enc = col_enc.drop(columns=[filled_value, 0])
  col_enc = col_enc / col_enc.max()
  
  tsne = TSNE(n_components=2, verbose=1)
  tsne_result = pd.DataFrame(tsne.fit_transform(col_enc), index=col_enc.index)
  
  dbscan = DBSCAN(**kwargs)
  dbscan_result = pd.Series(dbscan.fit_predict(tsne_result), index=tsne_result.index)
  
  ax = tsne_result.plot.scatter(x=0, y=1, alpha=0.8, figsize=(15, 10), color=dbscan_result, cmap='tab20')
  for x, y, s in zip(tsne_result[0], tsne_result[1], tsne_result.index):
    ax.annotate(xy=(x,y), s=s) 
  return tsne_result

def pca(row_vecs):
  from sklearn.decomposition import PCA
  
  print('pca...')
  pca = PCA(n_components=32)
  return pd.DataFrame(pca.fit_transform(row_vecs), index=row_vecs.index)
  

def tsne(row_vecs, **kwargs):
  from sklearn.manifold import TSNE
  print('tsne...')
        
  tsne = TSNE(n_components=2, verbose=1, **kwargs)
  return pd.DataFrame(tsne.fit_transform(row_vecs), index=row_vecs.index)

def taxicab_dist(x, y):
  return np.sum(np.abs(x-y))

def shift_dist(x, y):
  return np.sum(x.shift() != y)

def dbscan(row_vecs, **kwargs):
  from sklearn.cluster import DBSCAN
  
  print('dbscan...')
  dbscan = DBSCAN(eps=5, **kwargs)
  return pd.Series(dbscan.fit_predict(row_vecs), index=row_vecs.index)


def plot_cluster(tsne_result, dbscan_result, labels=[]):
  ax = tsne_result.plot.scatter(x=0, y=1, alpha=0.5, figsize=(15, 10), color=dbscan_result, cmap='tab20')
  for l in labels:
    ax.plot(tsne_result.loc[l][0], tsne_result.loc[l][1], 'ro')
    ax.annotate(xy=tsne_result.loc[l], s=l)

    
def search_next_row(rows, row, period=1):
  row = row.shift(period)
  dist = (~rows.eq(row)).sum(axis=1)
  indices = dist[dist==np.abs(period)].index
  if len(indices) == 1:
    return indices[0]
  else:
    return None

  
def find_ordered_index(df, seed_index, max_period=5):
  ordered_index = [seed_index]
  lag = [0]

  while True:
    clean_df = df.drop(ordered_index)
    
    period = 0
    while True:
      period = period + 1
      index = search_next_row(clean_df, df.loc[ordered_index[-1]], period)
      if index or period == max_period:
        break
    
    if not index:
      break
      
    ordered_index.append(index)
    lag.append(period)

    
  while True:
    clean_df = df.drop(ordered_index)
    
    period = 0
    while True:
      period = period - 1
      index = search_next_row(clean_df, df.loc[ordered_index[0]], period)
      if index or period == -max_period:
        break
    
    if not index:
      break
    
    lag = [0] + [-period] + lag[1:]
    ordered_index = [index] + ordered_index
      
  return ordered_index, lag


In [5]:
df2 = df[MAGIC_COLS]
batches = []
lags = []

total = df2.shape[0]

with tqdm(total=total) as pbar:
  while len(df2.index) > 0:
    pbar.update(total - len(df2.index) - pbar.n)    
    batch, lag = find_ordered_index(df2, df2.index[0], max_period=37)
    if len(batches) >= 3:
      batches.append(batch)
      lags.append(lag)
    df2.drop(batch, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
100%|█████████▉| 49341/49342 [9:22:39<00:00,  1.46it/s]


In [0]:
import pickle

with open('test_row_batch.pkl', 'wb') as f:
  pickle.dump([batches, lags], f)
  
!mkdir model
!gsutil cp model/test_row_batch.pkl gs://{os.environ['GCP_BUCKET']}/model/test_row_batch.pkl

In [20]:
sum([len(batch) for batch in batches if len(batch) >= 3])

5390

In [0]:
pd.options.display.float_format = '{:.2f}'.format


def pred(batch, lag):
    batch_df = df.loc[batch][['target']+MAGIC_COLS]
    new_batch = []
    for i in range(len(lag)):
      for j in range(lag[i]-1, 0, -1):
        batch_df.loc[batch[i]+'_expand_%d'%j] = batch_df[MAGIC_COLS].loc[batch[i]].shift(-j).fillna(0.)
        new_batch += [batch[i]+'_expand_%d'%j]
      new_batch += [batch[i]]
        
    batch_df = batch_df.reindex(new_batch)
    batch_df['pred'] = batch_df[MAGIC_COLS[0]].shift(-2).fillna(0.)
    return batch_df.loc[batch][['target', 'pred']+MAGIC_COLS]

  
df['pred'] = pd.read_parquet()
  
for batch, lag in zip(batches, lags):
    res = pred(batch, lag)
    df.loc[res.index, 'pred'] = res['pred']

In [0]:
df['pred']