# Hedge Fund X
This notebook tackles Hedge Fund X's competition: the Financial Modeling challenge.
It is used to evaluate models and tune their parameters to find the most suitable model.
To explore the data set, see the companion notebook.

## Import library & Initialize data

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, ShuffleSplit
from sklearn import metrics
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.externals import joblib

from xgboost import XGBClassifier
import datetime



In [2]:
# Load the training and test CSV files from the competition input folder.
df = pd.read_csv("../input/hedge_fund_x/train.csv")
df_test = pd.read_csv("../input/hedge_fund_x/test.csv")
# Show the first rows of the training frame as the cell output.
df.head(5)

Unnamed: 0,data_id,period,c1,c2,c3,c4,c5,c6,c7,c8,...,c80,c81,c82,c83,c84,c85,c86,c87,c88,target
0,2,train1,0.65557,-2.2e-05,-0.000539,-0.001075,0.0,0.0,0.21339,0.0,...,-0.023358,-0.017041,0.0,0.060697,0.0,0.0,0.0,-0.000202,-0.14022,1
1,3,train1,1.64643,-0.000292,-0.008367,0.009497,0.0,0.0,0.0,0.0,...,-0.059429,-0.009109,0.0,0.021645,0.0,0.0,0.0,-0.004382,0.455767,0
2,5,train1,-0.74301,0.004642,-0.000647,-0.00329,0.0,0.0,0.0,0.0,...,0.001796,-0.000104,0.0,-0.024718,0.0,0.0,0.219566,0.072711,1.15558,0
3,7,train1,0.02977,-0.006343,-0.000635,-0.002516,0.0,0.0,0.160313,0.0,...,-0.005501,0.045308,0.0,-0.148852,0.0,0.0,0.0,-0.101181,-0.954553,0
4,10,train1,-0.660243,0.012591,-0.002098,-0.022264,0.0,0.0,0.0,0.0,...,0.029034,-0.005847,0.0,-0.007073,0.0,0.0,0.0,-0.004842,0.436002,0


## Evaluation utility functions

In [3]:
def print_val_score(scores, label):
    """Print the mean and standard deviation of one cross-validation score.

    Parameters
    ----------
    scores : dict
        Mapping from score name to an array of per-fold values (the callers
        in this notebook pass the result of ``cross_validate``).
    label : str
        Key of ``scores`` to summarise, e.g. ``'test_acc'``.
    """
    values = scores[label]
    # A single-argument print(...) behaves the same on Python 2 and Python 3.
    print("{}: {:.2f} (+/- {:.2f})".format(
        label, values.mean(), values.std()))

## Evaluate period prediction

### With all parameters

In [8]:
# NOTE(review): this re-defines the helper already declared in the utility
# cell earlier in the notebook; keep the two in sync or drop one of them.
def print_val_score(scores, label):
    """Print ``label`` with the mean and standard deviation of its scores.

    ``scores`` maps score names to arrays of per-fold values and ``label``
    selects which entry to summarise.
    """
    fold_values = scores[label]
    # A single-argument print(...) behaves the same on Python 2 and Python 3.
    print("{}: {:.2f} (+/- {:.2f})".format(label, fold_values.mean(),
                                           fold_values.std()))


def evaluate(est, train_df, excluded_cols=['period', 'target']):
    """Cross-validate ``est`` on predicting the ``period`` column.

    Parameters
    ----------
    est : estimator
        Classifier passed to ``cross_validate``; the ``neg_log_loss`` scorer
        requires it to expose ``predict_proba``.
    train_df : pandas.DataFrame
        Frame holding the feature columns and a ``period`` column.
    excluded_cols : list of str
        Columns left out of the feature matrix. The default list is only
        read, never mutated.

    Prints mean and standard deviation of train/test accuracy and negated
    log loss over 10 folds. Returns nothing.
    """
    selected_cols = [
        col for col in train_df.columns if col not in excluded_cols
    ]
    X_train = train_df[selected_cols].values
    y_train = train_df['period'].values
    scoring = {'acc': 'accuracy', 'log_loss': 'neg_log_loss'}
    scores = cross_validate(
        estimator=est,
        X=X_train,
        y=y_train,
        cv=10,
        scoring=scoring,
        # Requested explicitly: the 'train_*' keys read below are absent when
        # train scores are not computed, which is the default in newer
        # scikit-learn releases.
        return_train_score=True,
        verbose=True)
    for label in ('train_acc', 'test_acc', 'train_log_loss', 'test_log_loss'):
        print_val_score(scores, label)

In [5]:
# XGBoost model for the period-prediction experiment: it is passed to
# evaluate(), which uses the 'period' column as the label.
clf_xgb_period = XGBClassifier(
    max_depth=7,
    n_estimators=150,
    reg_lambda=100,
    objective='multi:softmax',
    nthread=10)

In [6]:
# Take the first 1000 rows of each of the 14 consecutive 40000-row blocks
# (the printed shape below confirms 14 * 1000 rows).
train_df = pd.concat([df[x * 40000:x * 40000 + 1000] for x in range(0, 14)])
# Single-argument print(...) works on both Python 2 and Python 3.
print(train_df.shape)

(14000, 91)


In [10]:
# Drop the row identifier and both label columns so that only the remaining
# columns are used as features for the period classifier.
excluded_cols = ['data_id', 'period', 'target']
evaluate(clf_xgb_period, train_df, excluded_cols)

train_acc: 0.97 (+/- 0.00)
test_acc: 0.86 (+/- 0.01)
train_log_loss: -0.29 (+/- 0.00)
test_log_loss: -0.62 (+/- 0.02)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 14.6min finished


In [8]:
def evaluate_period(est, train_df, excluded_cols=['period', 'target']):
    """Predict with an already fitted estimator and return the raw results.

    Parameters
    ----------
    est : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    train_df : pandas.DataFrame
        Frame with the feature columns and a ``target`` column.
    excluded_cols : list of str
        Columns left out of the feature matrix (read only, never mutated).

    Returns
    -------
    tuple
        ``(y_pred, y_pred_proba, y_test)``: the predicted labels, the second
        column of ``predict_proba``, and the values of the ``target`` column.
    """
    selected_cols = [
        col for col in train_df.columns if col not in excluded_cols
    ]
    X_test = train_df[selected_cols].values
    y_test = train_df['target'].values
    y_pred = est.predict(X_test)
    # Use the estimator that was passed in; the previous code read the
    # notebook-global `clf_xgb` here and ignored `est`.
    y_pred_proba = est.predict_proba(X_test)[:, 1]
    return y_pred, y_pred_proba, y_test

## Evaluate target prediction with 1 model

### One hot encoding

In [4]:
# Encode the 'period' strings as integer labels, then expand those labels
# into a dense one-hot matrix with one column per distinct period.
oh = OneHotEncoder()
le = LabelEncoder()
period_values = df['period']
le_period = le.fit_transform(period_values)
# Single-argument print(...) works on both Python 2 and Python 3.
print(le_period[90000:90005])
oh_period = oh.fit_transform(le_period.reshape(-1, 1)).toarray()
print(oh_period)

[7 7 7 7 7]
[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [5]:
# Append the one-hot period columns (named 0, 1, ... by DataFrame's default
# column labels) to the right of the original training frame.
oh_period_df = pd.DataFrame(oh_period)
enc_df = pd.concat([df, oh_period_df], axis=1)
# Single-argument print(...) works on both Python 2 and Python 3.
print(enc_df.head(5))

   data_id  period        c1        c2        c3        c4   c5   c6  \
0        2  train1  0.655570 -0.000022 -0.000539 -0.001075  0.0  0.0   
1        3  train1  1.646430 -0.000292 -0.008367  0.009497  0.0  0.0   
2        5  train1 -0.743010  0.004642 -0.000647 -0.003290  0.0  0.0   
3        7  train1  0.029770 -0.006343 -0.000635 -0.002516  0.0  0.0   
4       10  train1 -0.660243  0.012591 -0.002098 -0.022264  0.0  0.0   

         c7   c8 ...     4    5    6    7    8    9   10   11   12   13  
0  0.213390  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.000000  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.000000  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.160313  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.000000  0.0 ...   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 105 columns]


### With Period prediction with all features

In [6]:
def evaluate_target_prediction(est, train_df, excluded_cols = ['period', 'target']):
    """Report 10-fold cross-validated scores for predicting ``target``.

    Every column of ``train_df`` not listed in ``excluded_cols`` is used as a
    feature. Train/test accuracy and negated log loss are printed through
    ``print_val_score``; nothing is returned.
    """
    feature_names = [name for name in train_df.columns
                     if name not in excluded_cols]
    features = train_df[feature_names].values
    labels = train_df['target'].values

    cv_scores = cross_validate(
        estimator=est,
        X=features,
        y=labels,
        cv=10,
        scoring={'acc': 'accuracy', 'log_loss': 'neg_log_loss'},
        return_train_score=True)
    for score_name in ('train_acc', 'test_acc',
                       'train_log_loss', 'test_log_loss'):
        print_val_score(cv_scores, score_name)

In [7]:
# Concatenate all 14 consecutive 40000-row slices of the one-hot encoded
# frame (560000 rows in total, as the printed shape shows).
train_df = pd.concat([enc_df[x*40000: x*40000 + 40000] for x in range(0,14)])
# Single-argument print(...) works on both Python 2 and Python 3.
print(train_df.shape)

(560000, 105)


In [8]:
# Single XGBoost model for the target, evaluated on the one-hot encoded frame.
clf_xgb = XGBClassifier(max_depth=7, n_estimators=150, reg_lambda=100)

In [None]:
# The raw 'period' string is excluded; the one-hot period columns added to
# enc_df are not in this list, so they stay in the feature matrix.
excluded_cols = ['data_id', 'period', 'target']
evaluate_target_prediction(clf_xgb, train_df, excluded_cols)

## Evaluate target prediction with 14 different models

### Train 14 target models

In [46]:
# Concatenate all 14 consecutive 40000-row slices of the raw training frame.
train_df = pd.concat([df[x * 40000:x * 40000 + 40000] for x in range(0, 14)])
# Single-argument print(...) works on both Python 2 and Python 3.
print(train_df.shape)

(560000, 91)


In [47]:
# Split the training frame into one sub-frame per distinct period, kept in
# the same order as `periods`.
periods = train_df['period'].unique()
# Single-argument print(...) works on both Python 2 and Python 3.
print(periods)
X_periods = [train_df[train_df['period'] == p] for p in periods]
print(X_periods[1].head(5))

['train1' 'train2' 'train3' 'train4' 'train5' 'train6' 'train7' 'train8'
 'train9' 'train10' 'train11' 'train12' 'train13' 'train14']
       data_id  period        c1        c2        c3        c4        c5  \
40000   104722  train2 -1.451848 -0.013567 -0.004916 -0.009638  0.000000   
40001   104726  train2  0.056674  0.009975 -0.000008 -0.000356  0.000000   
40002   104728  train2  1.178704 -0.002399 -0.063818  0.060751  0.000000   
40003   104733  train2 -1.654064 -0.015497  0.004427 -0.014378  0.000000   
40004   104734  train2  0.347447  0.020259  0.013636  0.000040  0.297327   

           c6   c7   c8   ...         c80       c81  c82       c83  c84  c85  \
40000  0.0000  0.0  0.0   ...   -0.056786  0.018370  0.0  0.000176  0.0  0.0   
40001  0.0000  0.0  0.0   ...   -0.048978  0.084390  0.0 -0.044561  0.0  0.0   
40002  0.4453  0.0  0.0   ...   -0.133619  0.011723  0.0 -0.027450  0.0  0.0   
40003  0.0000  0.0  0.0   ...   -0.196004  0.037729  0.0 -0.006746  0.0  0.0   
40004  0.

In [49]:
def evaluate_target_prediction(est,
                               train_df,
                               excluded_cols=['period', 'target']):
    """Cross-validate ``est`` (10 folds) on the ``target`` column.

    Features are all columns of ``train_df`` that are not named in
    ``excluded_cols``. The mean and spread of train/test accuracy and
    negated log loss are printed via ``print_val_score``.
    """
    kept = [column for column in train_df.columns
            if column not in excluded_cols]
    feature_matrix = train_df[kept].values
    target_values = train_df['target'].values

    metric_names = {'acc': 'accuracy', 'log_loss': 'neg_log_loss'}
    results = cross_validate(estimator=est, X=feature_matrix,
                             y=target_values, cv=10, scoring=metric_names,
                             return_train_score=True)
    for key in ['train_acc', 'test_acc', 'train_log_loss', 'test_log_loss']:
        print_val_score(results, key)


excluded_cols = ['data_id', 'period', 'target']
# One estimator configuration, cross-validated separately on every period.
clf_xgb_1of14 = XGBClassifier(
    max_depth=5, n_estimators=100, reg_lambda=100, min_child_weight=1,
    nthread=10)
# TODO: test on 3 first period
# X_periods was built from `periods` in the same order, so zipping pairs each
# sub-frame with its period name without indexing by position.
for period, period_df in zip(periods, X_periods):
    print("Evaluate for period {}".format(period))
    evaluate_target_prediction(clf_xgb_1of14, period_df, excluded_cols)

Evaluate for period train1
train_acc: 0.80 (+/- 0.00)
test_acc: 0.76 (+/- 0.01)
train_log_loss: -0.55 (+/- 0.00)
test_log_loss: -0.57 (+/- 0.00)
Evaluate for period train2
train_acc: 0.78 (+/- 0.00)
test_acc: 0.73 (+/- 0.01)
train_log_loss: -0.56 (+/- 0.00)
test_log_loss: -0.59 (+/- 0.00)
Evaluate for period train3
train_acc: 0.79 (+/- 0.00)
test_acc: 0.75 (+/- 0.01)
train_log_loss: -0.54 (+/- 0.00)
test_log_loss: -0.56 (+/- 0.00)
Evaluate for period train4
train_acc: 0.80 (+/- 0.00)
test_acc: 0.76 (+/- 0.01)
train_log_loss: -0.53 (+/- 0.00)
test_log_loss: -0.56 (+/- 0.00)
Evaluate for period train5
train_acc: 0.76 (+/- 0.00)
test_acc: 0.73 (+/- 0.01)
train_log_loss: -0.54 (+/- 0.00)
test_log_loss: -0.57 (+/- 0.00)
Evaluate for period train6
train_acc: 0.79 (+/- 0.00)
test_acc: 0.75 (+/- 0.01)
train_log_loss: -0.54 (+/- 0.00)
test_log_loss: -0.56 (+/- 0.00)
Evaluate for period train7
train_acc: 0.79 (+/- 0.00)
test_acc: 0.75 (+/- 0.01)
train_log_loss: -0.53 (+/- 0.00)
test_log_loss: -0

### Xgboost classifier 14 models

#### Define custom classifier

In [303]:
class Xgb14Classifier(BaseEstimator, ClassifierMixin):
    """Target classifier that routes every row to a per-period XGBoost model.

    ``fit`` trains one ``XGBClassifier`` per distinct value of the ``period``
    column and loads a previously pickled period classifier from disk.
    ``predict`` / ``predict_proba`` replace the ``period`` column with the
    period classifier's prediction and then score each row with the model
    trained for that predicted period.

    The input must be a pandas DataFrame (not an array) so the ``period`` and
    ``target`` columns can be read.

    Parameters
    ----------
    excluded_cols : list of str
        Extra columns to leave out of the feature matrix; ``data_id``,
        ``period`` and ``target`` are always excluded.
    verbose : bool
        Print progress messages while training and predicting.
    """

    def __init__(self, excluded_cols=[], verbose=False):
        self._estimator_type = 'classifier'
        self.verbose = verbose
        # Stored as given and only ever read, so the shared default list is
        # never mutated by this class.
        self.excluded_cols = excluded_cols

    def fit(self, X, y):
        """Train one target model per period found in ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``period`` and ``target`` columns plus the features.
        y : array-like
            Accepted so the estimator can be used with cross-validation
            helpers; the labels are actually read from ``X['target']``.

        Returns
        -------
        self
        """
        reserved = list(self.excluded_cols) + ['data_id', 'period', 'target']
        self.selected_cols = [col for col in X.columns if col not in reserved]

        # The period classifier is not retrained here; a pre-trained model is
        # loaded once, before the expensive training loop, so that a missing
        # file fails immediately instead of after the first model is fitted.
        # NOTE(review): joblib.load unpickles arbitrary objects - only load
        # files you produced yourself.
        # NOTE(review): because this model is trained outside fit(), it may
        # have seen rows that are held out during cross-validation - confirm.
        self.clf_xgb_period = joblib.load(
            'clf_xgb14_period_40000_all_param.pkl')

        self.clf_xgb_list = {}
        for period in X['period'].unique():
            X_period = X[X['period'] == period]
            if self.verbose:
                print("Train period {} with {} data".format(
                    period, X_period.shape[0]))
            clf_xgb = XGBClassifier(
                max_depth=5,
                n_estimators=100,
                reg_lambda=100,
                min_child_weight=1,
                nthread=10)
            clf_xgb.fit(X_period[self.selected_cols].values,
                        X_period['target'].values)
            self.clf_xgb_list[period] = clf_xgb

        return self

    def _predict_transform_period(self, X):
        """Return a copy of ``X`` whose ``period`` column is the predicted one.

        The index is reset to 0..n-1 and any existing ``period`` column is
        dropped before the predicted column is appended.
        """
        if self.verbose:
            print("Predict period")
        if not hasattr(self, "clf_xgb_period"):
            raise RuntimeError("Transformer hasn't been trained yet")
        X_period = self.clf_xgb_period.predict(X[self.selected_cols].values)
        X_period_df = pd.DataFrame(X_period, columns=['period'])
        X_drop = X.reset_index(drop=True)
        if 'period' in X_drop.columns:
            X_drop = X_drop.drop('period', axis=1)
        return pd.concat([X_drop, X_period_df], axis=1)

    def _predict_by_period(self, X, method, n_outputs=None):
        """Call ``method`` on each per-period model and restore row order.

        ``method`` is the name of the XGBClassifier method to call
        (``'predict'`` or ``'predict_proba'``). ``n_outputs`` is the number of
        result columns, or ``None`` for a one-dimensional result.

        Raises ``RuntimeError`` when no model has been trained or when more
        periods are predicted than models exist.
        """
        # getattr also covers an estimator on which fit() was never called.
        if not getattr(self, 'clf_xgb_list', None):
            raise RuntimeError("No classifier is trained")
        X = self._predict_transform_period(X)
        periods = X['period'].unique()
        if len(periods) > len(self.clf_xgb_list):
            print(periods)
            raise RuntimeError(
                "Test data has more period then trained classifiers")

        features = X[self.selected_cols].values
        shape = len(X) if n_outputs is None else (len(X), n_outputs)
        rst = np.empty(shape)
        for p in periods:
            if self.verbose:
                print("Predict period {}".format(p))
            # Rows are placed by position, so the result always has one entry
            # per input row, even when 'data_id' values repeat or are absent.
            rows = np.flatnonzero((X['period'] == p).values)
            rst[rows] = getattr(self.clf_xgb_list[p], method)(features[rows])
        return rst

    def predict_proba(self, X, y=None):
        """Return an ``(n_rows, 2)`` array of class probabilities for ``X``."""
        return self._predict_by_period(X, 'predict_proba', n_outputs=2)

    def predict(self, X, y=None):
        """Return a length-``n_rows`` array of predicted labels for ``X``."""
        return self._predict_by_period(X, 'predict')

#### Check the custom classifier

In [270]:
# Small smoke-test sample: the first 100 rows of each 40000-row block, with
# a fresh 0..n-1 index.
train_df = pd.concat([df[x * 40000:x * 40000 + 100]
                      for x in range(0, 14)]).reset_index(drop=True)
# Single-argument print(...) works on both Python 2 and Python 3.
print(train_df.shape)

(1400, 91)


In [277]:
# Fit the per-period classifier on the small sample. The second argument is
# required by the fit signature; the class reads labels from X['target'].
clf_xgb_14 = Xgb14Classifier()
clf_xgb_14.fit(train_df, train_df['target'].values)

Xgb14Classifier(excluded_cols=[], verbose=False)

In [272]:
# Validation sample: rows 1000-1999 of each 40000-row block, which do not
# overlap the first-100-rows training sample used above.
val_df = pd.concat([df[x * 40000 + 1000:x * 40000 + 2000]
                    for x in range(0, 14)]).reset_index(drop=True)
# Single-argument print(...) works on both Python 2 and Python 3.
print(val_df.shape)

(14000, 91)


In [273]:
# Despite the name, y_pred holds class probabilities (two columns per row).
y_pred = clf_xgb_14.predict_proba(val_df)
# Single-argument print(...) works on both Python 2 and Python 3.
print(y_pred.shape)

(14000, 2)


#### Cross-validating the custom classifier

In [327]:
# Full training set (14 blocks of 40000 rows) with a fresh 0..n-1 index.
train_df = pd.concat([df[x * 40000:x * 40000 + 40000]
                      for x in range(0, 14)]).reset_index(drop=True)
# Single-argument print(...) works on both Python 2 and Python 3.
print(train_df.shape)

(560000, 91)


In [None]:
# Cross-validate the per-period classifier on 10 random 70/30 splits.
scoring = {'acc': 'accuracy', 'log_loss': 'neg_log_loss'}
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=3)
clf_xgb_14 = Xgb14Classifier(verbose=True)
scores = cross_validate(
    estimator=clf_xgb_14,
    X=train_df,
    y=train_df['target'].values,
    cv=cv,
    scoring=scoring,
    # Requested explicitly: the 'train_*' keys printed below are absent when
    # train scores are not computed, which is the default in newer
    # scikit-learn releases.
    return_train_score=True)

for label in ('train_acc', 'test_acc', 'train_log_loss', 'test_log_loss'):
    print_val_score(scores, label)

Train period train13 with 27918 data
Train period train6 with 28036 data
Train period train1 with 27966 data
Train period train2 with 28014 data
Train period train12 with 27960 data
Train period train11 with 28035 data
Train period train10 with 27859 data
Train period train4 with 27885 data
Train period train9 with 28054 data
Train period train8 with 28090 data
Train period train5 with 28035 data
Train period train3 with 28062 data
Train period train14 with 27970 data
Train period train7 with 28116 data
Predict period
Predict period train9
Predict period train10
Predict period train2
Predict period train5
Predict period train12
Predict period train11
Predict period train6
Predict period train4
Predict period train8
Predict period train1
Predict period train14
Predict period train13
Predict period train7
Predict period train3
Predict period
Predict period train9
Predict period train10
Predict period train2
Predict period train5
Predict period train12
Predict period train11
Predict perio

In [285]:
# Persist the fitted per-period classifier to disk.
joblib.dump(clf_xgb_14, 'clf_xgb14_1000_all_param.pkl')

['clf_xgb14_1000_all_param.pkl']

### Xgb period transformer to predict period and chain to pipeline
(Not used)

In [238]:
class XgbPeriodClassifier(BaseEstimator, TransformerMixin):
    """Transformer that adds a predicted ``period`` column to a DataFrame.

    ``fit`` trains an XGBoost multi-class model on the ``period`` column of
    the training frame; ``transform`` appends the predicted period to frames
    that do not already carry one.

    Parameters
    ----------
    excluded_cols : list of str
        Extra columns to leave out of the feature matrix; ``data_id``,
        ``period`` and ``target`` are always excluded.
    verbose : bool
        Stored on the instance; not read by the methods of this class.
    """

    def __init__(self, excluded_cols=[], verbose=False):
        self.verbose = verbose
        # Stored as given and only ever read, so the shared default list is
        # never mutated by this class.
        self.excluded_cols = excluded_cols

    def fit(self, X, *_):
        """Train the period model on ``X`` (needs a ``period`` column)."""
        # Feature columns come from the frame being fitted rather than from a
        # notebook-global `train_df`, so the class works on its own.
        reserved = list(self.excluded_cols) + ['data_id', 'period', 'target']
        self.selected_cols = [col for col in X.columns if col not in reserved]

        clf_xgb_period = XGBClassifier(
            max_depth=7,
            n_estimators=150,
            reg_lambda=100,
            objective='multi:softmax',
            nthread=10)
        X_train = X[self.selected_cols].values
        y_train = X['period'].values
        clf_xgb_period.fit(X_train, y_train)
        self.clf_xgb_period = clf_xgb_period
        return self

    def transform(self, X, y=None):
        """Return ``X`` with a ``period`` column.

        Frames that already contain ``period`` are returned unchanged;
        otherwise the predicted period is appended as a new column.

        Raises ``RuntimeError`` if ``fit`` has not been called.
        """
        if not hasattr(self, "clf_xgb_period"):
            raise RuntimeError("Transformer hasn't been trained yet")
        # Don't predict period if it's training data
        if 'period' in X.columns:
            return X
        X_period = self.clf_xgb_period.predict(X[self.selected_cols].values)
        # Reuse X's index so the new column lines up row by row even when X
        # does not have a default 0..n-1 index.
        X_period_df = pd.DataFrame(
            X_period, columns=['period'], index=X.index)
        return pd.concat([X, X_period_df], axis=1)

In [239]:
# Small sample: the first 100 rows of each of the 14 blocks of 40000 rows.
train_df = pd.concat([df[x * 40000:x * 40000 + 100] for x in range(0, 14)])
# Single-argument print(...) works on both Python 2 and Python 3.
print(train_df.shape)

(1400, 91)


In [240]:
# Train the period transformer on the small sample.
xgb_period_transformer = XgbPeriodClassifier(verbose=True)
xgb_period_transformer.fit(train_df)

XgbPeriodClassifier(excluded_cols=[], verbose=True)

In [241]:
# Validation sample with the true 'period' column removed, so that the
# transformer has to predict it.
val_df = pd.concat(
    [df[x * 40000 + 1000:x * 40000 + 2000] for x in range(0, 14)]).drop(
        'period', axis=1).reset_index(drop=True)
# Single-argument print(...) works on both Python 2 and Python 3.
print(val_df.shape)

(14000, 90)


In [219]:
# Add the predicted period and summarise how the predictions are distributed.
val_new_df = xgb_period_transformer.transform(val_df)
# Single-argument print(...) works on both Python 2 and Python 3.
print(val_new_df.shape)
print(val_new_df['period'].describe())

(14000, 1)
(14000, 90)
(14000, 91)
count      14000
unique        14
top       train8
freq        1097
Name: period, dtype: object


## Predict test dataset

In [310]:
# Use the complete training frame for the final model.
train_df = df
# Single-argument print(...) works on both Python 2 and Python 3.
print(train_df.shape)

(560000, 91)


In [311]:
# Final per-period classifier, with progress messages enabled.
clf_xgb_14_final = Xgb14Classifier(verbose=True)

In [312]:
# Train on the full frame; fit() also loads the pickled period classifier
# from disk, so that file must already exist.
clf_xgb_14_final.fit(train_df, train_df['target'])

Train period train1 with 40000 data
Train period train2 with 40000 data
Train period train3 with 40000 data
Train period train4 with 40000 data
Train period train5 with 40000 data
Train period train6 with 40000 data
Train period train7 with 40000 data
Train period train8 with 40000 data
Train period train9 with 40000 data
Train period train10 with 40000 data
Train period train11 with 40000 data
Train period train12 with 40000 data
Train period train13 with 40000 data
Train period train14 with 40000 data


Xgb14Classifier(excluded_cols=[], verbose=True)

In [299]:
# Persist the complete fitted classifier.
joblib.dump(clf_xgb_14_final, 'clf_xgb14_40000_all_param.pkl')

['clf_xgb14_40000_all_param.pkl']

In [302]:
# Persist only the period classifier. This is the same file name that
# Xgb14Classifier.fit loads, so the dump overwrites the file fit() reads.
joblib.dump(clf_xgb_14_final.clf_xgb_period, 'clf_xgb14_period_40000_all_param.pkl')

['clf_xgb14_period_40000_all_param.pkl']

In [313]:
# Class probabilities for every row of the test set (two columns per row).
pred = clf_xgb_14_final.predict_proba(df_test)
# Single-argument print(...) works on both Python 2 and Python 3.
print(pred.shape)

Predict period
Predict period train8
Predict period train2
Predict period train1
Predict period train12
Predict period train14
Predict period train10
Predict period train5
Predict period train6
Predict period train9
Predict period train7
Predict period train3
Predict period train4
Predict period train11
Predict period train13
(361500, 2)


In [322]:
# Sanity check: predict probabilities for the first 10 training rows.
df_test_sample = df[:10]
# Single-argument print(...) works on both Python 2 and Python 3.
print(df_test_sample)
pred_sample = clf_xgb_14_final.predict_proba(df_test_sample)
print(pred_sample)

   data_id  period        c1        c2        c3        c4   c5   c6  \
0        2  train1  0.655570 -0.000022 -0.000539 -0.001075  0.0  0.0   
1        3  train1  1.646430 -0.000292 -0.008367  0.009497  0.0  0.0   
2        5  train1 -0.743010  0.004642 -0.000647 -0.003290  0.0  0.0   
3        7  train1  0.029770 -0.006343 -0.000635 -0.002516  0.0  0.0   
4       10  train1 -0.660243  0.012591 -0.002098 -0.022264  0.0  0.0   
5       13  train1  0.950848  0.011206  0.000272 -0.013013  0.0  0.0   
6       17  train1 -1.160782 -0.008341  0.002040  0.003845  0.0  0.0   
7       22  train1  1.160960 -0.000365  0.005032 -0.007527  0.0  0.0   
8       25  train1 -0.719393  0.014472 -0.000373  0.025222  0.0  0.0   
9       26  train1 -0.838567 -0.022329  0.002009 -0.000680  0.0  0.0   

         c7   c8   ...         c80       c81       c82       c83  c84  c85  \
0  0.213390  0.0   ...   -0.023358 -0.017041  0.000000  0.060697  0.0  0.0   
1  0.000000  0.0   ...   -0.059429 -0.009109  0.000

In [323]:
# Single-argument print(...) works on both Python 2 and Python 3.
print(pred.shape)

(361500, 2)


In [324]:
# predict_proba orders its columns by class label, so with 0/1 targets
# column 1 is the probability that target == 1. The previous code submitted
# column 0, i.e. the probability of the negative class.
# NOTE(review): confirm the competition expects P(target == 1).
predictions = pred[:, 1]
submission = pd.DataFrame({'data_id': df_test['data_id'],
                           'target': predictions})
# The file name carries a timestamp so earlier submissions are not overwritten.
submission.to_csv(
    "submit_{:%Y%m%d-%H%M}.csv".format(datetime.datetime.now()), index=False)