# Data

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV # important!!!!!!!!!!!!!!!!!!!!!!!!!!!!
from sklearn.model_selection import TimeSeriesSplit 
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [2]:
df = pd.read_pickle('data_100.pickle')
items = pd.read_pickle('fdump_100.pickle')
df = df.query('defined_target_vb200_vf2_volatility_VWAP_200 == 1')

In [3]:
feats_dict3 = {'gram':[c for c in df.columns if c.startswith('gram')],
              'pos':[c for c in df.columns if c.startswith('pos')],
              'title_tfidf':[c for c in df.columns if c.startswith('title_tfidf')],
              'source_bow':[c for c in df.columns if c.startswith('source_bow')],
              'Pre_Doc2Vec': [c for c in df.columns if c.startswith('Pre_Doc2Vec')],
              'pol_sub':[c for c in df.columns if c.startswith('polarity') | c.startswith('subje')],
              'tfidf':[c for c in df.columns if c.startswith('tfidf')]}

gram = feats_dict3['gram']
pos = feats_dict3['pos']
title = feats_dict3['title_tfidf']
source = feats_dict3['source_bow']
Doc2Vec = feats_dict3['Pre_Doc2Vec']
pol_sub = feats_dict3['pol_sub']
tfidf = feats_dict3['tfidf']

In [4]:
df.shape #(5193-3500=1693)

#[:3600] [3600:4630][4630:]

(3430, 971)

#Only nlp

## training functions


In [8]:
#-------------------------------------------------------------
def get_data(data, feats_col, target_col):

    X = data[feats_col].apply(np.float32)
   
    X = X.values
    targets = ['target_{}'.format(target_col)]
    targets_dataframe = df[targets]
    targets_dataframe[targets] = np.where(targets_dataframe[targets]!=-1,targets_dataframe[targets],float(2))
    y = targets_dataframe.values
    
    c, r = y.shape
    y = y.reshape(c,)
    
    
    return X, y
  

#-------------------------------------------------------------
# -------------------------------------------------------------
def cv_generator(n_cv):
    
    N = 2758 #4089 [0:3600] [3600:4630] [4630:]
    
    n_train = int(np.floor(N/(1+0.3*n_cv)))
    n_test = int(np.floor(0.3*n_train))
    
    for cv_idx in range(n_cv):
        
        train_idx = [i for i in range(cv_idx*n_test, cv_idx*n_test +n_train)] 
        test_idx = [i for i in range(cv_idx*n_test +n_train, (cv_idx+1)*n_test +n_train)] 
        
        yield (train_idx, test_idx)
        
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.preprocessing import label_binarize
from scipy import interp
def AUC(y_true,y_probas):
    
#     print(y_probas)
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)


    classes = np.unique(y_true)
    probas = y_probas

    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(y_true, probas[:, i],
                                      pos_label=classes[i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    average_score = (roc_auc[1]+roc_auc[2])/2
    return average_score
  
  
  
  
  
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.preprocessing import label_binarize
from scipy import interp

title_fontsize="small"
text_fontsize="small"
curves=('micro', 'macro', 'each_class')
cmap='nipy_spectral'


def plot_AUC(true_y,predict,title):


    plt.figure()
    y_true = true_y
    y_probas = predict
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    if 'micro' not in curves and 'macro' not in curves and \
            'each_class' not in curves:
        raise ValueError('Invalid argument for curves as it '
                         'only takes "micro", "macro", or "each_class"')

    classes = np.unique(y_true)
    probas = y_probas

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(y_true, probas[:, i],
                                      pos_label=classes[i])
        roc_auc[i] = auc(fpr[i], tpr[i])
       

    # Compute micro-average ROC curve and ROC area
    micro_key = 'micro'
    i = 0
    while micro_key in fpr:
        i += 1
        micro_key += str(i)

    y_true = label_binarize(y_true, classes=classes)
    if len(classes) == 2:
        y_true = np.hstack((1 - y_true, y_true))

    fpr[micro_key], tpr[micro_key], _ = roc_curve(y_true.ravel(),
                                                  probas.ravel())
    roc_auc[micro_key] = auc(fpr[micro_key], tpr[micro_key])

    # Compute macro-average ROC curve and ROC area

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[x] for x in range(len(classes))]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(len(classes)):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= len(classes)

    macro_key = 'macro'
    i = 0
    while macro_key in fpr:
        i += 1
        macro_key += str(i)
    fpr[macro_key] = all_fpr
    tpr[macro_key] = mean_tpr
    roc_auc[macro_key] = auc(fpr[macro_key], tpr[macro_key])
   

    title = title
    plt.title(title, fontsize=title_fontsize)

#         if 'each_class' in curves:
    for i in range(len(classes)):
        color = plt.cm.get_cmap(cmap)(float(i) / len(classes))
        plt.plot(fpr[i], tpr[i], lw=2, color=color,
                label='ROC curve of class {0} (area = {1:0.2f})'
                ''.format(classes[i], roc_auc[i]))

#         if 'micro' in curves:
    plt.plot(fpr[micro_key], tpr[micro_key],
            label='micro-average ROC curve '
                  '(area = {0:0.2f})'.format(roc_auc[micro_key]),
            color='deeppink', linestyle=':', linewidth=4)

#         if 'macro' in curves:
    plt.plot(fpr[macro_key], tpr[macro_key],
            label='macro-average ROC curve '
                  '(area = {0:0.2f})'.format(roc_auc[macro_key]),
            color='navy', linestyle=':', linewidth=4)

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=text_fontsize)
    plt.ylabel('True Positive Rate', fontsize=text_fontsize)
    plt.tick_params(labelsize=text_fontsize)
    plt.legend(loc='lower right', fontsize=text_fontsize)
    plt.show()
    
    
#----------------------------------------
def precision(y_true,y_probas):
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)
    classes = np.unique(y_true)
    probas = y_probas
    y_true = label_binarize(y_true, classes=classes)
    
    average_precision = {}
    for i in range(len(classes)):
        average_precision[i] = average_precision_score(y_true[:, i], probas[:, i])
        
    ave_precision = (average_precision[1]+average_precision[2])/2

    return ave_precision
    
def recall(y_true,y_probas):
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)
    classes = np.unique(y_true)
    probas = y_probas
    y_true = label_binarize(y_true, classes=classes)
    
    recall = {}
    for i in range(len(classes)):
        recall[i] = recall_score(y_true[:, i], probas[:, i].round())
        
    ave_recall = (recall[1]+recall[2])/2

    return ave_recall


def accuracy(y_true,y_probas):
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)
    classes = np.unique(y_true)
    probas = y_probas
    y_true = label_binarize(y_true, classes=classes)
    
    acc = {}
    for i in range(len(classes)):
        acc[i] = accuracy_score(y_true[:, i], probas[:, i].round())
        
    ave_acc = (acc[1]+acc[2])/2
    
    return ave_acc

## single nlp

In [2]:
output_dict3={}

feats_dict3 = {'gram':[c for c in df.columns if c.startswith('gram')],
              'pos':[c for c in df.columns if c.startswith('pos')],
              'title_tfidf':[c for c in df.columns if c.startswith('title_tfidf')],
              'source_bow':[c for c in df.columns if c.startswith('source_bow')],
              'Pre_Doc2Vec': [c for c in df.columns if c.startswith('Pre_Doc2Vec')],
              'pol_sub':[c for c in df.columns if c.startswith('polarity') | c.startswith('subje')],
              'tfidf':[c for c in df.columns if c.startswith('tfidf')]}

AUC_score = make_scorer(AUC, greater_is_better=True, needs_proba = True)
scoring = {'AUC': AUC_score}


target_cols = 'vb200_vf2_volatility_VWAP_200'
i = 0
for feats_key, feats_col in feats_dict3.items():                

    i+=1

    X, y = get_data(df, feats_col, target_cols)

    #XGBOOST()

    cv = cv_generator(2)
#     fit_params={"early_stopping_rounds":5, 
#                 "eval_metric" : "mlogloss", 
#                 "eval_set" : [(X[2758:], y[2758:])],
#                 "verbose":5}

    gs = GridSearchCV(XGBClassifier(objective= 'multi:softprob'),
                      param_grid={
                                  'learning_rate':[0.3],
                                 },
                      scoring= scoring,
#                       fit_params = fit_params,
                      cv = cv,
                      n_jobs=10,
                      verbose=1,
                      refit= 'AUC') #


    gs.fit(X, y)
    output_dict3['model_{}'.format(i)] = {'target':target_cols, 'feats':feats_key, 'GridObject':gs}

     
        
        

### result_1

In [8]:
output_3 = pd.DataFrame.from_dict(output_dict3, orient='index')
# train['word_count'] = train['content'].apply(lambda x: len(str(x).split(" ")))
output_3['para']=output_3['GridObject'].apply(lambda x: x.best_params_ )
output_3['score']=output_3['GridObject'].apply(lambda x: x.best_score_ )

output_3


Unnamed: 0,target,feats,GridObject,para,score
model_1,vb200_vf2_volatility_VWAP_200,gram,GridSearchCV(cv=<generator object cv_generator...,{'learning_rate': 0.3},0.495657
model_2,vb200_vf2_volatility_VWAP_200,pos,GridSearchCV(cv=<generator object cv_generator...,{'learning_rate': 0.3},0.496523
model_3,vb200_vf2_volatility_VWAP_200,title_tfidf,GridSearchCV(cv=<generator object cv_generator...,{'learning_rate': 0.3},0.488169
model_4,vb200_vf2_volatility_VWAP_200,source_bow,GridSearchCV(cv=<generator object cv_generator...,{'learning_rate': 0.3},0.501416
model_5,vb200_vf2_volatility_VWAP_200,Pre_Doc2Vec,GridSearchCV(cv=<generator object cv_generator...,{'learning_rate': 0.3},0.491428
model_6,vb200_vf2_volatility_VWAP_200,pol_sub,GridSearchCV(cv=<generator object cv_generator...,{'learning_rate': 0.3},0.513597
model_7,vb200_vf2_volatility_VWAP_200,tfidf,GridSearchCV(cv=<generator object cv_generator...,{'learning_rate': 0.3},0.516747


## hyper-parameter, many nlp

In [3]:
# tfidf+pol_sub+source+pos+gram+Doc2Vec+title
feats_com = [tfidf,
             tfidf+pol_sub,
             tfidf+pol_sub+source,
             tfidf+pol_sub+source+pos+gram,
             tfidf+pol_sub+source+pos+gram+Doc2Vec+title]

output_dict3={}

AUC_score = make_scorer(AUC, greater_is_better=True, needs_proba = True)
_precision_score = make_scorer(precision, greater_is_better=True, needs_proba = True)
_recall_score = make_scorer(recall, greater_is_better=True, needs_proba = True)
_accuracy_score = make_scorer(accuracy, greater_is_better=True, needs_proba = True)

scoring = {'AUC': AUC_score,'precision':_precision_score,'recall':_recall_score,'accuracy':_accuracy_score}

learning_rate = [[1.8],[1.0],[1.8],[2.4],[1.6]]
n_estimators = [[1000],[600],[1150],[1200],[850]]
max_depth = [[4],[7],[4],[4],[3]]
min_child_weight = [[2],[3],[2],[1],[5]]
gamma = [[0],[0],[0.2],[0.1],[0.2]]
subsample = [[1],[1],[0.9],[0.6],[1]]
colsample_bytree = [[1],[1],[1],[0.6],[1]]
reg_alpha = [[0],[1e-5],[0],[0],[0]]

target_cols = 'vb200_vf2_volatility_VWAP_200'
for i in range(5):
#     print(i)
    
    feature = feats_com[i]
    X, y = get_data(df, feature, target_cols)
    cv = cv_generator(1)

    xgb = XGBClassifier(
                         objective= 'multi:softprob',
                        )

#     fit_params={"early_stopping_rounds":5, 
#                 "eval_metric" : "mlogloss", 
#                 "eval_set" : [(X[3370:4150,:], y[3370:4150])],
#                 "verbose":5}
    
    gs = GridSearchCV(xgb,
                      param_grid={
                                  'learning_rate':learning_rate[i],
                                  'n_estimators':n_estimators[i],
                                  'max_depth':max_depth[i],
                                  'min_child_weight':min_child_weight[i],
                                  'gamma':gamma[i],
                                  'subsample':subsample[i],
                                  'colsample_bytree':colsample_bytree[i],
                                  'reg_alpha':reg_alpha[i]
                                 },
                      scoring= scoring,
                      cv = cv,
                      n_jobs=10,
                      verbose=1,
#                       fit_params = fit_params,
                      refit='AUC')


    gs.fit(X[:2758], y[:2758])
    output_dict3['model_{}'.format(i+1)] = {'GridObject':gs}


### results

In [7]:
output_3 = pd.DataFrame.from_dict(output_dict3, orient='index')
# train['word_count'] = train['content'].apply(lambda x: len(str(x).split(" ")))
output_3['para']=output_3['GridObject'].apply(lambda x: x.best_params_ )
output_3['score']=output_3['GridObject'].apply(lambda x: x.best_score_ )
output_3['Accuracy']=output_3['GridObject'].apply(lambda x: x.cv_results_['mean_test_accuracy'][0])
output_3['Recall']=output_3['GridObject'].apply(lambda x: x.cv_results_['mean_test_recall'][0])
output_3['precision']=output_3['GridObject'].apply(lambda x: x.cv_results_['mean_test_precision'][0])
output_3



Unnamed: 0,GridObject,para,score,Accuracy,Recall,precision
model_1,GridSearchCV(cv=<generator object cv_generator...,"{'colsample_bytree': 1, 'gamma': 0, 'learning_...",0.560721,0.583333,0.404167,0.442489
model_2,GridSearchCV(cv=<generator object cv_generator...,"{'colsample_bytree': 1, 'gamma': 0, 'learning_...",0.563082,0.569969,0.428526,0.454649
model_3,GridSearchCV(cv=<generator object cv_generator...,"{'colsample_bytree': 1, 'gamma': 0.2, 'learnin...",0.510453,0.556604,0.390865,0.406705
model_4,GridSearchCV(cv=<generator object cv_generator...,"{'colsample_bytree': 0.6, 'gamma': 0.1, 'learn...",0.543298,0.563679,0.438141,0.417416
model_5,GridSearchCV(cv=<generator object cv_generator...,"{'colsample_bytree': 1, 'gamma': 0.2, 'learnin...",0.528495,0.551887,0.452885,0.427123


In [8]:

models = []
for i in range(5):
    models.append(output_3['GridObject'].iloc[i].best_estimator_)
models

[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=1.8, max_delta_step=0,
        max_depth=4, min_child_weight=2, missing=None, n_estimators=1000,
        n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=1.0, max_delta_step=0,
        max_depth=7, min_child_weight=3, missing=None, n_estimators=600,
        n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
        reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0.2, learning_rate=1.8, max_delta_step=0,
        max_depth=4, min_child_weight=2, missing=None, n_esti

In [38]:
output_3['para'].tolist()

[{'colsample_bytree': 1,
  'gamma': 0,
  'learning_rate': 1.8,
  'max_depth': 4,
  'min_child_weight': 2,
  'n_estimators': 1000,
  'reg_alpha': 0,
  'subsample': 1},
 {'colsample_bytree': 1,
  'gamma': 0,
  'learning_rate': 1.0,
  'max_depth': 7,
  'min_child_weight': 3,
  'n_estimators': 600,
  'reg_alpha': 1e-05,
  'subsample': 1},
 {'colsample_bytree': 1,
  'gamma': 0.2,
  'learning_rate': 1.8,
  'max_depth': 4,
  'min_child_weight': 2,
  'n_estimators': 1150,
  'reg_alpha': 0,
  'subsample': 0.9},
 {'colsample_bytree': 0.6,
  'gamma': 0.1,
  'learning_rate': 2.4,
  'max_depth': 4,
  'min_child_weight': 1,
  'n_estimators': 1200,
  'reg_alpha': 0,
  'subsample': 0.6},
 {'colsample_bytree': 1,
  'gamma': 0.2,
  'learning_rate': 1.6,
  'max_depth': 3,
  'min_child_weight': 5,
  'n_estimators': 850,
  'reg_alpha': 0,
  'subsample': 1}]

### testing result -- plot

In [4]:
for i in range(5):
#     gs = output_3['GridObject'].iloc[i]
#     best = gs.best_estimator_
#   print(best)
    best = models[i]
    feats = feats_com[i]
    X,y = get_data(df, feats, 'vb200_vf2_volatility_VWAP_200')
    best.fit(X[2758],y[:2758],sample_weight = df['sample_weights_vb200_vf2_volatility_VWAP_200'][:2758])
#   print('AUC_{}'.format(i+1), AUC(y[-818:],best.predict_proba(X[-818:])))
    plot_AUC(y[2758:],best.predict_proba(X[2758:]),'model{}'.format(i+1))
#     print(best.predict_proba(X[4151:]))
  
#   gs.cv_results_['mean_train_AUC']

In [0]:
for c in ['target_vb50_vf2_volatility_VWAP_200','target_vb50_vf3_volatility_VWAP_200','target_vb100_vf2_volatility_VWAP_200','target_vb100_vf3_volatility_VWAP_200','target_vb200_vf2_volatility_VWAP_200','target_vb200_vf3_volatility_VWAP_200']:
    print(df[[c, c.replace('target', 'futureReturn')]].groupby(c).describe()[c.replace('target', 'futureReturn')][['count', 'mean']])

                                      count      mean
target_vb50_vf2_volatility_VWAP_200                  
-1.0                                  565.0 -0.003367
 0.0                                 4052.0 -0.000021
 1.0                                  576.0  0.003335
                                      count      mean
target_vb50_vf3_volatility_VWAP_200                  
-1.0                                  244.0 -0.004437
 0.0                                 4719.0 -0.000021
 1.0                                  230.0  0.004473
                                       count      mean
target_vb100_vf2_volatility_VWAP_200                  
-1.0                                  1007.0 -0.003694
 0.0                                  3201.0 -0.000003
 1.0                                   985.0  0.003747
                                       count      mean
target_vb100_vf3_volatility_VWAP_200                  
-1.0                                   539.0 -0.004896
 0.0                

### Dataframe

In [5]:
# gs = output_3['GridObject'].iloc[0]
# best = gs.best_estimator_
best = models[0]
feats = feats_com[0]
X,y = get_data(df, feats, 'vb200_vf2_volatility_VWAP_200')
best.fit(X[:2758],y[:2758],sample_weight = df['sample_weights_vb200_vf2_volatility_VWAP_200'][:2758])
prediction_df = pd.DataFrame(best.predict_proba(X[2758:]),columns = ['idel_1','buy_1','sell_1']).reset_index()

for i in range(1,5):
#     gs = output_3['GridObject'].iloc[i]
#     best = gs.best_estimator_
    best = models[i]
    feats = feats_com[i]
    X,y = get_data(df, feats, 'vb100_vf2_volatility_VWAP_200')
    best.fit(X[:2758],y[:2758],sample_weight = df['sample_weights_vb200_vf2_volatility_VWAP_200'][:2758])
    
    predictions = pd.DataFrame(best.predict_proba(X[2758:]),
                                 columns = ['idel_{}'.format(i+1),'buy_{}'.format(i+1),'sell_{}'.format(i+1)])
    prediction_df = pd.merge(prediction_df,predictions,left_index=True,right_index= True)
    
true = pd.DataFrame(y[2758:],columns = ['true_value'])
prediction_df['true_value'] = true['true_value'] #idle
prediction_df

In [17]:
prediction_df.to_pickle('prediction_df_3.pkl')

### accuracy

In [6]:
auc_score_list = []
accuracy_list = []
for i in range(5):
    gs = output_3['GridObject'].iloc[i]
    best = gs.best_estimator_
#   print(best)
    feats = feats_com[i]
    X,y = get_data(df, feats, 'vb200_vf2_volatility_VWAP_200')
#   print('AUC_{}'.format(i+1), AUC(y[-818:],best.predict_proba(X[-818:])))
    accuracy_list.append(accuracy_score(y[4150:],best.predict(X[4150:])))
    auc_score_list.append(AUC(y[4150:],best.predict_proba(X[4150:])))

In [25]:
print(auc_score_list)
print(accuracy_list)

[0.6776208790885867, 0.6441968294488791, 0.6013745535554913, 0.6208417274768036, 0.6457366367256873]
[0.5052732502396932, 0.46308724832214765, 0.4573346116970278, 0.473633748801534, 0.47555129434324067]


# Contains market

In [7]:
## Model conbinations
from xgboost import plot_importance
market_feats = items['all_feats'][:189]
for i in range(5):
#     gs = output_3['GridObject'].iloc[i]
    model = models[i]
#     feats = feats_com[i]
    X,y = get_data(df,market_feats,'vb200_vf2_volatility_VWAP_200')
    model.fit(X[2758:], y[2758:],sample_weight = df['sample_weights_vb200_vf2_volatility_VWAP_200'][:2758])
    features_name = list(df[market_feats].columns.values)
    feature_importance = list(zip(features_name, model.feature_importances_))
    data = pd.DataFrame(feature_importance,columns=['feature','importance'])
    data.to_csv('./Target_Three/market_importance_model{}.csv'.format(i+1))

                

In [8]:
## pick important market features
# accuracy_list = []
# auc_score_list = []
for i in range(5):
#     gs = output_3['GridObject'].iloc[i]
    model = models[i]
    market_feats = pd.read_csv('./Target_Three/market_importance_model{}.csv'.format(i+1))['feature'].tolist()[:20]
    feats = feats_com[i] + market_feats
    X,y = get_data(df, feats, 'vb200_vf2_volatility_VWAP_200')
    model.fit(X[:2758],y[:2758],sample_weight = df['sample_weights_vb200_vf2_volatility_VWAP_200'][:2758])
    y_true = y[2758:]
    y_probas = model.predict_proba(X[2758:])
    y_predict = model.predict(X[2758:])
    
#     plot_AUC(y_true,y_probas,'model{}'.format(i+1))
#     accuracy_list.append(accuracy_score(y_true,y_predict))
#     auc_score_list.append(AUC(y_true,y_probas))
    
    if i == 0:
        prediction_df = pd.DataFrame(y_probas,
                                     columns = ['idel_{}'.format(i+1),'buy_{}'.format(i+1),'sell_{}'.format(i+1)])
    else:
        predictions =  pd.DataFrame(y_probas,
                                     columns = ['idel_{}'.format(i+1),'buy_{}'.format(i+1),'sell_{}'.format(i+1)])
        prediction_df = pd.merge(prediction_df,predictions,left_index=True,right_index= True)

true = pd.DataFrame(y[2758:],columns = ['true_value'])
prediction_df['true_value'] = true['true_value'] #idle
prediction_df.to_pickle('prediction_df_market3.pkl')

In [9]:
market_feats = items['all_feats'][:189]
for i in range(5):
#     gs = output_3['GridObject'].iloc[i]
    model = models[i]
    feats = feats_com[i] + market_feats
    X,y = get_data(df, feats, 'vb200_vf2_volatility_VWAP_200')
    model.fit(X[:2758],y[:2758],sample_weight = df['sample_weights_vb200_vf2_volatility_VWAP_200'][:2758])
    y_true = y[2758:]
    y_probas = model.predict_proba(X[2758:])
    y_predict = model.predict(X[2758:])
    
#     plot_AUC(y_true,y_probas,'model{}'.format(i+1))
#     accuracy_list.append(accuracy_score(y_true,y_predict))
#     auc_score_list.append(AUC(y_true,y_probas))
    
    if i == 0:
        prediction_df = pd.DataFrame(y_probas,
                                     columns = ['idel_{}'.format(i+1),'buy_{}'.format(i+1),'sell_{}'.format(i+1)])
    else:
        predictions =  pd.DataFrame(y_probas,
                                     columns = ['idel_{}'.format(i+1),'buy_{}'.format(i+1),'sell_{}'.format(i+1)])
        prediction_df = pd.merge(prediction_df,predictions,left_index=True,right_index= True)

true = pd.DataFrame(y[2758:],columns = ['true_value'])
prediction_df['true_value'] = true['true_value'] #idle
prediction_df.to_pickle('prediction_df_all3.pkl')

In [12]:
feats_com = [tfidf,
             tfidf+pol_sub,
             tfidf+pol_sub+source,
             tfidf+pol_sub+source+pos+gram,
             tfidf+pol_sub+source+pos+gram+Doc2Vec+title]

from sklearn.dummy import DummyClassifier



stra = "stratified"
feats = feats_com[1]

X,y = get_data(df, feats, 'vb200_vf2_volatility_VWAP_200')
clf = DummyClassifier(strategy=stra, random_state=0)
clf = clf.fit(X[:2758], y[:2758])
y_probas = clf.predict_proba(X[2758:])
predictions1 =  pd.DataFrame(y_probas,
                            columns = 
                            ['idel_nlp','buy_nlp','sell_nlp'])
true = pd.DataFrame(y[2758:],columns = ['true_value'])
predictions1['true_value'] = true['true_value']
predictions1.to_pickle('./Dummy/prediction_dummy_nlp1.pkl')


feats = feats_com[1] + items['all_feats'][:189]
X,y = get_data(df, feats, 'vb200_vf2_volatility_VWAP_200')
clf = DummyClassifier(strategy=stra, random_state=0)
clf = clf.fit(X[:2758], y[:2758])
y_probas = clf.predict_proba(X[2758:])
predictions2 =  pd.DataFrame(y_probas,
                            columns = 
                            ['idel_all','buy_all','sell_all'])
true = pd.DataFrame(y[2758:],columns = ['true_value'])
predictions2['true_value'] = true['true_value']
predictions2.to_pickle('./Dummy/prediction_dummy_all1.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_array(key, value)
