# Market Data Only baseline
<a href='https://www.kaggle.com/christofhenkel/market-data-nn-baseline'>Based On _</a>

This is a fit on market data only (no news data is used), and it shows relatively good results.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from kaggle.competitions import twosigmanews
import time

# Create the competition environment and load the training data. Only the
# first element of the returned pair is kept; the second is discarded because
# this notebook works on market data only.
env = twosigmanews.make_env()
market_train, _ = env.get_training_data()

In [None]:
# Column groups fed to the network: the categorical id column(s) and the
# numerical market features.
cat_cols = ['assetCode']
num_cols = [
    'volume',
    'close',
    'open',
    'returnsClosePrevRaw1',
    'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1',
    'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10',
    'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10',
    'returnsOpenPrevMktres10',
]

# Random 75/25 split of the row labels into training and validation sets,
# with a fixed seed so the split is reproducible.
train_indices, val_indices = train_test_split(
    market_train.index.values,
    test_size=0.25,
    random_state=23,
)

## Handling Numerical, Categorical variables

In [None]:
## Handling categorical variables
def encode(encoder, x):
    """Return the integer code for ``x``, or the "unknown" code if unseen.

    Args:
        encoder: Dict mapping a category value to its integer code.
        x: Category value to look up.

    Returns:
        ``encoder[x]`` when ``x`` is a key of ``encoder``; otherwise
        ``len(encoder)``, which serves as the code for values that are not
        in the mapping.
    """
    return encoder.get(x, len(encoder))

# One value -> integer code mapping per categorical column.
encoders = [{} for cat in cat_cols]


for i, cat in enumerate(cat_cols):
    print('encoding %s ...' % cat, end=' ')
    # Codes are assigned from the training rows only, so a value that occurs
    # only outside the training rows is mapped by encode() to the extra
    # "unknown" code.
    train_values = market_train.loc[train_indices, cat].astype(str).unique()
    encoders[i] = {value: code for code, value in enumerate(train_values)}
    encoder = encoders[i]
    market_train[cat] = market_train[cat].astype(str).apply(lambda x: encode(encoder, x))
    print('Done')

embed_sizes = [len(encoder) + 1 for encoder in encoders] #+1 for possible unknown assets

## Handling numerical variables
market_train[num_cols] = market_train[num_cols].fillna(0)
print('scaling numerical columns')
scaler = StandardScaler()
# Fit the scaler on the training rows only so that validation statistics do
# not leak into the scaling, then apply the same transform to every row.
scaler.fit(market_train.loc[train_indices, num_cols])
market_train[num_cols] = scaler.transform(market_train[num_cols])

## Define NN Architecture

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, BatchNormalization
from keras.losses import binary_crossentropy, mse
from keras.optimizers import SGD, Adam, Adagrad, RMSprop

def build_model():
    """Build and compile the binary classifier used in this notebook.

    The network has one embedding branch per column in ``cat_cols`` and a
    dense branch for the ``num_cols`` features. Both branches are
    concatenated and reduced to a single sigmoid output, trained with binary
    cross-entropy.

    Returns:
        A compiled Keras ``Model`` whose inputs are named after the
        categorical columns, followed by the numerical input named ``'num'``.
    """
    categorical_inputs = [Input(shape=[1], name=cat) for cat in cat_cols]

    # One 10-dimensional embedding per categorical column; embed_sizes already
    # contains the extra slot for unknown values.
    flat_embeddings = [
        Flatten()(Embedding(embed_sizes[i], 10)(cat_input))
        for i, cat_input in enumerate(categorical_inputs)
    ]
    # Use every categorical branch, not just the first one. A single branch is
    # passed through directly because Concatenate may reject a one-element
    # list (older Keras versions require at least two inputs).
    if len(flat_embeddings) == 1:
        categorical_logits = flat_embeddings[0]
    else:
        categorical_logits = Concatenate()(flat_embeddings)
    categorical_logits = Dense(32, activation='relu')(categorical_logits)

    # The input width follows num_cols instead of a hard-coded feature count.
    numerical_inputs = Input(shape=(len(num_cols),), name='num')
    numerical_logits = BatchNormalization()(numerical_inputs)

    numerical_logits = Dense(128, activation='relu')(numerical_logits)
    numerical_logits = Dense(64, activation='relu')(numerical_logits)

    logits = Concatenate()([numerical_logits, categorical_logits])
    logits = Dense(64, activation='relu')(logits)
    out = Dense(1, activation='sigmoid')(logits)

    model = Model(inputs=categorical_inputs + [numerical_inputs], outputs=out)
    model.compile(optimizer=SGD(lr = 0.001), loss=binary_crossentropy)
    return model

# Print the layer-by-layer summary of a freshly built model; this instance is
# not kept.
build_model().summary()

In [None]:
def get_input(market_train, indices):
    """Assemble model inputs, labels and scoring columns for the given rows.

    Args:
        market_train: DataFrame containing the columns listed in ``num_cols``
            and ``cat_cols`` plus 'returnsOpenNextMktres10', 'universe' and
            a datetime 'time' column.
        indices: Row labels to select with ``.loc``.

    Returns:
        Tuple ``(X, y, r, u, d)`` where
        ``X`` is a dict of arrays keyed by model input name ('num' plus one
        ``(n, 1)`` array per categorical column),
        ``y`` is a boolean array that is True where
        'returnsOpenNextMktres10' is >= 0,
        ``r`` is the array of 'returnsOpenNextMktres10' values,
        ``u`` is the 'universe' column and
        ``d`` is the date part of the 'time' column.
    """
    X = {'num': market_train.loc[indices, num_cols].values}
    for cat in cat_cols:
        # Select only this column (as a one-column frame, so the array keeps
        # its (n, 1) shape) rather than every categorical column at once.
        X[cat] = market_train.loc[indices, [cat]].values
    y = (market_train.loc[indices, 'returnsOpenNextMktres10'] >= 0).values
    r = market_train.loc[indices, 'returnsOpenNextMktres10'].values
    u = market_train.loc[indices, 'universe']
    d = market_train.loc[indices, 'time'].dt.date
    return X, y, r, u, d

# Build the training and validation sets. X and y feed the model; r, u and d
# are kept only to calculate the scoring metric.
X_train,y_train,r_train,u_train,d_train = get_input(market_train, train_indices)
X_valid,y_valid,r_valid,u_valid,d_valid = get_input(market_train, val_indices)

## Train NN model

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Checkpoint only the best epoch to 'model.hdf5' and stop early once the
# monitored quantity has not improved for 5 epochs (both callbacks use their
# default monitor).
check_point = ModelCheckpoint('model.hdf5', save_best_only=True, verbose=True)
early_stoping = EarlyStopping(verbose=True, patience=5)

model = build_model()
model.fit(
    X_train,
    y_train.astype(int),
    epochs=25,
    validation_data=(X_valid, y_valid.astype(int)),
    callbacks=[early_stoping, check_point],
    verbose=True,
)

## Evaluation of Validation Set

In [None]:
# Reload the checkpointed weights, then rescale the sigmoid output from
# [0, 1] to [-1, 1]; this is the confidence that will be used as submission.
model.load_weights('model.hdf5')
confidence_valid = 2 * model.predict(X_valid)[:, 0] - 1

# A positive confidence counts as predicting a non-negative return.
print(accuracy_score(y_valid, confidence_valid > 0))

plt.hist(confidence_valid, bins='auto')
plt.title("predicted confidence")
plt.show()

In [None]:
# Metric used for the final score: sum confidence * return * universe per
# day, then divide the mean of the daily sums by their standard deviation.
r_valid = r_valid.clip(-1, 1)  # clip outliers in the returns; their origin is unclear
x_t_i = confidence_valid * r_valid * u_valid
data = {'day': d_valid, 'x_t_i': x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day')['x_t_i'].sum().values
mean = x_t.mean()
std = x_t.std()
score_valid = mean / std
print(score_valid)

## Prediction

In [None]:
days = env.get_prediction_days()

n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
# Per-day confidences are collected in a list and joined once after the loop;
# concatenating onto a growing array every day would copy it each time.
daily_confidences = []
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days += 1
    print(n_days, end=' ')

    t = time.time()

    # Same preprocessing as for training: encode each categorical column with
    # its own fitted encoder (looked up explicitly instead of through loop
    # variables left over from an earlier cell), fill missing numerical values
    # with 0 and apply the already-fitted scaler.
    X_test = {}
    for cat_idx, cat_col in enumerate(cat_cols):
        encoder = encoders[cat_idx]
        encoded_col = cat_col + '_encoded'
        market_obs_df[encoded_col] = market_obs_df[cat_col].astype(str).apply(
            lambda x, encoder=encoder: encode(encoder, x))
        X_test[cat_col] = market_obs_df[encoded_col].values

    market_obs_df[num_cols] = market_obs_df[num_cols].fillna(0)
    market_obs_df[num_cols] = scaler.transform(market_obs_df[num_cols])
    X_num_test = market_obs_df[num_cols].values
    X_test['num'] = X_num_test

    prep_time += time.time() - t

    t = time.time()
    # Rescale the sigmoid output from [0, 1] to the [-1, 1] confidence range.
    market_prediction = model.predict(X_test)[:,0]*2 -1
    daily_confidences.append(market_prediction)
    prediction_time += time.time() - t

    t = time.time()
    preds = pd.DataFrame({'assetCode':market_obs_df['assetCode'],'confidence':market_prediction})
    # insert predictions to template; assets without a prediction get 0
    predictions_template_df = (
        predictions_template_df
        .merge(preds, how='left')
        .drop('confidenceValue', axis=1)
        .fillna(0)
        .rename(columns={'confidence':'confidenceValue'})
    )
    env.predict(predictions_template_df)
    packaging_time += time.time() - t

# Start from an empty float array so the result has the same dtype as before
# and is still a valid (empty) array when there were no prediction days.
predicted_confidences = np.concatenate([np.array([])] + daily_confidences)

env.write_submission_file()
total = prep_time + prediction_time + packaging_time
print(f'Preparing Data: {prep_time:.2f}s')
print(f'Making Predictions: {prediction_time:.2f}s')
print(f'Packing: {packaging_time:.2f}s')
print(f'Total: {total:.2f}s')

In [None]:
# Distribution of the submitted confidences as a sanity check: it should look
# like the validation histogram plotted above.
plt.hist(predicted_confidences, bins='auto')
plt.title("predicted confidence")
plt.show()

In [None]:
# Next: LSTM, correct train/test split (making sure it is time-ordered)
# ... refer to the original kernel (comments)