In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc 
import time

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

%matplotlib inline

In [2]:
# Create the competition environment object. Later cells use `env` to fetch
# the training data, iterate over prediction days, submit predictions and
# write the submission file. The recorded output shows data is loaded here.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

Loading the data... This could take a minute.
Done!


In [3]:
def unstack_news_data(news_train_df):
    """Expand news rows so that every output row refers to a single asset code.

    Each input row carries an ``assetCodes`` string such as
    ``"{'AAPL.O', 'AAPL.OQ'}"``. The row is repeated once per code and the
    code is stored in a new ``assetCode`` column. A ``date`` column holding
    the calendar date of ``time`` is appended.

    Parameters
    ----------
    news_train_df : pandas.DataFrame
        News frame with an integer-convertible index, a datetime ``time``
        column, an ``assetCodes`` string column and the columns listed in
        ``drop_list``. NOTE: the frame is modified in place (columns are
        dropped and a ``new_index`` column is added) to avoid copying it.

    Returns
    -------
    pandas.DataFrame
        One row per (news item, asset code) with the remaining news columns,
        ``assetCode`` and ``date``.
    """
    drop_list = ['sourceTimestamp', 'firstCreated', 'sourceId', 'headline', 'takeSequence',
                 'subjects', 'audiences', 'bodySize', 'marketCommentary']
    news_train_df.drop(drop_list, axis=1, inplace=True)

    assetCodes = []
    indices = []

    # Series.items() is used because Series.iteritems() no longer exists in
    # pandas 2.x; items() is available in older releases as well.
    for row_label, raw in news_train_df['assetCodes'].items():
        # Strip the surrounding braces and the quotes, then split. Codes are
        # separated by ", ", so each piece is stripped: otherwise every code
        # after the first keeps a leading space and never matches a market
        # assetCode in the later merge.
        codes = [code.strip() for code in raw[1:-1].replace("'", "").split(",")]
        assetCodes.extend(codes)
        indices.extend([int(row_label)] * len(codes))

    index_df = pd.DataFrame({'new_index': indices, 'assetCode': assetCodes})
    news_train_df.drop(['assetCodes'], axis=1, inplace=True)
    # Expose the index as a column so the exploded codes can be joined back.
    news_train_df['new_index'] = news_train_df.index.copy()
    news_unstack = index_df.merge(news_train_df, how='left', on='new_index')
    del index_df
    news_unstack.drop(['new_index'], axis=1, inplace=True)
    news_unstack['date'] = news_unstack['time'].dt.date
    return news_unstack

def regroup_news(news_unstack):
    """Average the numeric news columns per asset and calendar day.

    Parameters
    ----------
    news_unstack : pandas.DataFrame
        Frame with ``assetCode`` and ``date`` columns plus news attributes,
        as produced by ``unstack_news_data``.

    Returns
    -------
    pandas.DataFrame
        One row per (``assetCode``, ``date``) pair. Every numeric or boolean
        input column ``c`` appears as ``c_mean``; ``assetCode`` and ``date``
        are regular columns.
    """
    keys = ['assetCode', 'date']
    # Only numeric/boolean columns are averaged. Asking pandas for the mean of
    # text or categorical columns raises a TypeError on pandas 2.x instead of
    # silently skipping them, and the result is later cast to float32 anyway.
    numeric = news_unstack.select_dtypes(include=['number', 'bool'])
    value_cols = [col for col in numeric.columns if col not in keys]

    group_news = news_unstack.groupby(keys)[value_cols].mean()
    group_news.columns = pd.Index(["{}_{}".format(col, 'mean') for col in value_cols])
    group_news.reset_index(inplace=True)
    return group_news

def merge_market_news(market_train_df, group_news):
    """Left-join the per-day news aggregates onto the market rows.

    A ``date`` column (calendar date of ``time``) is written onto
    ``market_train_df`` itself and used, together with ``assetCode``, as the
    join key. Market rows without matching news keep NaN in the news columns.
    """
    join_keys = ['assetCode', 'date']
    # Assigned on the caller's frame, so the input gains a 'date' column.
    market_train_df['date'] = market_train_df['time'].dt.date
    return market_train_df.merge(group_news, on=join_keys, how='left')

def get_features(market_news_df):
    """Return the model input matrix for a merged market/news frame.

    The ``time``, ``date``, ``assetCode`` and ``assetName`` columns are
    removed, the rest is cast to float32, and missing values are replaced by
    the mean of their column. The input frame is left untouched.
    """
    non_feature_cols = ['time', 'date', 'assetCode', 'assetName']
    numeric = market_news_df.drop(columns=non_feature_cols).astype('float32')
    column_means = numeric.mean()
    return numeric.fillna(column_means)

In [11]:
# Fetch the market and news training frames from the competition environment.
market_train_df, news_train_df = env.get_training_data()

In [12]:
# Expand the news frame to one row per asset code and report the wall-clock
# seconds it took. Note: unstack_news_data modifies news_train_df in place.
start = time.time()
news_unstack = unstack_news_data(news_train_df)
elapsed = time.time() - start
print (elapsed)

35.64110493659973


In [13]:
# Collapse the exploded news rows to one row of column means per
# (assetCode, date).
group_news = regroup_news(news_unstack)
# The exploded frame is no longer needed; drop the reference and ask the
# garbage collector to reclaim memory (its return value is the cell output).
del news_unstack
gc.collect()

357

In [14]:
# Left-join the daily news aggregates onto the market rows. This also adds a
# 'date' column to market_train_df itself.
market_news_df = merge_market_news(market_train_df, group_news)
# The aggregates now live inside market_news_df; free the standalone copy
# (the gc.collect() return value is the cell output).
del group_news
gc.collect()

46

In [15]:
# Build the training feature matrix.
# Besides the target and the identifier columns, 'universe' is excluded:
# prediction-time features come from get_features(), which only removes
# time/date/assetCode/assetName, and per the competition data description the
# prediction-day market frames carry neither 'universe' nor the target.
# Keeping 'universe' here would train the model on a column that is missing
# when it is asked to predict.
non_feature_cols = ['returnsOpenNextMktres10', 'universe', 'time', 'date', 'assetCode', 'assetName']
features = market_news_df.drop(non_feature_cols, axis=1)
features = features.astype('float32')
# Replace missing values with the column mean (same treatment as get_features).
features = features.fillna(features.mean())

In [16]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (fixed seed) for validation; the target is the
# 10-day market-residualised next-open return column.
target = market_news_df.returnsOpenNextMktres10
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=123,
)

In [None]:
import lightgbm as lgb

# Binary classifier: "will the target return be positive?". The prediction
# loop later maps the predicted value p to a confidence of 2p - 1.
# Metric names follow LightGBM's naming: 'binary_error' (misclassification
# rate) and 'binary_logloss'; 'accuracy' and 'log_loss' are not LightGBM
# metric names.
params = {'learning_rate': 0.1,
          'max_depth': 10,
          'objective': 'binary',
          'metric': ['binary_error', 'auc', 'binary_logloss'],
          'seed': 123}

# The 'binary' objective is documented to require labels in {0, 1}, so the
# sign of the return is encoded explicitly instead of passing raw returns.
train_set = lgb.Dataset(X_train, label=(y_train > 0).astype(int))
valid_set = lgb.Dataset(X_test, label=(y_test > 0).astype(int), reference=train_set)

start_time = time.time()
# The very same train_set object is listed in valid_sets so LightGBM reports
# it as the training set; early stopping is then driven by the held-out set.
# Early stopping is requested through the callback, which works on both old
# and current LightGBM releases (the early_stopping_rounds keyword of
# lgb.train was removed in LightGBM 4).
lgb_model = lgb.train(params, train_set=train_set,
                      num_boost_round=1000,
                      valid_sets=[train_set, valid_set],
                      callbacks=[lgb.early_stopping(stopping_rounds=100)])
print ('The training time using LightGBM is: {}'.format(time.time()-start_time))

In [None]:
# Iterable of prediction days; the loop in the next cell unpacks each item
# into (market observations, news observations, predictions template).
days = env.get_prediction_days()

In [None]:
# For every prediction day: build features the same way as for training,
# score them, and hand the filled-in template back to the environment.
for market_obs_df, news_obs_df, predictions_template_df in days:
    # News -> one row per asset code -> daily means -> joined onto market rows.
    market_news_df = merge_market_news(
        market_obs_df,
        regroup_news(unstack_news_data(news_obs_df)),
    )
    # Keep only the assets the template asks about.
    wanted = market_news_df.assetCode.isin(predictions_template_df.assetCode)
    obs_df = market_news_df[wanted]

    del market_obs_df, news_obs_df, market_news_df, wanted

    features = get_features(obs_df)
    # Map a model output p to 2p - 1, so 0..1 becomes -1..1.
    preds = lgb_model.predict(features) * 2 - 1
    sub = pd.DataFrame({'assetCode': obs_df['assetCode'], 'confidence': preds})

    # Attach the scores to the template, replace the placeholder column, and
    # give assets without a score a confidence of 0.
    filled = predictions_template_df.merge(sub, how='left')
    filled = filled.drop('confidenceValue', axis=1).fillna(0)
    predictions_template_df = filled.rename(columns={'confidence': 'confidenceValue'})

    env.predict(predictions_template_df)
    del obs_df, predictions_template_df, preds, sub, filled
env.write_submission_file()
