In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import os
import time
from itertools import chain

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb

from kaggle.competitions import twosigmanews


# Competition input files live in the "../input/" directory.
# Listing that directory shows which entries are visible to this kernel.

input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()

(market_train_df, news_train_df) = env.get_training_data()

# A bare tuple in the middle of a cell is never displayed, so print the
# shapes explicitly to make the size of the raw frames visible.
print(market_train_df.shape, news_train_df.shape)

# Trim to a manageable dataset: keep only the last rows of each frame.
market_train_df = market_train_df.tail(3_000_000)
news_train_df = news_train_df.tail(6_000_000)

In [None]:
# Preview the first five market rows.
market_train_df.head(n=5)

In [None]:
# Preview the first five news rows.
news_train_df.head(n=5)

In [None]:
# Blanket aggregation spec: the same four statistics for every news column.
# NOTE: this mapping is replaced by the explicit, hand-picked dict assigned to
# news_cols_agg later in the notebook.
cols = news_train_df.columns.tolist()

news_cols_agg = {column: ['min', 'max', 'mean', 'std'] for column in cols}

In [None]:
# Aggregations applied to each news column when news rows are grouped in
# merge_data(). Key order is kept exactly as listed, because it determines the
# order of the aggregated feature columns. Every key gets its own list object.
news_cols_agg = {
    **{
        column: ['min', 'max', 'mean', 'std']
        for column in (
            'sentimentNegative',
            'sentimentNeutral',
            'sentimentPositive',
            'sentimentWordCount',
            'noveltyCount12H',
            'noveltyCount24H',
            'noveltyCount3D',
            'noveltyCount5D',
            'noveltyCount7D',
            'volumeCounts12H',
            'volumeCounts24H',
            'volumeCounts3D',
            'volumeCounts5D',
            'volumeCounts7D',
        )
    },
    # These two columns use a reduced set of statistics.
    'urgency': ['min', 'count'],
    'takeSequence': ['max'],
    **{
        column: ['min', 'max', 'mean', 'std']
        for column in (
            'bodySize',
            'wordCount',
            'sentenceCount',
            'companyCount',
            'marketCommentary',
            'relevance',
        )
    },
}

# Show the raw assetCodes column (this is the column merge_data() parses).
news_train_df.loc[:, 'assetCodes']

In [None]:
def merge_data(market_train_df, news_train_df, agg_spec=None):
    """Left-join aggregated news features onto the market rows.

    Each news row lists several asset codes in its ``assetCodes`` string. The
    row is repeated once per quoted code, the repeated rows are grouped by
    ``(time, assetCode)`` and aggregated, and the result is joined onto
    ``market_train_df`` using the same two keys.

    Parameters
    ----------
    market_train_df : pandas.DataFrame
        Must contain ``time`` and ``assetCode`` columns (the join keys).
    news_train_df : pandas.DataFrame
        Must contain ``time``, ``assetCodes`` and every column named in
        ``agg_spec``. It is not modified.
    agg_spec : dict, optional
        Mapping ``column -> list of aggregation names`` passed to
        ``groupby().agg()``. Defaults to the notebook-level ``news_cols_agg``.

    Returns
    -------
    pandas.DataFrame
        ``market_train_df`` plus one float32 column per (column, aggregation)
        pair, named ``"<column>_<aggregation>"``.
    """
    if agg_spec is None:
        agg_spec = news_cols_agg

    # Raw string so that \w and \. are regex escapes, not (invalid) Python
    # string escapes. The parsed lists are kept in a local Series instead of
    # being written back into the caller's frame, so calling this function
    # twice on the same news frame works.
    asset_code_lists = news_train_df['assetCodes'].str.findall(r"'([\w\./]+)'")

    # Flatten the per-row lists and remember which news row each code came from.
    assetCodes_expanded = list(chain.from_iterable(asset_code_lists))
    assetCodes_index = news_train_df.index.repeat(asset_code_lists.map(len))
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})

    # Expanded news: one row per (news row, asset code) pair.
    news_cols = ['time'] + sorted(agg_spec.keys())
    news_expanded = pd.merge(df_assetCodes, news_train_df[news_cols], left_on='level_0', right_index=True)

    # Free memory
    del df_assetCodes

    # Aggregate numerical news features per day and asset.
    news_aggregated = news_expanded.groupby(['time', 'assetCode']).agg(agg_spec)

    # Free memory
    del news_expanded
    news_aggregated = news_aggregated.astype(np.float32)

    # Flatten the (column, aggregation) MultiIndex into "column_aggregation".
    news_aggregated.columns = ['_'.join(col).strip() for col in news_aggregated.columns.values]

    return market_train_df.join(news_aggregated, on=['time', 'assetCode'])

In [None]:
def get_returns(market_train_df, news_train_df, lable_encoder=None):
    """Build the feature frame and the clipped training target.

    Parameters
    ----------
    market_train_df : pandas.DataFrame
        Market data; must contain ``returnsOpenNextMktres10``.
    news_train_df : pandas.DataFrame
        News data handed to ``get_x``.
    lable_encoder : tuple, optional
        Existing ``(assetCode encoder, assetName encoder)`` pair. It is
        forwarded to ``get_x``; when ``None`` new encoders are fitted there.

    Returns
    -------
    tuple
        ``(x, y, lable_encoder)`` where ``y`` is ``returnsOpenNextMktres10``
        clipped to ``[-1, 1]`` and ``lable_encoder`` is the pair returned by
        ``get_x``.
    """
    # Forward the caller's encoders; previously the argument was silently dropped.
    x, lable_encoder = get_x(market_train_df, news_train_df, lable_encoder)
    y = market_train_df['returnsOpenNextMktres10'].clip(-1, 1)
    return x, y, lable_encoder

def label_encode(series, min_count):
    """Map each sufficiently frequent value of ``series`` to an integer code.

    Values are ranked by ``value_counts()`` order; only those occurring at
    least ``min_count`` times receive a code (0, 1, 2, ...). Rarer values are
    left out of the returned dict.
    """
    counts = series.value_counts()
    frequent_values = counts[counts >= min_count].index
    return dict(zip(frequent_values, range(len(frequent_values))))

def get_x(market_train_df, news_train_df, label_encoder=None):
    """Build the model feature frame from market and news data.

    Parameters
    ----------
    market_train_df : pandas.DataFrame
        Market rows with ``time``, ``assetCode`` and ``assetName`` columns.
    news_train_df : pandas.DataFrame
        News rows with a ``time`` column plus whatever ``merge_data`` needs.
    label_encoder : tuple, optional
        ``(assetCode encoder, assetName encoder)`` dicts. When ``None`` the
        encoders are fitted on the merged data with ``label_encode``.

    Returns
    -------
    tuple
        ``(x, (le_assetCode, le_assetName))``. ``x`` has integer-encoded
        ``assetCode``/``assetName`` (``-1`` for values missing from the
        encoder), ``dayofweek`` and ``month`` columns, and no ``time``,
        ``universe`` or ``returnsOpenNextMktres10`` columns.

    Notes
    -----
    The ``time`` columns of BOTH input frames are overwritten in place, so
    calling this twice on the same frames shifts the news dates twice.
    """
    # Split date into before and after 22h (the time used in train data)
    # E.g: 2007-03-07 23:26:39+00:00 -> 2007-03-08 00:00:00+00:00 (next day)
    #      2009-02-25 21:00:50+00:00 -> 2009-02-25 00:00:00+00:00 (current day)
    news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22, 'h')).dt.ceil('1D')

    market_train_df['time'] = market_train_df['time'].dt.floor('1D')

    x = merge_data(market_train_df, news_train_df)

    if label_encoder is None:
        le_assetCode = label_encode(x['assetCode'], min_count=10)
        le_assetName = label_encode(x['assetName'], min_count=5)
    else:
        le_assetCode, le_assetName = label_encoder

    # Values absent from the encoder map to NaN and become -1.
    x['assetCode'] = x['assetCode'].map(le_assetCode).fillna(-1).astype(int)
    x['assetName'] = x['assetName'].map(le_assetName).fillna(-1).astype(int)

    # The target and the universe flag are dropped when present.
    # errors='ignore' skips missing columns without hiding unrelated failures
    # the way a bare ``except: pass`` did.
    x.drop(columns=['returnsOpenNextMktres10', 'universe'], errors='ignore', inplace=True)

    x['dayofweek'], x['month'] = x.time.dt.dayofweek, x.time.dt.month
    x.drop(columns='time', inplace=True)

    # Fix some mixed-type columns
    for column in ['marketCommentary_min', 'marketCommentary_max']:
        x[column] = x[column].astype(float)

    # Return the label encoders as a tuple
    return x, (le_assetCode, le_assetName)

In [None]:
# Build the feature matrix, the clipped target and the fitted label encoders.
X, y, lable_encoder = get_returns(market_train_df=market_train_df, news_train_df=news_train_df)
(X.shape, y.shape)

In [None]:
# Keep the two market columns that are still needed for validation before the
# raw frames are released.
# NOTE: rebinding `time` here shadows the `time` module imported in the first
# cell; the name is kept because the split cell below reads it.
universe = market_train_df.loc[:, 'universe']
time = market_train_df.loc[:, 'time']

# Free memory
del market_train_df
del news_train_df

print(X.shape, y.shape)

X.tail(n=5)

In [None]:
# Positional 80/20 split: the first 80 percent of rows are used for training,
# the remaining rows for validation.
training_set = int(0.8 * X.shape[0])

X_train, X_valid = X.iloc[:training_set], X.iloc[training_set:]
y_train, y_valid = y.iloc[:training_set], y.iloc[training_set:]

# Validation only keeps rows whose universe flag is positive.
univ = universe.iloc[training_set:] > 0
valid_time = time.iloc[training_set:][univ]
X_valid, y_valid = X_valid[univ], y_valid[univ]
del univ

# Create the LightGBM datasets
train_cols = list(X.columns)
categorical_cols = []

train_dataset = lgb.Dataset(
    X_train.values,
    y_train,
    feature_name=train_cols,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)
valid_dataset = lgb.Dataset(
    X_valid.values,
    y_valid,
    feature_name=train_cols,
    categorical_feature=categorical_cols,
    free_raw_data=False,
)

# Stash one integer group id per validation row (one id per distinct time
# value) on the dataset so that the custom metric can read it back.
valid_dataset.params = {'extra_time': valid_time.factorize()[0]}

#  Build Model

In [None]:
# LightGBM training parameters. `metric` is the string 'None', so the only
# evaluation signal during training is the custom feval passed to lgb.train.
lgb_params = {
    'objective': 'regression_l1',
    'learning_rate': 0.1,
    'num_leaves': 127,
    'max_depth': -1,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'feature_fraction': 0.5,
    'lambda_l1': 0.0,
    'lambda_l2': 1.0,
    'seed': 71,
    'metric': 'None',
}

def sig_score(predictions, valid_dataset):
    """Custom scoring metric for ``lgb.train(feval=...)``.

    Multiplies predictions by labels element-wise, sums the products within
    each ``extra_time`` group, and returns the mean of those group sums
    divided by their standard deviation (pandas default, ``ddof=1``). With
    fewer than two groups the standard deviation, and so the score, is NaN.

    Parameters
    ----------
    predictions : array-like
        Model predictions, one per validation row.
    valid_dataset : object
        Dataset used for validation. It must provide ``get_label()`` and a
        ``params['extra_time']`` array holding one group id per row.

    Returns
    -------
    tuple
        ``('sigma_score', score, True)``; the last item marks the metric as
        higher-is-better.
    """
    df_time = np.asarray(valid_dataset.params['extra_time'])
    # get_label() may return a plain numpy array, which has no .groupby, or a
    # pandas Series carrying its own index. Convert to arrays and wrap the
    # product in a fresh Series so grouping is always positional.
    labels = np.asarray(valid_dataset.get_label())
    results = pd.Series(np.asarray(predictions) * labels)
    results_sum = results.groupby(df_time).sum()
    score = results_sum.mean() / results_sum.std()
    return 'sigma_score', score, True

# Train with early stopping on the validation set; the per-round values of the
# custom metric are collected in evals_result.
evals_result = {}
model = lgb.train(
    lgb_params,
    train_dataset,
    num_boost_round=1000,
    valid_sets=(valid_dataset,),
    valid_names=('valid',),
    feval=sig_score,
    evals_result=evals_result,
    early_stopping_rounds=100,
    verbose_eval=25,
)

# One row per boosting round, one column per recorded metric.
df_result = pd.DataFrame(evals_result['valid'])

# Plot the Results

### It's learning rather quickly and then plateaus

In [None]:
# Plot the validation metric per boosting round and mark the best round.
plot = df_result.plot(figsize=(12, 8))
plot.scatter(df_result['sigma_score'].idxmax(), df_result['sigma_score'].max(), marker='+', color='red')
plot.set_xlabel('boosting round (0-based)')
plot.set_ylabel('sigma_score')
# Render the figure; print(plot) only wrote the Axes repr to the output.
plt.show()

In [None]:
# idxmax() is a zero-based row position, so add one to get a round count.
num_boost_round = df_result['sigma_score'].idxmax() + 1
valid_score = df_result['sigma_score'].max()

print(lgb_params)
print(f'Best score was {valid_score:.5f} on round {num_boost_round}')

In [None]:
# Feature importance side by side: split counts on the left, gain on the right.
fig, ax = plt.subplots(1, 2, figsize=(14, 14))
for axis, importance_type in zip(ax, ('split', 'gain')):
    lgb.plot_importance(model, ax=axis, importance_type=importance_type)
fig.tight_layout()

# Release the validation model; a fresh one is trained in the next cell.
del model

In [None]:
# Retrain for exactly the number of rounds selected on the validation set.
# NOTE(review): train_dataset is the 80 percent training split built above, not
# all of X; the full-data Dataset below is still commented out.
# train_dataset = lgb.Dataset(X, y, feature_name=train_cols, categorical_feature=categorical_cols)

model = lgb.train(params=lgb_params, train_set=train_dataset, num_boost_round=num_boost_round)

# Use the model for predictions
def make_predictions(predictions_template_df, market_obs_df, news_obs_df, label_encoder):
    """Fill ``confidenceValue`` of the prediction template in place.

    Features are built with ``get_x`` from the given market and news frames
    using ``label_encoder`` (the ``(assetCode, assetName)`` encoder pair), the
    notebook-level ``model`` predicts on them, and the predictions are clipped
    to ``[-1, 1]``. Nothing is returned.
    """
    # Use the encoder passed in by the caller rather than a global that
    # happens to have a similar name.
    x = get_x(market_obs_df, news_obs_df, label_encoder)[0]
    predictions_template_df['confidenceValue'] = np.clip(model.predict(x), -1, 1)

    
days = env.get_prediction_days()

# For each prediction day: fill in the template, then hand it back to the
# environment.
for market_obs_df, news_obs_df, predictions_template_df in days:
    make_predictions(predictions_template_df, market_obs_df, news_obs_df, lable_encoder)
    env.predict(predictions_template_df)

# Write the submission file once every day has been predicted.
env.write_submission_file()