# Stock Market Prediction - Starter Kernel
### Created by Magichanics


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error # wouldn't recommend since we're not being evaluated on MSE
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datetime import datetime
import gc

### Importing Dataframes

In [2]:
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()
print('Done!')

Loading the data... This could take a minute.
Done!
Done!


In [3]:
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
# decide the length of the dataset
# note if sampling, len(news_train_df) > len(market_train_df)
market_train_df = market_train_df.tail(100_000)
news_train_df = news_train_df.tail(300_000)

In [None]:
market_train_df.head()

In [None]:
news_train_df.head()

### Information on the Training Data
* There are no Unknown ``assetName`` in ``news_train_df``, but there are 24 479 rows with Unknown as the ``assetName`` in ``market_train_df``. Merging by ``assetCode`` leaves out Unknown rows, which could be problematic.
* ``Volume`` has the highest correlation in terms of ``returnsOpenNextMktres10``
* Merging by just ``assetCodes`` greatly increases the dataframe (with just 100k rows, it has turned into 10 million rows), although merging by ``assetCodes`` and ``time`` greatly decrease the original dataframe.

### Joining Market & News Data

In [4]:
#tempcode
# decide the length of the dataset
# note if sampling, len(news_train_df) > len(market_train_df)
(market_train_df, news_train_df) = env.get_training_data()
market_train_df = market_train_df.tail(100_000)
news_train_df = news_train_df.tail(300_000)

In [5]:
def join_market_news(market_df, news_df):
    
    # ERROR: REDUCES LENGTH OF DATAFRAME HERE
    print('market_df :' + str(market_df.shape))
    
    # Fix asset codes (str -> list)
    news_df['assetCodes'] = news_df['assetCodes'].str.findall(f"'([\w\./]+)'")

    # Expand assetCodes
    assetCodes_expanded = list(chain(*news_df['assetCodes']))
    assetCodes_index = news_df.index.repeat( news_df['assetCodes'].apply(len) )

    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})
    
    # create dataframe based on groupby
    news_df_expanded = pd.merge(df_assetCodes, news_df, left_on='level_0', right_index=True, suffixes=(['','_old']))
    
#     # create dict of values based on columns
#     str_cols = ['time', 'assetCode', 'headline']
#     news_data_dict = {}
#     news_groupby = news_df_expanded.groupby(['time', 'assetCode'])
#     for f in str_cols:
#         news_data_dict[f] = news_groupby[f].apply(lambda x: x)
#         print('finished ' + f)
    
#     news_df_grouped = pd.DataFrame(news_data_dict)
#     #return news_df_grouped
    
    #X = market_train_df.join(news_df_grouped, on=['time', 'assetCode'])
    #X = pd.merge(news_df_grouped, market_df, on=['time', 'assetCode'])
    print('merging data')
    
    # store unknown rows in a separate dataframe.
    market_unknown = market_df[market_df.assetName == 'Unknown']
    
    # merge both dataframes
    X = pd.merge(news_df_expanded, market_df, on=['assetCode'])
    X = pd.concat([X, market_unknown], axis=1, sort=False)
    
    # cleanup time
    del market_df
    del news_df
    gc.collect()
    
    print('X shape :' + str(X.shape))
    
    return X

X_train = join_market_news(market_train_df, news_train_df)

# headlines                                          wordCount
# 0     ((2016-12-29 11:30:02+00:00, GPT), [GRAMERCY P...           ((2016-12-29 11:30:02+00:00, GPT), [15])
# 1     ((2016-12-29 11:30:02+00:00, GPT.N), [GRAMERCY...         ((2016-12-29 11:30:02+00:00, GPT.N), [15])
# 2     ((2016-12-29 11:30:07+00:00, GPT), [GRAMERCY P...           ((2016-12-29 11:30:07+00:00, GPT), [10])
# 3     ((2016-12-29 11:30:07+00:00, GPT.N), [GRAMERCY...         ((2016-12-29 11:30:07+00:00, GPT.N), [10])
# 4     ((2016-12-29 11:30:49+00:00, HYG.TO), [HYDROGE...         ((2016-12-29 11:30:49+00:00, HYG.TO), [9])
# 5     ((2016-12-29 11:30:49+00:00, HYGS.O), [HYDROGE...         ((2016-12-29 11:30:49+00:00, HYGS.O), [9])
# 6     ((2016-12-29 11:30:49+00:00, HYGS.OQ), [HYDROG...        ((2016-12-29 11:30:49+00:00, HYGS.OQ), [9])

market_df :(100000, 16)
merging data
X shape :(10925926, 68)


### Aggregations

In [None]:
def aggregations(df):
    
    # get columns
    lst_of_lst_of_cols = [[f for f in df.columns if 'return' in f and f != 'returnsOpenNextMktres10'],
                              [f for f in df.columns if 'novelty' in f],
                              [f for f in df.columns if 'volume' in f],
                              [f for f in df.columns if 'sentiment' in f]]
    
    agg_suffixes = ['aggReturn ', 'aggNovelty ', 'aggVolume ', 'aggSentiment ']
    
    for i_cols in range(len(lst_of_lst_of_cols)):
        
        # setup map of aggregations
        agg_dict = {}
        for col in lst_of_lst_of_cols[i_cols]:
            agg_dict[col] = ['mean', 'var', 'sum', 'std', 'max', 'min']
            
        # preform aggregations
        df_agg = df.groupby('sourceId').agg(agg_dict)
        df_agg.columns = pd.Index(['agg_' + e[0] + "_" + e[1].lower() for e in df_agg.columns.tolist()])
        
        # clean up dataframe and merge
        df = df.join(df_agg, how = 'left', on = 'sourceId', lsuffix = agg_suffixes[i_cols])
        del df_agg
        gc.collect()
        
        print('finished ' + agg_suffixes[i_cols])
        
    print('New dataframe shape: ' + str(df.shape))
    return df

X_train = aggregations(X_train)

### Text Processing with MultinomialNB (Headlines)

In [None]:
def get_headline(headlines_df):
    
    # get headlines as list
    headlines_lst = []
    for row in range(0,len(headlines_df.index)):
        headlines_lst.append(headlines_df.iloc[row])

    # split headlines to separate words
    basicvectorizer = CountVectorizer()
    headlines_vectorized = basicvectorizer.fit_transform(headlines_lst)
    
    print(headlines_vectorized.shape)
    return headlines_vectorized, basicvectorizer

def headline_mapping(target, headlines_vectored, headline_vectorizer):
    
    # round target values if using logistic regression
    target = round(target,0)
    
    # get model (testing with model that isn't )
    from sklearn.naive_bayes import MultinomialNB
    headline_model = MultinomialNB()
    headline_model = headline_model.fit(headlines_vectored, target)
    
    # get coefficients
    basicwords = headline_vectorizer.get_feature_names()
    basiccoeffs = headline_model.coef_.tolist()[0]
    coeff_df = pd.DataFrame({'Word' : basicwords, 
                            'Coefficient' : basiccoeffs})
    
    # convert dataframe to dictionary of coefficients
    coefficient_dict = dict(zip(coeff_df.Word, coeff_df.Coefficient))

    return coefficient_dict, coeff_df['Coefficient'].mean()

def get_coeff_col(headlines_df, coeff_dict, coeff_default):
    
    def get_coeff(word_lst):
        
        # iter through every word
        coeff_sum = 0
        for word in word_lst:
            if word in coeff_dict:
                coeff_sum += coeff_dict[word]
            else:
                coeff_sum += coeff_default
        
        # get average coefficient
        return coeff_sum / len(word_lst)
        
    basicvectorizer = CountVectorizer()
    
    # loop through every item
    headlines_coeff_lst = []
    for row in range(0,len(headlines_df.index)):
        headlines_coeff_lst.append(get_coeff(str(headlines_df.iloc[row]).split(' ')))
    
    return pd.Series(headlines_coeff_lst)

coefficient_dict, coefficient_default = headline_mapping(X_train['returnsOpenNextMktres10'],
                                            *get_headline(X_train['headline']))

X_train['headline_coeff_mean'] = get_coeff_col(X_train['headline'], coefficient_dict, coefficient_default)

In [None]:
X_train['headline_coeff_mean'].head()

### Merge Dataframes

### Get Time Features

In [None]:
# ripped from my previous kernel, NYC Taxi Fare

# first get dates
def split_time(df):
    
    # convert to string (will find a more efficient way to do this without converting to string)
    df['time'] = df['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
    
    # split date_time into categories
    df['time_day'] = df['time'].str.slice(8,10)
    df['time_month'] = df['time'].str.slice(5,7)
    df['time_year'] = df['time'].str.slice(0,4)
    df['time_hour'] = df['time'].str.slice(11,13)
    
    # source: https://www.kaggle.com/nicapotato/taxi-rides-time-analysis-and-oof-lgbm
    df['temp_time'] = df['time'].str.replace(" UTC", "")
    df['temp_time'] = pd.to_datetime(df['temp_time'], format='%Y-%m-%d %H:%M:%S')
    
    df['time_day_of_year'] = df.temp_time.dt.dayofyear
    df['time_week_of_year'] = df.temp_time.dt.weekofyear
    df["time_weekday"] = df.temp_time.dt.weekday
    df["time_quarter"] = df.temp_time.dt.quarter
    
    del df['temp_time']
    gc.collect()
    
    # convert to non-object columns
    time_feats = ['time_day', 'time_month', 'time_year', 'time_hour']
    df[time_feats] = df[time_feats].apply(pd.to_numeric)
    
    # determine whether the day is set on a holiday
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2007-01-01', end='2018-09-27').to_pydatetime()
    df['on_holiday'] = df['time'].str.slice(0,10).apply(lambda x: 1 if x in holidays else 0)
    
    # note to self: encode time later on
    
    return df

X_train = split_time(X_train)

In [None]:
def get_misc_features(X_df):
    
    # Adding daily difference
    new_col = X_df["close"] - X_df["open"]
    X_df.insert(loc=6, column="daily_diff", value=new_col)
    X_df['close_to_open'] =  np.abs(X_df['close'] / X_df['open'])

### Label Encoding

In [None]:
def group_delete(df, del_features):
    for f in del_features:
        del df[f]

def encoding(df, categorical_feats):
    df_encoded = pd.get_dummies(df[categorical_feats])
    df.join(df_encoded, how = 'right')
    group_delete(df, categorical_feats)
    print('new shape: ' + str(df.shape))
    return df

group_delete(X_train, ['time', 'sourceId', 'headline', 'assetCodes'])
X_train = encoding(X_train, [f for f in X_train.columns if X_train[f].dtype == 'object'])

### Cleaning Data

In [None]:
# will use a more efficient way later on
fcol = [c for c in X_train.columns if c not in ['sourceTimestamp', 'firstCreated', 'returnsOpenNextMktres10', 
                                                'assetName_x', 'universe', 'provider', 'subjects',
                                               'audiences', 'marketCommentary', 'assetName_y', 'sourceTimestamp'
                                               'firstCreated']] #<---- added


### Using LGBM for Modelling

In [None]:
# prepare x dataframes for modelling/prediction
def convert_to_X(market_obs_df, news_obs_df):
    
    # this repeats everything that was done previously
    X_test = join_market_news(market_obs_df, news_obs_df)
    X_test = aggregations(X_test)
    X_test['headline_coeff_mean'] = get_coeff_col(X_test['headline'], coefficient_dict, coefficient_default)
    X_test = split_time(X_test)
    group_delete(X_test, ['time', 'sourceId', 'headline', 'assetCodes'])
    X_test = encoding(X_test, ['assetCode', 'headlineTag'])
    X_test = X_test[[f for f in X_test.columns if 'int' in str(X_test[f].dtype) or 'float' in str(X_test[f].dtype)]]
    
    return X_test

In [None]:
y_train = X_train['returnsOpenNextMktres10']
del X_train['returnsOpenNextMktres10']

In [None]:
import lightgbm as lgb
import time

# set model and parameters
params = {'learning_rate': 0.02, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'seed': 2018}

In [None]:
#split data (for cross validation)
x1, x2, y1, y2 = train_test_split(X_train[fcol], 
                                  y_train, 
                                  test_size=0.25, 
                                  random_state=99)

In [None]:
# train
t = time.time()
print('Fitting Up')

# cross validation
lgb_model = lgb.train(params, 
                        lgb.Dataset(x1, label=y1), 
                        5000, 
                        lgb.Dataset(x2, label=y2), 
                        verbose_eval=100, 
                        early_stopping_rounds=200)

# lgb_model = lgb.train(params, 
#                         lgb.Dataset(X_train[fcol], label=y_train),
#                         verbose_eval=100)

print(f'Done, time = {time.time() - t}')

In [None]:
def make_predictions(market_obs_df, news_obs_df):
    
    print('market_obs_df shape: ' + str(market_obs_df.shape))
    print('news_obs_df shape: ' + str(news_obs_df.shape))
    
    # predict using given model
    X_test = convert_to_X(market_obs_df, news_obs_df)
    print('Created X_test with features: ' + str(X_test[fcol].columns))
    
    # there is an error:
    # ValueError: Length of values does not match length of index
    prediction_values = np.clip(lgb_model.predict(X_test[fcol]), -1, 1)
    
    print('finished predictions')

    return prediction_values

### Making Predictions

Now the difference between the training and test data would be these two columns,  ``['returnsOpenNextMktres10', 'universe']``. We will be trying to predict ``returnsOpenNextMktres10`` and using that as the ``confidenceValue``.

In [None]:
for (market_obs_df, news_obs_df, predictions_template_df) in env.get_prediction_days(): # Looping over days from start of 2017 to 2019-07-15
    
    print('predictions_template_df shape: ' + str(predictions_template_df.shape))
    # make predictions
    predictions_template_df['confidenceValue'] = make_predictions(market_obs_df, news_obs_df)
    
    # save predictions
    env.predict(predictions_template_df)


### Export Submission

In [None]:
env.write_submission_file() # Writes your submission file
print('finished!')

### Sources:
* [Getting Started - DJ Sterling](https://www.kaggle.com/dster/two-sigma-news-official-getting-started-kernel)
* [Bare bones script - William Cukierski](https://www.kaggle.com/wcukierski/bare-bones-script-loop-with-comments)
* [Extra data - aaron7sun](https://www.kaggle.com/aaron7sun/stocknews)
* [Text Preprocessing - Andrew Gelé](https://www.kaggle.com/ndrewgele/omg-nlp-with-the-djia-and-reddit)
* [fake news - SamLloyd](https://www.kaggle.com/sjdlloyd/it-s-fake-news-this-is-top-of-the-leaderboard)
* [a simple model - Bruno G. do Amaral](https://www.kaggle.com/bguberfain/a-simple-model-using-the-market-data)
* [LGBM Model - the1owl](https://www.kaggle.com/the1owl/my-two-sigma-cents-only)