# LGBM with tuned parameters, using 2 models

### Importer les packages

In [None]:
import numpy as np
import lightgbm as lgb
import pandas as pd
from kaggle.competitions import twosigmanews
import matplotlib.pyplot as plt
import random
from datetime import datetime, date
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import time

### Récupérer les données

In [None]:
# Create the competition environment object used for data access and,
# further down, for submitting predictions.
env = twosigmanews.make_env()

# Fetch the market and news training frames, then take independent working
# copies of both.
market_train_df, news_train_df = env.get_training_data()
market_train = market_train_df.copy()
news_train = news_train_df.copy()

## Data preprocessing

Dans notre cas le volume des données utilisées influence l'apprentissage de notre modèle, appliquer un dropna() va supprimer un grand nombre de lignes, donc nous allons les remplir

In [None]:
# Helper that fills missing values instead of dropping rows
def remp_val_nul(dataframe):
    """Fill the missing values of every column of *dataframe*, in place.

    Columns whose dtype is ``object`` have their missing entries replaced by
    the string ``"other"``; ``int64`` and ``float64`` columns have them
    replaced by the column mean. Columns of any other dtype are left as-is.

    Args:
        dataframe: pandas DataFrame to clean; its columns are reassigned.

    Returns:
        The same ``dataframe`` object, after filling.
    """
    for column in dataframe.columns:
        dtype = dataframe[column].dtype
        if dtype == "object":
            # Categorical value
            dataframe[column] = dataframe[column].fillna("other")
        elif dtype == "int64" or dtype == "float64":
            # Numerical value
            dataframe[column] = dataframe[column].fillna(dataframe[column].mean())
    # The original returned the undefined name `data`, raising NameError.
    return dataframe

In [None]:
# Fill the missing values of the market frame. The helper is named
# `remp_val_nul`; with a hyphen the expression is parsed as a subtraction of
# two undefined names.
market_train_df = remp_val_nul(market_train_df)

Pour pouvoir prendre en considération les assetCodes pour l'apprentissage de notre modèle, il faut les transformer en valeurs numériques discrètes 

In [None]:
# Function playing the same role as a label encoder
def prepro(market_train):
    """Return a preprocessed copy of the market frame.

    The ``time`` column is reduced to ``datetime.date`` values, a new
    ``assetCodeT`` column holds an integer code per distinct ``assetCode``
    (numbered in order of first appearance), and every row containing a
    missing value is dropped.

    Args:
        market_train: DataFrame with a datetime-like ``time`` column and an
            ``assetCode`` column.

    Returns:
        A new DataFrame; the argument itself is left unmodified.
    """
    # Work on a copy: rewriting `time` on the caller's frame turns it into
    # plain date objects, after which `.dt` accessors on that frame fail.
    market_train = market_train.copy()
    market_train['time'] = market_train['time'].dt.date
    code_to_id = {
        code: idx for idx, code in enumerate(market_train['assetCode'].unique())
    }
    market_train['assetCodeT'] = market_train['assetCode'].map(code_to_id)
    return market_train.dropna(axis=0)

In [None]:
# Run prepro on the filled market frame; the result replaces the working copy
# that was stored in market_train when the data was loaded.
market_train = prepro(market_train_df)

Les données antérieures à 2009 présentent des anomalies ; nous ne prenons donc en considération que les données dont la date est postérieure ou égale au 2009/01/01

In [None]:
# Keep only the rows dated on or after 1 January 2009.
is_recent = market_train['time'] >= date(2009, 1, 1)
market_train = market_train.loc[is_recent]

Dictionnaire associant chaque colonne du dataframe News à ses fonctions d'agrégation

In [1]:
# Aggregations applied to each news column when news rows are grouped.
# Most columns share the same four summary statistics; `urgency` and
# `takeSequence` have their own lists. Insertion order is kept identical to
# the hand-written mapping.
_FOUR_STATS = ('min', 'max', 'mean', 'std')
_FOUR_STAT_COLUMNS = [
    'bodySize',
    'wordCount',
    'sentenceCount',
    'companyCount',
    'marketCommentary',
    'relevance',
    'sentimentNegative',
    'sentimentNeutral',
    'sentimentPositive',
    'sentimentWordCount',
] + [
    f'{prefix}{window}'
    for prefix in ('noveltyCount', 'volumeCounts')
    for window in ('12H', '24H', '3D', '5D', '7D')
]

news_cols_agg = {'urgency': ['min', 'count'], 'takeSequence': ['max']}
# Each column gets its own list object, as in a literal dict.
news_cols_agg.update(
    (column, list(_FOUR_STATS)) for column in _FOUR_STAT_COLUMNS
)

Fonction pour joindre les données de Market et News en un seul dataframe

In [None]:
def join_market_news(market_train_df, news_train_df):
    """Attach per-(time, assetCode) news aggregates to the market rows.

    Every news row is expanded into one row per asset code found in its
    ``assetCodes`` string, the expanded rows are grouped by ``time`` and
    ``assetCode`` and aggregated according to the module-level
    ``news_cols_agg`` mapping, and the aggregates are joined onto the market
    frame.

    Args:
        market_train_df: market DataFrame with ``time`` and ``assetCode``
            columns.
        news_train_df: news DataFrame with ``time``, ``assetCodes`` and every
            column listed in ``news_cols_agg``. Its ``assetCodes`` column is
            overwritten in place with lists of codes.

    Returns:
        ``market_train_df`` joined with the float32 aggregate columns, which
        are named ``<column>_<aggregation>``.
    """
    # Imported here because the file's import cell does not provide `chain`.
    from itertools import chain

    # Pull every quoted asset code out of the assetCodes string of each row.
    news_train_df['assetCodes'] = news_train_df['assetCodes'].str.findall(r"'([\w\./]+)'")
    assetCodes_expanded = list(chain.from_iterable(news_train_df['assetCodes']))
    # Repeat each news index label once per asset code found on that row.
    assetCodes_index = news_train_df.index.repeat(news_train_df['assetCodes'].apply(len))

    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})
    news_cols = ['time', 'assetCodes'] + sorted(news_cols_agg.keys())
    news_train_df_expanded = pd.merge(
        df_assetCodes,
        news_train_df[news_cols],
        left_on='level_0',
        right_index=True,
        suffixes=('', '_old'),
    )
    # Drop references to large intermediates as soon as they are unused.
    del news_train_df, df_assetCodes
    news_train_df_aggregated = news_train_df_expanded.groupby(['time', 'assetCode']).agg(news_cols_agg)
    del news_train_df_expanded
    news_train_df_aggregated = news_train_df_aggregated.apply(np.float32)
    # Flatten the (column, aggregation) MultiIndex into "column_aggregation".
    news_train_df_aggregated.columns = [
        '_'.join(col).strip() for col in news_train_df_aggregated.columns.values
    ]
    market_train_df = market_train_df.join(news_train_df_aggregated, on=['time', 'assetCode'])
    del news_train_df_aggregated

    return market_train_df

In [None]:
# Build X and y
def get_xy(market_train_df, news_train_df, le=None):
    """Build the feature frame and the target series for training.

    Args:
        market_train_df: market DataFrame containing the
            ``returnsOpenNextMktres10`` column.
        news_train_df: news DataFrame, passed through to ``get_x``.
        le: optional ``(le_assetCode, le_assetName)`` encoders to reuse;
            when ``None``, ``get_x`` builds new ones.

    Returns:
        ``(x, y, le)``: the features from ``get_x``, the target clipped to
        ``[-1, 1]``, and the encoder pair returned by ``get_x``.
    """
    # Forward `le`; it was previously accepted but silently ignored.
    x, le = get_x(market_train_df, news_train_df, le)
    y = market_train_df['returnsOpenNextMktres10'].clip(-1, 1)
    return x, y, le


def label_encode(series, min_count):
    """Map each sufficiently frequent value of *series* to an integer code.

    Values are numbered from 0 in the order returned by ``value_counts``;
    values occurring fewer than ``min_count`` times get no entry.

    Returns:
        dict from value to integer code.
    """
    counts = series.value_counts()
    frequent = counts.index[counts >= min_count]
    return dict(zip(frequent, range(len(frequent))))


def get_x(market_train_df, news_train_df, le=None):
    """Build the model feature frame from market and news data.

    Both ``time`` columns are normalised to daily granularity (in place on
    the given frames, which therefore need datetime-like ``time`` columns),
    news aggregates are joined to the market rows, ``assetCode`` and
    ``assetName`` are replaced by integer codes, target/universe columns are
    removed and ``dayofweek``/``month`` are derived from ``time`` before it is
    dropped.

    Args:
        market_train_df: market DataFrame.
        news_train_df: news DataFrame.
        le: optional ``(le_assetCode, le_assetName)`` dict pair; when
            ``None`` the encoders are built from the joined data.

    Returns:
        ``(x, (le_assetCode, le_assetName))``.
    """
    # Per the author: the market opens and closes at 22h, so anything after
    # 22h is treated as belonging to the next day.
    news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22, 'h')).dt.ceil('1D')
    market_train_df['time'] = market_train_df['time'].dt.floor('1D')

    # Join market and news
    x = join_market_news(market_train_df, news_train_df)

    # Encode assetCode and assetName
    if le is None:
        le_assetCode = label_encode(x['assetCode'], min_count=10)
        le_assetName = label_encode(x['assetName'], min_count=5)
    else:
        le_assetCode, le_assetName = le
    # Values absent from the encoders are mapped to -1.
    x['assetCode'] = x['assetCode'].map(le_assetCode).fillna(-1).astype(int)
    x['assetName'] = x['assetName'].map(le_assetName).fillna(-1).astype(int)

    # Either column may be absent; ignore only "label not found" rather than
    # swallowing every exception.
    x.drop(
        columns=['returnsOpenNextMktres10', 'universe'],
        errors='ignore',
        inplace=True,
    )

    x['dayofweek'], x['month'] = x.time.dt.dayofweek, x.time.dt.month
    x.drop(columns='time', inplace=True)
    for bogus_col in ['marketCommentary_min', 'marketCommentary_max']:
        x[bogus_col] = x[bogus_col].astype(float)

    return x, (le_assetCode, le_assetName)

In [None]:
# Build the training features, the target and the fitted label encoders.
# NOTE(review): this passes market_train_df, not the date-filtered
# market_train built earlier — confirm that is intended.
X, y, le = get_xy(market_train_df, news_train_df)

In [None]:
# Min-max scaling of X: 1 - (max - x) / (max - min) per column. mins, maxs
# and rng are kept as module-level names.
mins = np.min(X, axis=0)
maxs = np.max(X, axis=0)
rng = maxs - mins
X = 1 - ((maxs - X) / rng)

# Hold out 25% of the rows for validation. The split must use the target `y`
# and produce the names consumed by the Dataset constructors below (the
# original referenced an undefined `r` and never defined y_train / X_valid /
# y_valid).
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=99
)

dtrain = lgb.Dataset(X_train.values, y_train, free_raw_data=False)
dvalid = lgb.Dataset(X_valid.values, y_valid, free_raw_data=False)

Entraînement de deux modèles dont les hyperparamètres x_1 et x_2 proviennent de deux recherches d'hyperparamètres (parameter tuning) différentes

In [None]:
# Tuned values, in the order given by _TUNED_KEYS below.
x_1 = [0.19000424246380565, 2452, 212, 328, 202]
x_2 = [0.19016805202090095, 2583, 213, 312, 220]

# Parameter names that the positions of x_1 / x_2 correspond to.
_TUNED_KEYS = (
    'learning_rate',
    'num_leaves',
    'min_data_in_leaf',
    'num_iteration',
    'max_bin',
)

# NOTE(review): 'binary' is used while the target built in get_xy is a clipped
# continuous return, and 'num_iteration' is set here while lgb.train is also
# given num_boost_round — confirm both are intended.
params_1 = {
    'task': 'train',
    'boosting_type': 'dart',
    'objective': 'binary',
    **dict(zip(_TUNED_KEYS, x_1)),
    'verbose': 1,
}

params_2 = {
    'task': 'train',
    'boosting_type': 'dart',
    'objective': 'binary',
    **dict(zip(_TUNED_KEYS, x_2)),
    'verbose': 1,
}


In [None]:
# Both boosters are trained on the same data with the same training options;
# only the parameter dict differs.
train_kwargs = dict(
    num_boost_round=100,
    valid_sets=dvalid,
    early_stopping_rounds=5,
)

gbm_1 = lgb.train(params_1, dtrain, **train_kwargs)
gbm_2 = lgb.train(params_2, dtrain, **train_kwargs)

Faire la prédiction

In [None]:
def make_predictions(predictions_template_df, market_obs_df, news_obs_df, le):
    """Fill ``confidenceValue`` of the template with the blended prediction.

    Features are built with ``get_x`` using the training encoders ``le``,
    scaled with the module-level training statistics ``maxs`` / ``rng``, and
    scored by ``gbm_1`` and ``gbm_2``; the mean of the two predictions,
    clipped to ``[-1, 1]``, is written into the template in place.

    Args:
        predictions_template_df: DataFrame whose ``confidenceValue`` column
            is overwritten.
        market_obs_df: market observations for the prediction day.
        news_obs_df: news observations for the prediction day.
        le: ``(le_assetCode, le_assetName)`` pair produced during training.

    Returns:
        None.
    """
    x, _ = get_x(market_obs_df, news_obs_df, le)
    # The boosters were trained on min-max scaled features in training column
    # order, so the same transform must be applied before predicting.
    x = x.reindex(columns=maxs.index)
    x = 1 - ((maxs - x) / rng)
    features = x.values
    blended = (gbm_1.predict(features) + gbm_2.predict(features)) / 2
    # NOTE(review): with the 'binary' objective the boosters presumably output
    # probabilities in [0, 1]; this clip would not recentre them around 0 —
    # confirm whether a 2*p - 1 mapping is wanted.
    predictions_template_df['confidenceValue'] = np.clip(blended, -1, 1)

In [None]:
# Iterate over the prediction days supplied by the environment; each item
# carries the market frame, the news frame and the template to fill.
days = env.get_prediction_days()

for day in days:
    market_obs_df, news_obs_df, predictions_template_df = day
    make_predictions(predictions_template_df, market_obs_df, news_obs_df, le)
    print(predictions_template_df)
    # Hand the filled template back to the environment.
    env.predict(predictions_template_df)
print('Done!')

In [None]:

    
# Ask the environment to write the submission file, then load
# "submission.csv" into a DataFrame for inspection.
env.write_submission_file()
sub  = pd.read_csv("submission.csv")