## Example

```python
# Truthy -> work on a small slice of the data so the whole notebook runs quickly
toy = 1

from kaggle.competitions import twosigmanews
# make_env() can only be called once, so hold on to the returned handle
env = twosigmanews.make_env()
print('Done!')

market_train_df, news_train_df = env.get_training_data()

market_train_df.shape, news_train_df.shape

# Keep only the last rows of each frame for memory reasons;
# the toy setting keeps far fewer rows than the full setting
n_market_rows, n_news_rows = (100_000, 300_000) if toy else (3_000_000, 6_000_000)
market_train_df = market_train_df.tail(n_market_rows)
news_train_df = news_train_df.tail(n_news_rows)

import lightgbm as lgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import chain

%matplotlib inline

news_cols_agg = {
    'urgency': ['min', 'count'],
    'takeSequence': ['max'],
    'bodySize': ['min', 'max', 'mean', 'std'],
    'wordCount': ['min', 'max', 'mean', 'std'],
    'sentenceCount': ['min', 'max', 'mean', 'std'],
    'companyCount': ['min', 'max', 'mean', 'std'],
    'marketCommentary': ['min', 'max', 'mean', 'std'],
    'relevance': ['min', 'max', 'mean', 'std'],
    'sentimentNegative': ['min', 'max', 'mean', 'std'],
    'sentimentNeutral': ['min', 'max', 'mean', 'std'],
    'sentimentPositive': ['min', 'max', 'mean', 'std'],
    'sentimentWordCount': ['min', 'max', 'mean', 'std'],
    'noveltyCount12H': ['min', 'max', 'mean', 'std'],
    'noveltyCount24H': ['min', 'max', 'mean', 'std'],
    'noveltyCount3D': ['min', 'max', 'mean', 'std'],
    'noveltyCount5D': ['min', 'max', 'mean', 'std'],
    'noveltyCount7D': ['min', 'max', 'mean', 'std'],
    'volumeCounts12H': ['min', 'max', 'mean', 'std'],
    'volumeCounts24H': ['min', 'max', 'mean', 'std'],
    'volumeCounts3D': ['min', 'max', 'mean', 'std'],
    'volumeCounts5D': ['min', 'max', 'mean', 'std'],
    'volumeCounts7D': ['min', 'max', 'mean', 'std']
}
```

```python

# None means no label encoders have been fitted yet
le = None
# Split date into before and after 22h (the time used in train data)
# E.g: 2007-03-07 23:26:39+00:00 -> 2007-03-08 00:00:00+00:00 (next day)
#      2009-02-25 21:00:50+00:00 -> 2009-02-25 00:00:00+00:00 (current day)
news_train_df['time'] = (news_train_df['time'] - np.timedelta64(22, 'h')).dt.ceil('1D')

# Round time of market_train_df to 0h of the current day
market_train_df['time'] = market_train_df['time'].dt.floor('1D')

# --- Join market and news ---

# Fix asset codes (str -> list): extract every single-quoted code from the string.
# A raw string is used so the regex escapes (\w, \.) reach the regex engine untouched
# instead of being treated as (invalid) Python string escapes.
news_train_df['assetCodes'] = news_train_df['assetCodes'].str.findall(r"'([\w\./]+)'")

# Expand assetCodes: one entry per (news row, asset code) pair
assetCodes_expanded = list(chain.from_iterable(news_train_df['assetCodes']))
assetCodes_index = news_train_df.index.repeat(news_train_df['assetCodes'].apply(len))

assert len(assetCodes_index) == len(assetCodes_expanded)
df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})

# Create expanded news (each news row is repeated once per asset code it mentions)
news_cols = ['time', 'assetCodes'] + sorted(news_cols_agg.keys())
news_train_df_expanded = pd.merge(
    df_assetCodes,
    news_train_df[news_cols],
    left_on='level_0',
    right_index=True,
    suffixes=('', '_old'),
)

# Free memory
del news_train_df, df_assetCodes

# Aggregate numerical news features per (day, asset code)
news_train_df_aggregated = news_train_df_expanded.groupby(['time', 'assetCode']).agg(news_cols_agg)

# Free memory
del news_train_df_expanded

# Convert to float32 to save memory
news_train_df_aggregated = news_train_df_aggregated.astype(np.float32)

# Flatten the (column, statistic) MultiIndex into names such as 'bodySize_mean'
news_train_df_aggregated.columns = ['_'.join(col).strip() for col in news_train_df_aggregated.columns.values]

news_train_df_aggregated.head(1)

# Join with train: market rows without news keep NaN in the news columns
market_train_df = market_train_df.join(news_train_df_aggregated, on=['time', 'assetCode'])

# Free memory
del news_train_df_aggregated

#return market_train_df
# Work on a copy: market_train_df is still needed below for the target column
x = market_train_df.copy()
#del market_train_df

# If no label encoders were supplied, build them from this data
if le is None:
    # Asset codes with at least `min_count` rows get consecutive integers
    # (value_counts orders them from most to least frequent)
    series = x['assetCode']
    min_count = 10
    vc = series.value_counts()
    le_assetCode = {c: i for i, c in enumerate(vc.index[vc >= min_count])}
    # Same thing for the asset names, with a lower threshold
    series = x['assetName']
    min_count = 5
    vc = series.value_counts()
    le_assetName = {c: i for i, c in enumerate(vc.index[vc >= min_count])}
else:
    # 'unpack' label encoders
    le_assetCode, le_assetName = le

# Map asset codes and names to their number; anything not in the encoder becomes -1
x['assetCode'] = x['assetCode'].map(le_assetCode).fillna(-1).astype(int)
x['assetName'] = x['assetName'].map(le_assetName).fillna(-1).astype(int)

# Remove the target and the universe flag from the features when present.
# errors='ignore' skips missing columns without hiding unrelated failures.
x.drop(columns=['returnsOpenNextMktres10', 'universe'], inplace=True, errors='ignore')

# Calendar features derived from the timestamp, which is then dropped
x['dayofweek'], x['month'] = x.time.dt.dayofweek, x.time.dt.month
x.drop(columns='time', inplace=True)
#    x.fillna(-1000,inplace=True)

# Fix some mixed-type columns
for bogus_col in ['marketCommentary_min', 'marketCommentary_max']:
    x[bogus_col] = x[bogus_col].astype(float)

# return x, (le_assetCode, le_assetName)

# Rebind instead of copying again: `x` is already a private copy of the market frame
X = x
del x

le = (le_assetCode, le_assetName)

# Target: the 'returnsOpenNextMktres10' column clipped to [-1, 1]
y = market_train_df['returnsOpenNextMktres10'].clip(-1, 1)
```

```python 

# Hold on to the universe flag and the timestamps before the market frame is released
universe = market_train_df['universe']
time = market_train_df['time']

# Free memory
del market_train_df

# Positional split: the first 80% of the rows train, the remaining rows validate
n_train = int(X.shape[0] * 0.8)

X_train, X_valid = X.iloc[:n_train], X.iloc[n_train:]
y_train, y_valid = y.iloc[:n_train], y.iloc[n_train:]

# Validation keeps only rows with universe > 0, which is what the metric is computed on
u_valid = universe.iloc[n_train:] > 0
t_valid = time.iloc[n_train:][u_valid]
X_valid = X_valid[u_valid]
y_valid = y_valid[u_valid]
del u_valid

# Create lgb datasets
train_cols = X.columns.tolist()
categorical_cols = []  # ['assetCode', 'assetName', 'dayofweek', 'month']

# Note: y data is expected to be a pandas Series, as `sigma_score` uses its group_by function
dataset_kwargs = dict(feature_name=train_cols, categorical_feature=categorical_cols, free_raw_data=False)
dtrain = lgb.Dataset(X_train.values, y_train, **dataset_kwargs)
dvalid = lgb.Dataset(X_valid.values, y_valid, **dataset_kwargs)

# 'Inject' the per-row day ids so sigma_score can read them without globals
dvalid.params = {'extra_time': t_valid.factorize()[0]}

lgb_params = {
    'objective': 'regression_l1',
    'learning_rate': 0.1,
    'num_leaves': 127,
    'max_depth': -1,
    # 'min_data_in_leaf': 1000,
    # 'min_sum_hessian_in_leaf': 10,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'feature_fraction': 0.5,
    'lambda_l1': 0.0,
    'lambda_l2': 1.0,
    'metric': 'None',  # Ignore the loss objective as metric and use sigma_score instead
    'seed': 42,  # Change for better luck! :)
}

def sigma_score(preds, valid_data):
    """Custom LightGBM evaluation function: mean / std of the per-day sums of pred * label.

    Parameters
    ----------
    preds : array-like
        Model predictions, one per validation row.
    valid_data : object
        Validation dataset. Must expose ``get_label()`` and carry the per-row
        day ids under ``params['extra_time']``.

    Returns
    -------
    tuple
        ``('sigma_score', score, True)`` -- metric name, value, and the
        higher-is-better flag.

    Notes
    -----
    No universe weighting is applied here because the validation rows were
    already restricted to universe > 0 before the dataset was built.
    """
    day_ids = np.asarray(valid_data.params['extra_time'])
    # get_label() may hand back a plain numpy array rather than a pandas Series,
    # so build the Series explicitly before grouping.
    labels = np.asarray(valid_data.get_label())

    x_t = pd.Series(np.asarray(preds) * labels)

    # Sum per day (grouping is positional: day_ids[i] belongs to row i)
    x_t_sum = x_t.groupby(day_ids).sum()
    spread = x_t_sum.std()

    # With a single day (std is NaN) or identical daily sums (std is 0) the ratio
    # is undefined; report 0.0 instead of NaN/inf so early stopping stays usable.
    score = x_t_sum.mean() / spread if spread > 0 else 0.0

    return 'sigma_score', score, True

# Train with early stopping on the custom sigma_score metric; the per-round
# validation scores are collected in evals_result
evals_result = {}
m = lgb.train(
    lgb_params,
    dtrain,
    num_boost_round=1000,
    valid_sets=(dvalid,),
    valid_names=('valid',),
    verbose_eval=25,
    early_stopping_rounds=100,
    feval=sigma_score,
    evals_result=evals_result,
)

df_result = pd.DataFrame(evals_result['valid'])

# Best round (0-based row index) and its score
best_round_idx = df_result['sigma_score'].idxmax()
best_round_score = df_result['sigma_score'].max()

# Score curve with the best round marked in red
ax = df_result.plot(figsize=(12, 8))
ax.scatter(best_round_idx, best_round_score, marker='+', color='red')

num_boost_round, valid_score = best_round_idx + 1, best_round_score
print(lgb_params)
print(f'Best score was {valid_score:.5f} on round {num_boost_round}')

# Feature importance: default importance type on the left, gain on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 14))
lgb.plot_importance(m, ax=ax[0])
lgb.plot_importance(m, ax=ax[1], importance_type='gain')
fig.tight_layout()

```

---

---

## My Code

``` python






import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')

def preprocessing(market_df, news_df, s=False):
    """Clean the market and news frames and expand the news to one row per asset code.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Market data. Must contain ``time``, ``assetName`` and the columns that
        are log-transformed below. If ``returnsOpenNextMktres10`` is present, a
        boolean ``target`` column (return > 0) is added.
    news_df : pandas.DataFrame
        News data. Must contain ``time``, ``assetCodes`` (a string such as
        ``"{'AAPL.O', 'AAPL.OQ'}"``), the columns that are log-transformed or
        dropped below, and the sentiment / novelty / volume columns that are kept.
    s : bool, default False
        When true, only rows with ``time`` between '2007-01-01' and
        '2007-07-31' are processed.

    Returns
    -------
    (nws, mkt) : tuple of pandas.DataFrame
        ``nws`` has one row per (news item, asset code) with the columns
        ``assetCode``, ``time`` and the selected news features. ``mkt`` is the
        cleaned market frame. The input frames are not modified.
    """
    ## SAMPLE only 2007
    # Always work on explicit copies so the assignments below neither touch the
    # caller's frames nor write into a slice of them.
    if s:
        mkt = market_df.loc[(market_df.time >= '2007-01-01') & (market_df.time <= '2007-07-31')].copy()
        nws = news_df.loc[(news_df.time >= '2007-01-01') & (news_df.time <= '2007-07-31')].copy()
    else:
        mkt = market_df.copy()
        nws = news_df.copy()

    ## TARGET: True if the return is positive, False otherwise.
    # Frames without the return column are passed through without a target.
    if 'returnsOpenNextMktres10' in mkt.columns:
        mkt["target"] = mkt["returnsOpenNextMktres10"] > 0

    ## NORMALIZE
    norm_mkt_labels = ["volume", "open", "close", 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
                       'returnsClosePrevRaw10', 'returnsOpenPrevRaw10']
    mkt = log_transform(mkt, norm_mkt_labels)
    norm_nws_labels = ["bodySize", "sentenceCount", "wordCount", "firstMentionSentence", "sentimentWordCount",
                       'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D',
                       'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D', 'volumeCounts7D']
    nws = log_transform(nws, norm_nws_labels)

    ## TRIM COLUMNS
    drop_mkt_feats = ['assetName']
    mkt = mkt.drop(columns=drop_mkt_feats)

    drop_nws_feats = ['headline', 'sourceId', 'sourceTimestamp', 'firstCreated',
                      'provider', 'subjects', 'audiences', 'headlineTag', 'assetName',
                      'takeSequence']
    nws = nws.drop(columns=drop_nws_feats)

    ## CONSOLIDATE TIME
    # i.e: 2007-01-01 22:00:01+00:00 -> 2007-01-02
    #      2007-01-01 21:59:59+00:00 -> 2007-01-01
    # Parse first (a no-op for columns that are already UTC datetimes), then
    # always consolidate, so string timestamps get the same treatment.
    mkt['time'] = pd.to_datetime(mkt['time'], utc=True)
    nws['time'] = pd.to_datetime(nws['time'], utc=True)
    nws['time'] = (nws['time'] - np.timedelta64(22, 'h')).dt.ceil('1D')
    mkt['time'] = mkt['time'].dt.floor('1D')

    ## EXPLODE the asset codes: one output row per (news row, asset code)
    kept_news_cols = ['time',
                      'sentimentWordCount',
                      'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
                      'noveltyCount12H', 'noveltyCount24H',
                      'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D',
                      'volumeCounts12H', 'volumeCounts24H',
                      'volumeCounts3D', 'volumeCounts5D', 'volumeCounts7D']

    # "{'A.N', 'B.O'}" -> ['A.N', 'B.O']: strip braces, quotes and spaces, split on commas
    strip_table = str.maketrans('', '', "{}' ")
    code_lists = [codes.translate(strip_table).split(',') for codes in nws['assetCodes']]

    print('Beginning to explode asset codes...')
    # Repeat each news row once per code it mentions (positional, so the
    # original index labels do not matter), then attach the flattened codes.
    repeats = np.fromiter((len(codes) for codes in code_lists), dtype=np.int64, count=len(code_lists))
    row_positions = np.repeat(np.arange(len(nws)), repeats)
    exploded = nws[kept_news_cols].iloc[row_positions].reset_index(drop=True)
    exploded.insert(0, 'assetCode', [code for codes in code_lists for code in codes])
    nws = exploded

    return nws, mkt
    
def get_training_dataset(nws, mkt):
    """Merge news and market data and return a train/test split as numpy arrays.

    Parameters
    ----------
    nws : pandas.DataFrame
        Exploded news with ``time``, ``assetCode`` and numeric feature columns.
    mkt : pandas.DataFrame
        Market data with ``time`` (datetime), ``assetCode``, ``target`` and
        feature columns.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        The first 80% of the merged rows form the training set (returned in a
        random order drawn from ``np.random``); the remaining rows form the
        test set in their merged order.
    """
    # One news row per (time, assetCode): the median of every news feature
    news_by_day = nws.groupby(['time', 'assetCode'], as_index=False).median()

    # Join market and news info by time and asset code, then keep only rows
    # that are complete on both sides
    data = pd.merge(mkt, news_by_day, how='outer', on=['time', 'assetCode'])
    data = data.dropna()

    ## FEATURE ENGINEERING
    data['dow'] = data['time'].dt.dayofweek
    data['mnth'] = data['time'].dt.month
    data = data.drop(columns='time')

    # Encode assetCode by frequency rank: 0 is the most frequent code, so a
    # high value means the asset is rare in the dataset.
    frequency_rank = {code: rank for rank, code in enumerate(data['assetCode'].value_counts().index)}
    data['assetCode'] = data['assetCode'].map(frequency_rank)

    ## MODEL PREP
    X = data.reset_index(drop=True)
    y = X['target'].astype(bool)
    X = X.drop(columns='target')

    print('Dasaset Size:', X.shape)

    training_size = int(X.shape[0] * 0.8)
    # Shuffle only the training rows; the test rows keep their order
    shuffle_index = np.random.permutation(training_size)

    # .values replaces DataFrame.as_matrix(), which no longer exists in current pandas
    X_train = X.iloc[:training_size].iloc[shuffle_index].values
    y_train = y.iloc[:training_size].iloc[shuffle_index].values
    X_test = X.iloc[training_size:].values
    y_test = y.iloc[training_size:].values

    return X_train, X_test, y_train, y_test

def make_random_predictions(predictions_df):
    """Fill ``confidenceValue`` in place with uniform random values in [-1, 1).

    Item assignment is used so the column is created when it does not exist
    yet; attribute assignment would only set a Python attribute in that case.
    Returns None; ``predictions_df`` is modified in place.
    """
    predictions_df['confidenceValue'] = 2.0 * np.random.rand(len(predictions_df)) - 1.0
    
def log_transform(df, labels):
    """Log-transform the given columns of ``df`` in place and return ``df``.

    Each column ``c`` becomes ``log(c + 1 - a)``, where ``a`` is the smaller of
    0.001 and the column minimum rounded down to an integer. For a column with
    negative values this shifts the minimum to at least 1 before the log.
    """
    for label in labels:
        column = df[label]
        lowest_whole = int(np.floor(column.min()))
        offset = min(0.001, lowest_whole)
        df[label] = np.log(column + 1 - offset)
    return df


```

```python

# Import the competition module and create the environment.
from kaggle.competitions import twosigmanews
# make_env() can only be called once, so keep the handle!
env = twosigmanews.make_env()
market_train_df, news_train_df = env.get_training_data()
# The result of `get_prediction_days()` can only be iterated once,
# so do not lose it once iteration has started.
days = env.get_prediction_days()

# make_random_predictions(predictions_template_df)
# env.predict(predictions_template_df)
# Pull the first prediction day: market observations, news observations, prediction template
market_obs_df, news_obs_df, predictions_template_df = next(days)

```

```python 

# Preprocess the training data, restricted to the 2007 sample (s=True).
# Reported to take about 10 min to complete.
nws, mkt = preprocessing(market_train_df, news_train_df, s=True)

# Alternative on the observation frames without sampling (reported: less than 5 min)
#nws, mkt = preprocessing(market_obs_df, news_obs_df, False)

```

```python

start = time()
# Keep the returned split: the model-selection code needs X_train / y_train
X_train, X_test, y_train, y_test = get_training_dataset(nws,mkt)
print('Building training/test set:',time()-start, 'seconds')


```

---

In [None]:
# Model Selection
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
# Fit a linear classifier trained with SGD on the training features and labels
sgd_clf.fit(X_train, y_train)


# Measuring Accuracy Using Cross-Validation

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
# shuffle stays at its default (False), so the folds are already deterministic.
# random_state is therefore omitted: recent scikit-learn versions raise a
# ValueError when it is set without shuffle=True.
skfolds = StratifiedKFold(n_splits=10)
for train_index, test_index in skfolds.split(X_train, y_train):
    # Fresh, unfitted copy of the classifier for every fold
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Fold accuracy: share of correct predictions
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))

---

```python
# Get good examples of asset codes that have news on at least 2 different dates
# AND are present in the market data:

#mkt.loc[mkt.assetCode=='CEA.N']
# Asset codes with news dated 2007-01-03
ac = nws.loc[(nws.time>='2007-01-03 00:00:00+00:00')&(nws.time<'2007-01-04 00:00:00+00:00')].assetCode.values

# ... that also have news dated 2007-01-04 or later
uc = nws.loc[(nws.time>='2007-01-04 00:00:00+00:00')&(nws.assetCode.isin(ac))].assetCode.unique()

# Asset codes with more than 5 news rows in total
zzz = nws.assetCode.value_counts()
dc = zzz[zzz>5].index.values
# Set lookup keeps the filter linear; testing membership against the array
# would rescan it for every candidate code
frequent_codes = set(dc)
ass_codes = [code for code in uc if code in frequent_codes]
# First 10 of those codes that also appear in the market data
mkt.loc[mkt.assetCode.isin(ass_codes),'assetCode'].unique()[:10]
```