In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from itertools import combinations, product
from functools import partial
from multiprocessing import Pool, Manager, cpu_count
from IPython.display import display_html
from sklearn.utils import shuffle
import plotly.express as px
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import r2_score
import pickle
import time
# Use seaborn's "notebook" plotting context for any figures drawn later.
sns.set_context("notebook")

import warnings
# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")
# Show all columns when displaying DataFrames. The full option key is used on
# purpose: the abbreviated pattern 'max_column' is resolved by pattern matching
# and becomes ambiguous (OptionError) on pandas versions that also define
# 'styler.render.max_columns'.
pd.set_option('display.max_columns', None)

def seed_everything(seed):
    """
    Seed every random source this notebook draws from.

    Args:
        seed (int): value used for PYTHONHASHSEED, the stdlib ``random`` module
            and NumPy's global random state.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The prediction loop calls random.uniform(), so the stdlib generator must be
    # seeded as well; seeding NumPy alone leaves the run non-reproducible.
    random.seed(seed)
    np.random.seed(seed)

SEED=42
seed_everything(SEED)

In [None]:
# Prices: daily stock prices of the training period, with 'Date' parsed to datetime
df_train = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")
df_train["Date"] = pd.to_datetime(df_train["Date"])
# Stocks List: strip the " (Foreign Stock)" suffix from the market segment.
# The suffix is matched literally (regex=False) so the result does not depend on
# the pandas default for `regex`, and no invalid "\(" escape is needed.
stock_list = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
stock_list['NewMarketSegment'] = stock_list['NewMarketSegment'].str.replace(" (Foreign Stock)", "", regex=False)
# Financials: keep only the document/period descriptors
financials = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/financials.csv")[['Date', 'SecuritiesCode', 'TypeOfDocument','TypeOfCurrentPeriod']]
financials["Date"] = pd.to_datetime(financials["Date"])
# Split 'TypeOfDocument' on '_' into positional columns 0, 1, 2, ... placed in front
# of the original columns
financials_train = pd.concat([financials['TypeOfDocument'].str.split('_', expand=True), financials], axis = 1)
# Remove the period markers (1Q/2Q/3Q/FY) from the first part; they are literal
# strings, so regex matching is switched off explicitly
financials_train[0] = (
    financials_train[0]
    .str.replace('1Q', '', regex=False)
    .str.replace('2Q', '', regex=False)
    .str.replace('3Q', '', regex=False)
    .str.replace('FY', '', regex=False)
)
# One-hot encode the document parts and the period type; the raw document string is dropped
financials_train = pd.get_dummies(financials_train, columns=[0,1,2,'TypeOfCurrentPeriod']).drop(columns=['TypeOfDocument'])

In [None]:
def day_simulations(df_temp_prediction, weights, NUM_SIMULATIONS = 10000):
    """
    Build random rankings of one day's predictions and score each of them.

    Every simulation permutes the rows of ``df_temp_prediction``, stores the new
    row position in a ``Rank`` column and computes a spread: the weighted sum of
    'Target' over the first ``len(weights)`` rows minus the weighted sum over the
    last ``len(weights)`` rows (taken from the bottom up), each divided by the
    mean weight.

    Rows are permuted with NumPy's global random state, so ``np.random.seed``
    makes the simulations reproducible.

    Args:
        df_temp_prediction (pd.DataFrame): daily df with Target prediction
        weights (array-like): one weight per portfolio position; its length is
            the portfolio size used on both the long and the short side
        NUM_SIMULATIONS (int): Number of iteration to simulate different scenarios
    Returns:
        (dict): simulation index -> (shuffled df with 'Rank', spread, None)
    Raises:
        ValueError: if ``weights`` is empty or the frame has fewer rows than
            ``len(weights)``.
    """
    # Accept plain lists as documented; .mean() below needs an ndarray.
    weights = np.asarray(weights, dtype=float)
    portfolio_size = len(weights)
    n_rows = len(df_temp_prediction)
    if portfolio_size == 0:
        raise ValueError("weights must contain at least one element")
    if n_rows < portfolio_size:
        raise ValueError(
            f"need at least {portfolio_size} rows to build the portfolios, got {n_rows}"
        )
    weights_mean = weights.mean()  # loop-invariant normaliser

    dict_simulations = {}
    for i in range(NUM_SIMULATIONS):
        # .iloc with a permutation already yields a new frame, so the input is
        # never mutated and no defensive copy is required.
        row_order = np.random.permutation(n_rows)
        df_temp_prediction_sim = df_temp_prediction.iloc[row_order].reset_index(drop=True)
        df_temp_prediction_sim['Rank'] = df_temp_prediction_sim.index.astype(int)

        # Day Spread Estimation
        targets = np.array(df_temp_prediction_sim['Target'])
        w_temp_BUY = (targets[:portfolio_size] * weights).sum() / weights_mean
        # Reverse the tail so the very last row receives the first weight
        w_temp_SHORT = (targets[-portfolio_size:][::-1] * weights).sum() / weights_mean
        spread = w_temp_BUY - w_temp_SHORT

        dict_simulations[i] = (
            df_temp_prediction_sim,
            spread,
            None # placeholder kept so existing callers can still unpack three items
            )

    return dict_simulations

In [None]:
# Create the competition environment and the iterator that the prediction loop
# below consumes day by day; predictions are handed back through `env.predict`.
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
# Rolling price history: training prices without the Target column,
# restricted to the 16 most recent distinct dates.
df_rolling = df_train.drop(columns=['Target'])
df_rolling = df_rolling[
    df_rolling['Date'].isin(
        df_rolling['Date'].drop_duplicates().sort_values(ascending=False).head(16)
    )
]

x = [0,1]
POLYs = [1]  # Polynomial degrees whose forecasts are averaged
LAGs = [3,4,5,6,7]  # History lengths (in days) whose forecasts are averaged
TARGET_SPREAD_VALUE = 0.4 #41853716597983


# Per-day bookkeeping filled by the prediction loop
previous_selection_frames = []
previous_prices_frames = []
real_spread_history = []

for (prices, _, financials, _, _, sample_prediction) in iter_test: # Iterate over days
    # NOTE(review): `financials` rebinds the frame loaded in the data cell and is
    # not used inside this loop.

    # Parse the new day's dates before concatenating, so the rolling frame is
    # sorted on a single datetime 'Date' column rather than a str/Timestamp mix.
    prices_today = prices.copy()
    prices_today['Date'] = pd.to_datetime(prices_today['Date'])
    current_date = prices_today['Date'].iloc[0]

    df_rolling = (
        pd.concat([df_rolling, prices_today])
        .sort_values(["SecuritiesCode", "Date"])
        .reset_index(drop=True)
    )

    # Remember which rows had no Close before the gaps are filled
    df_rolling['is_no_price'] = df_rolling['Close'].isna().astype(int)
    # Forward-fill within each security only: a frame-wide ffill would copy the
    # previous security's last values into the next security's leading gaps.
    fill_columns = [col for col in df_rolling.columns if col != 'SecuritiesCode']
    df_rolling[fill_columns] = df_rolling.groupby('SecuritiesCode')[fill_columns].ffill()

    df_rolling_last = df_rolling.copy()

    # Append Closed price shifted to df (shift_k = Close k trading rows earlier)
    close_by_security = df_rolling_last.groupby('SecuritiesCode')['Close']
    for shift_temp in reversed(range(1, max(LAGs) + 1)):
        df_rolling_last[f"shift_{shift_temp}"] = close_by_security.shift(shift_temp)

    # Explicit copy: columns are added below, which must not target a view
    df_rolling_last_new = df_rolling_last[df_rolling_last['Date'] == current_date].copy()
    df_rolling_last_new['shift_0'] = df_rolling_last_new['Close']

    target_columns = []
    r2_columns = []
    for grado_poly in POLYs: # For each polynomial degree
        for lag in LAGs: # For each lag
            # x = 0 .. lag-1 maps to shift_{lag-1} .. shift_0 (oldest to today)
            baseline_array = np.arange(lag)
            historical_shift_columns = [f'shift_{lag-i-1}' for i in range(lag)]
            temp_array = np.asarray(df_rolling_last_new[historical_shift_columns], dtype=float)

            closed_col = f'closed_poly{grado_poly}_{lag}'
            closed_next_col = f'closed_poly{grado_poly}_{lag}+1'
            r2_col = f'poly{grado_poly}_{lag}_r2'
            target_col = f'target_poly{grado_poly}_lag_{lag}'

            n_rows = len(temp_array)
            close_day_1 = np.full(n_rows, np.nan)
            close_day_2 = np.full(n_rows, np.nan)
            r2_past = np.zeros(n_rows)
            for row_idx, arr in enumerate(temp_array):
                # Incomplete history: leave the forecast as NaN with zero weight
                # instead of feeding NaN into polyfit / r2_score.
                if not np.isfinite(arr).all():
                    continue
                fitted = np.poly1d(np.polyfit(baseline_array, arr, grado_poly))  # fit once, reuse
                close_day_1[row_idx] = fitted(lag)        # Poly fit 1d for day + 1
                close_day_2[row_idx] = fitted(lag + 1)    # Poly fit 1d for day + 2
                r2_past[row_idx] = r2_score(arr, fitted(baseline_array))  # R2 on past

            df_rolling_last_new[closed_col] = close_day_1
            df_rolling_last_new[closed_next_col] = close_day_2
            df_rolling_last_new[r2_col] = r2_past

            # Non-positive extrapolated prices make no sense: fall back to the last
            # known value (today's close, then the day+1 estimate)
            bad_day_1 = df_rolling_last_new[closed_col] <= 0
            df_rolling_last_new.loc[bad_day_1, closed_col] = df_rolling_last_new.loc[bad_day_1, 'shift_0']
            bad_day_2 = df_rolling_last_new[closed_next_col] <= 0
            df_rolling_last_new.loc[bad_day_2, closed_next_col] = df_rolling_last_new.loc[bad_day_2, closed_col]

            # Calculate current estimated Target: relative change from day+1 to day+2
            df_rolling_last_new[target_col] = (
                df_rolling_last_new[closed_next_col] - df_rolling_last_new[closed_col]
            ) / df_rolling_last_new[closed_col]

            target_columns.append(target_col)
            r2_columns.append(r2_col)

    # Ensemble of target results by weighted average on R2 of past data.
    # Non-finite forecasts are excluded from both sums; a row without any usable
    # forecast ends up as 0/0 = NaN and is set to 0 further down.
    target_matrix = np.asarray(df_rolling_last_new[target_columns], dtype=float)
    r2_matrix = np.asarray(df_rolling_last_new[r2_columns], dtype=float)
    usable = np.isfinite(target_matrix) & np.isfinite(r2_matrix)
    target_matrix_w_weights = np.where(usable, target_matrix * r2_matrix, 0.0).sum(axis = 1)
    sum_weights = np.where(usable, r2_matrix, 0.0).sum(axis = 1)
    df_rolling_last_new['predicted_target'] = target_matrix_w_weights/sum_weights

    df_rolling_last_new.loc[df_rolling_last_new['is_no_price'] == 1, 'predicted_target'] = 0.0 # Without Close Price set to 0

    target_map = df_rolling_last_new.groupby('SecuritiesCode')['predicted_target'].mean()

    ## Apply the estimated targets for each stock
    sample_prediction_new = sample_prediction.copy()
    sample_prediction_new['Target'] = (
        sample_prediction_new['SecuritiesCode']
        .map(target_map.to_dict())
        .replace([np.inf, -np.inf], np.nan)
    )
    sample_prediction_new = sample_prediction_new.fillna(0) # Set to 0 if mapping fails
    sample_prediction_new = sample_prediction_new.drop(columns = ['Rank']) # Removal of the Rank dummy

    # Simulation of n scenarios to find the one with the best ranking (with respect to best spread analysis)
    dict_simulations = day_simulations(sample_prediction_new, weights = np.linspace(start=2, stop=1, num=200), NUM_SIMULATIONS = 10000) # Increase the number of simulations for a better result (slower execution)

    # Selection of the simulation whose spread is closest to a target value.
    # NOTE(review): the target is re-drawn from U(0.3, 0.5) for every simulation and
    # TARGET_SPREAD_VALUE is never used -- confirm this randomisation is intended.
    spread_simulations = [abs(v[1] - random.uniform(0.3,0.5)) for v in dict_simulations.values()]
    best_idx_list = np.argsort(spread_simulations)
    best_combination = dict_simulations[best_idx_list[0]][0] # Pick the best combination of spread

    # Apply Rank
    sample_prediction['Rank'] = sample_prediction["SecuritiesCode"].map(pd.Series(best_combination.Rank.values, index=best_combination.SecuritiesCode).to_dict())

    # Update environment with predicted Rank
    env.predict(sample_prediction)

    previous_prices_frames.append(prices.copy()[['Date', 'SecuritiesCode', 'Close']])
    previous_selection_frames.append(sample_prediction.copy())
    
    