In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.metrics import mean_squared_error as mse

In [None]:
import jpx_tokyo_market_prediction
# Create the competition environment and the iterator that yields one batch of
# frames per step (prices, options, financials, trades, secondary_prices,
# sample_prediction); the loop in the next cell consumes it.
# NOTE(review): the Kaggle time-series API is understood to allow make_env()
# only once per kernel session, so re-running this cell may fail — confirm.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
  # --- Feature preparation for the current batch ---------------------------
  # Work on a derived frame instead of mutating the API-supplied `prices`.
  raw_df = (
      prices
      .assign(Date=pd.to_datetime(prices['Date']))
      .sort_values(by='Date', ignore_index=True)
  )
  raw_df = raw_df.assign(
      AdjClose=raw_df['Close'] * raw_df['AdjustmentFactor'],
      AdjOpen=raw_df['Open'] * raw_df['AdjustmentFactor'],
      Diff=raw_df['High'] - raw_df['Low'],  # intraday high-low range
  )
  # Open-to-close gain as a fraction of the open. A zero open would yield
  # +/-inf, which MinMaxScaler rejects, so such rows are turned into NaN
  # (MinMaxScaler ignores NaN when fitting and passes it through).
  raw_df['PercGain'] = (
      (raw_df['AdjClose'] - raw_df['AdjOpen']) / raw_df['AdjOpen']
  ).replace([np.inf, -np.inf], np.nan)

  feature_cols = ['Diff', 'Volume', 'PercGain']
  # Explicit copy: the scaled values are written into `data`, and assigning
  # into a bare column selection of `raw_df` raises SettingWithCopyWarning.
  data = raw_df[['Date', 'SecuritiesCode'] + feature_cols].copy()
  codes = data['SecuritiesCode'].unique()

  # MinMaxScaler scales every column independently, so one fit over the three
  # feature columns gives the same values as three separate single-column fits.
  # NOTE(review): the scaler is refit on every batch, so the 0-100 scale is
  # relative to the rows of this batch only.
  mms = MinMaxScaler(feature_range=(0,100))
  data[feature_cols] = mms.fit_transform(data[feature_cols])

  def ranking_tplus2(data, current_date, days=30, codes=codes):
    """Score every security for `current_date` and rank them, best first.

    For each security code, an LSTM is trained on that security's own history:
    each sample is a flattened window of `days` consecutive rows of
    (Diff, Volume, PercGain) and the target is the PercGain found two rows
    after the row that follows the window. The score ("Ratio") is

        (fitted value for the sample whose target row is `current_date`
         - reference PercGain) / std(PercGain) - training MSE / 100

    Args:
      data: DataFrame with columns Date, SecuritiesCode, Diff, Volume and
        PercGain, ordered by Date within each security.
      current_date: the day to score; compared against `data.Date` with `==`.
      days: number of rows in each input window.
      codes: iterable of security codes to score.

    Returns:
      DataFrame with columns Date, SecuritiesCode, Ratio, Rank, sorted by
      Ratio in descending order; Rank is the 0-based position in that order
      and covers every scored security. Securities that have no row for
      `current_date`, too little history, or zero volatility are reported
      with a printed message and left out.

    NOTE(review): this function is defined inside the batch loop, so it is
    re-created (and its `codes` default re-bound) on every batch.
    """
    columns = ['Date', 'SecuritiesCode', 'Ratio', 'Rank']
    features = ['Diff', 'Volume', 'PercGain']
    target_col = features.index('PercGain')

    def _build_model(n_inputs):
      """Return the compiled two-layer LSTM regressor for (1, n_inputs) inputs."""
      model = Sequential()
      model.add(LSTM(units=64, return_sequences=True, input_shape=(1, n_inputs)))
      model.add(Dropout(0.1))
      model.add(LSTM(units=64, return_sequences=False))
      model.add(Dropout(0.1))
      model.add(Dense(units=32))
      model.add(Dense(units=1))
      model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4), loss='mse')
      return model

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []

    for code in codes:
      history = data[data.SecuritiesCode == code].reset_index(drop=True)
      matches = history.index[history.Date == current_date]
      if len(matches) == 0:
        print('Stock id', code, 'does not have any data for day', current_date)
        continue

      # Index of the training sample whose target row is `current_date`.
      idx = int(matches[0]) - days - 2

      # Fill gaps once: backward first, then forward for trailing gaps.
      values = history[features].bfill().ffill().to_numpy(dtype=float)
      n_samples = len(values) - days - 2

      # A negative idx would silently wrap around to the end of the arrays, and
      # validation_split=0.25 needs at least two samples to leave a non-empty
      # training set.
      if idx < 0 or n_samples < 2 or not np.isfinite(values).all():
        print('Stock id', code, 'does not have enough usable history before day', current_date)
        continue

      volatility = float(np.std(values[:, target_col]))
      if volatility == 0:
        print('Stock id', code, 'has zero PercGain volatility; skipped for day', current_date)
        continue

      # Sample s uses rows s .. s+days-1 (flattened row-major) and targets
      # row s+days+2.
      xtrain = np.stack([values[start:start + days].ravel() for start in range(n_samples)])
      xtrain = xtrain.reshape(n_samples, 1, days * len(features))
      ytrain = values[days + 2:, target_col]

      # One model is built per security; drop the previous model's global Keras
      # state so memory does not grow with the number of securities.
      tf.keras.backend.clear_session()
      model = _build_model(xtrain.shape[2])
      model.fit(xtrain, ytrain, epochs=40, batch_size=64, validation_split=0.25, shuffle=True, verbose=0)

      # A single pass over the training windows provides both the fitted value
      # for `current_date` and the in-sample error used as a penalty.
      fitted = model.predict(xtrain, verbose=0).ravel()

      # Reference value: the target that follows sample idx. When `current_date`
      # is the last available row there is no following target, so the last one
      # is used instead of indexing past the end.
      # NOTE(review): confirm that the *following* target is the intended reference.
      check = ytrain[min(idx + 1, n_samples - 1)]

      penalty = mse(ytrain, fitted) / 100
      ratio = (fitted[idx] - check) / volatility - penalty

      records.append({'Date': current_date, 'SecuritiesCode': code, 'Ratio': float(ratio)})

    ranked = pd.DataFrame(records, columns=columns)
    # sort_values returns a new frame; keep it. mergesort is stable, so ties
    # keep the order of `codes`.
    ranked = ranked.sort_values(by='Ratio', ascending=False, kind='mergesort').reset_index(drop=True)
    ranked['Rank'] = np.arange(len(ranked))
    return ranked

  # Runs once per batch, at loop-body level (not inside the function above).
  # The scored day is the most recent date present in this batch.
  current_date = data['Date'].max()
  prediction = ranking_tplus2(data=data, current_date=current_date)

  # Transfer the scores onto the submission frame so that every requested
  # security receives a unique 0-based rank (0 = highest Ratio). Securities
  # the model could not score get the median score and land mid-table.
  # NOTE(review): assumes `sample_prediction` carries SecuritiesCode and Rank
  # columns and is the frame env.predict() expects — confirm against the API.
  scores = prediction.set_index('SecuritiesCode')['Ratio']
  ratio = sample_prediction['SecuritiesCode'].map(scores).astype(float)
  neutral = ratio.median() if ratio.notna().any() else 0.0
  ratio = ratio.fillna(neutral)
  sample_prediction['Rank'] = ratio.rank(method='first', ascending=False).astype(int) - 1
  env.predict(sample_prediction)