In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared Plotly template passed to every figure's update_layout():
# Franklin Gothic 12pt text and an 800px figure width.
temp = {
    "layout": go.Layout(
        font={"family": "Franklin Gothic", "size": 12},
        width=800,
    )
}
# Plotly's built-in qualitative colour sequence, indexed by trace position.
colors = px.colors.qualitative.Plotly

In [2]:
# Root folder of the competition data; every file below is read relative to it.
DATA_DIR = "../input/jpx-tokyo-stock-exchange-prediction"

prices = pd.read_csv(f"{DATA_DIR}/supplemental_files/stock_prices.csv", parse_dates=["Date"])
train = pd.read_csv(f"{DATA_DIR}/train_files/stock_prices.csv", parse_dates=["Date"])
stock_list = pd.read_csv(f"{DATA_DIR}/stock_list.csv")

# Work on a copy so the raw supplemental prices stay untouched.
df_pred = prices.copy()

# Close_adj = Close divided by the product of all *earlier* AdjustmentFactor
# values of the same security (1 for a security's first row).
# Rows are used in file order within each security.
# NOTE(review): this assumes the file is in chronological order - confirm.
cum_factor = df_pred.groupby('SecuritiesCode')['AdjustmentFactor'].cumprod()
prior_factor = cum_factor.groupby(df_pred['SecuritiesCode']).shift().fillna(1)
df_pred['Close_adj'] = df_pred['Close'] / prior_factor

In [3]:
# Preview the first rows of df_pred, which now carries the Close_adj column.
df_pred.head()

In [4]:
class parse():
    """
    Namespace of feature builders for a price DataFrame.

    Every function expects a frame that already has a ``Close_adj`` column,
    writes its feature column(s) into that same frame and returns it.
    The functions are called through the class (``parse.moving_average``),
    never through an instance, so they are declared as static methods.
    """

    @staticmethod
    def moving_average(df_pred, window1=5, window2=25):
        """
        Add a short and a long rolling mean of ``Close_adj`` and a crossover flag.

        Columns written:
          * ``mov_av_5``  - rolling mean over ``window1`` rows (min_periods=1)
          * ``mov_av_25`` - rolling mean over ``window2`` rows (min_periods=1)
          * ``crossing_mov_av`` - 1 on a row where short-minus-long is positive
            and was negative on the previous row, -1 for the opposite change,
            0 otherwise.

        The column names keep their ``_5`` / ``_25`` suffix whatever window
        sizes are passed, because the other functions of this class read them
        by those names.
        """
        closes = df_pred['Close_adj']
        df_pred['mov_av_5'] = closes.rolling(window1, min_periods=1).mean()
        df_pred['mov_av_25'] = closes.rolling(window2, min_periods=1).mean()

        spread = df_pred['mov_av_5'] - df_pred['mov_av_25']
        # Shift the numeric spread, not a boolean mask: comparisons against the
        # NaN introduced by shift() are False, so the first row never counts as
        # a crossing and the masks stay plain booleans.
        previous_spread = spread.shift()
        crossed_up = (spread > 0) & (previous_spread < 0)
        crossed_down = (spread < 0) & (previous_spread > 0)
        df_pred['crossing_mov_av'] = np.select([crossed_up, crossed_down], [1, -1], default=0)

        return df_pred

    @staticmethod
    def displaced_moving_average(df_pred):
        """
        Add ``dis_mov_av``: the relative gap between ``Close_adj`` and the long
        moving average, i.e. ``Close_adj / mov_av_25 - 1``.

        Requires the ``mov_av_25`` column, so ``moving_average`` must have been
        applied to the frame first.
        """
        df_pred['dis_mov_av'] = df_pred['Close_adj'] / df_pred['mov_av_25'] - 1
        return df_pred

    @staticmethod
    def divergence(df_pred, window=25):
        """
        Add ``Div``: ``Close_adj`` minus its rolling mean, divided by its
        rolling standard deviation, both taken over ``window`` rows with
        min_periods=1.

        No guard is applied to the division, so rows whose rolling standard
        deviation is missing or zero yield NaN or an infinite value.
        """
        rolling_close = df_pred['Close_adj'].rolling(window, min_periods=1)
        df_pred['Div'] = (df_pred['Close_adj'] - rolling_close.mean()) / rolling_close.std()
        return df_pred

    @staticmethod
    def difference(df_pred):
        """
        Add ``Diff``: the Close-minus-Open move of each row divided by the mean
        of that row's Close and Open.

        The ratio used to be computed and thrown away; it is now stored so the
        function actually contributes a feature.
        """
        mid_price = df_pred[['Close', 'Open']].mean(axis=1)
        df_pred['Diff'] = (df_pred['Close'] - df_pred['Open']) / mid_price
        return df_pred

    @staticmethod
    def difference_MA(df_pred):
        """
        Add ``Diff_MA1`` and ``Diff_MA2``: ``Close_adj`` minus ``mov_av_5`` and
        minus ``mov_av_25`` respectively.

        Requires both moving-average columns, so ``moving_average`` must have
        been applied to the frame first.
        """
        df_pred['Diff_MA1'] = df_pred['Close_adj'] - df_pred['mov_av_5']
        df_pred['Diff_MA2'] = df_pred['Close_adj'] - df_pred['mov_av_25']
        return df_pred

In [5]:
# Feature builders in dependency order: displaced_moving_average and
# difference_MA read the mov_av_* columns that moving_average writes.
feature_steps = (
    parse.moving_average,
    parse.displaced_moving_average,
    parse.divergence,
    parse.difference,
    parse.difference_MA,
)

for feature_step in feature_steps:
    # group_keys=False keeps the original row index on the result. Without it,
    # pandas >= 2.0 prepends SecuritiesCode as an index level, and the next
    # groupby('SecuritiesCode') fails because the name is then both an index
    # level and a column.
    df_pred = df_pred.groupby('SecuritiesCode', group_keys=False).apply(feature_step)

In [6]:
# Preview df_pred after the parse feature functions were applied per SecuritiesCode.
df_pred.head()

In [7]:
# Still assigned in case a later cell reads it; no longer used for plotting.
train_date = train.Date.unique()

# Daily cross-sectional averages over all securities.
daily = train.groupby('Date')
returns = daily['Target'].mean().mul(100).rename('Average Return')
close_avg = daily['Close'].mean().rename('Closing Price')
vol_avg = daily['Volume'].mean().rename('Volume')

fig = make_subplots(rows=3, cols=1,
                    shared_xaxes=True)
for panel, series in enumerate([returns, close_avg, vol_avg]):
    # Take x from the series' own Date index so every value is paired with the
    # date it was averaged over. train.Date.unique() is in order of appearance
    # and only lines up with the date-sorted groupby output when the file is
    # already sorted and has no missing dates.
    fig.add_trace(go.Scatter(x=series.index, y=series, mode='lines',
                             name=series.name, marker_color=colors[panel]),
                  row=panel + 1, col=1)

# Range-selector buttons on the top panel only.
range_buttons = [
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(count=2, label="2y", step="year", stepmode="backward"),
    dict(step="all"),
]
fig.update_xaxes(rangeslider_visible=False,
                 rangeselector=dict(buttons=range_buttons),
                 row=1, col=1)
fig.update_layout(template=temp, title='JPX Market Average Stock Return, Closing Price, and Shares Traded',
                  hovermode='x unified', height=700,
                  yaxis1=dict(title='Stock Return', ticksuffix='%'),
                  yaxis2_title='Closing Price', yaxis3_title='Shares Traded',
                  showlegend=False)
fig.show()

In [8]:
# Normalise whitespace and casing of sector and company names. The .str
# accessor leaves missing values as NaN instead of raising on them.
stock_list['SectorName'] = stock_list['17SectorName'].str.rstrip().str.lower().str.capitalize()
stock_list['Name'] = stock_list['Name'].str.rstrip().str.lower().str.capitalize()
train_df = train.merge(stock_list[['SecuritiesCode','Name','SectorName']], on='SecuritiesCode', how='left')
train_df['Year'] = train_df['Date'].dt.year

# Average Target (in percent) per sector, one Series per year; years are taken
# in reverse order of first appearance in the data.
years = {}
for key in train_df.Year.unique()[::-1]:
    year_rows = train_df[train_df.Year == key]
    years[key] = year_rows.groupby('SectorName')['Target'].mean().mul(100).rename("Avg_return_{}".format(key))
df = pd.concat((years[i].to_frame() for i in years.keys()), axis=1)

# Order sectors by the most recent year present in the data rather than by a
# hard-coded year.
latest_col = "Avg_return_{}".format(train_df.Year.max())
df = df.sort_values(by=latest_col)

# One panel per year column, however many years the data holds.
fig = make_subplots(rows=1, cols=len(df.columns), shared_yaxes=True)
for i, col in enumerate(df.columns):
    x = df[col]
    mask = x<=0
    year_label = col[-4:]
    # Non-positive returns are drawn in red, the rest in green.
    for selected, bar_color in ((mask, 'red'), (~mask, 'green')):
        fig.add_trace(go.Bar(x=x[selected], y=df.index[selected], orientation='h',
                             text=x[selected], texttemplate='%{text:.2f}%', textposition='auto',
                             hovertemplate='Average Return in %{y} Stocks = %{x:.4f}%',
                             marker=dict(color=bar_color, opacity=0.7), name=year_label),
                      row=1, col=i+1)
    # Pad the axis range so the bar labels are not clipped.
    fig.update_xaxes(range=(x.min()-.15,x.max()+.15), title='{} Returns'.format(year_label),
                     showticklabels=False, row=1, col=i+1)
fig.update_layout(template=temp,title='Yearly Average Stock Returns by Sector',
                  hovermode='closest',margin=dict(l=250,r=50),
                  height=600, width=1000, showlegend=False)
fig.show()

In [9]:
# Sectors in reverse of the order used by the sector-return figure.
sectors = df.index[::-1]

# Evenly spaced hues, one per sector. At least 18 points are generated so the
# colours stay as before for up to 17 sectors; beyond that the palette grows
# with the sector count instead of running out of entries. One extra point is
# generated because hue 0 and hue 360 are the same colour.
n_hues = max(18, len(sectors) + 1)
pal = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, n_hues)]

fig = go.Figure()
for i, sector in enumerate(sectors):
    # Targets of every row belonging to this sector, shown in percent.
    y_data = train_df.loc[train_df['SectorName']==sector, 'Target']
    fig.add_trace(go.Box(y=y_data*100, name=sector,
                         marker_color=pal[i], showlegend=False))
fig.update_layout(template=temp, title='Target Distribution by Sector',
                  yaxis=dict(title='Stock Return',ticksuffix='%'),
                  margin=dict(b=150), height=750, width=900)
fig.show()

In [None]:
%%time
# NOTE(review): this cell has no execution count and uses `columns` and
# `train_model`, neither of which is defined in the visible cells. They must be
# defined or imported above this cell for Restart & Run All to work.
# NOTE(review): `df` was last assigned in the sector-return cell (indexed by
# SectorName, Avg_return_* columns), so it has no SecuritiesCode or Target
# column here. Presumably a per-security feature frame was intended - confirm.
#cat_features=list_cat

# One model per security code, keyed by that code.
models = {}
for code, d in df.groupby("SecuritiesCode"):
    # Keep only the rows that have a Target value.
    d = d[~d.Target.isnull()]
    X = d[columns]
    y = d.Target
    model = train_model(X, y)#,cat_features)
    models[code] = model
    # The score is computed on the same X, y that were passed to train_model.
    print(code, model.score(X,y))