In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
from tqdm import tqdm
import time
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [9]:
# --- Configuration: which index to analyse ---
index_code = '000852.XSHG'
index_short_code = 'zz1000'

# Daily index prices, restricted to the target index.
# .copy() detaches the filtered rows from the full frame, so the column
# assignments made on `index_price` write to an independent frame instead of
# triggering SettingWithCopyWarning.
index_price = pd.read_csv('index_price_1d.csv')
index_price = index_price[index_price['order_book_id'] == index_code].copy()
index_price['date'] = pd.to_datetime(index_price['date'])

# Index constituent weights: columns are renamed positionally and the date
# column is parsed from YYYYMMDD.
index_weight = pd.read_csv(f'{index_short_code}_weight.csv', index_col=0)
index_weight.columns = ['order_book_id', 'weight', 'date']
index_weight['date'] = pd.to_datetime(index_weight['date'], format='%Y%m%d')

# Daily stock prices.
stock_price = pd.read_csv(f'{index_short_code}_1d.csv', index_col=0)
stock_price['date'] = pd.to_datetime(stock_price['date'])

# Sector map pickle: {sector: iterable of stock ids}.
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
# The context manager closes the file handle (the bare open() never did).
with open('sec_map.pkl', 'rb') as sec_map_file:
    sector_map = pickle.load(sec_map_file)

# Invert to {stock id: sector}. If a stock is listed under several sectors,
# the sector that comes last in `sector_map` wins.
stk_sec_map = {stk: sec for sec, stk_list in sector_map.items() for stk in stk_list}

# Left merge keeps every price row; `weight` is NaN where no weight row
# exists for that (stock, date).
merged = stock_price.merge(index_weight, on=['order_book_id', 'date'], how='left')

# Sector label per row; stocks missing from the sector map get None.
merged['GICS'] = merged['order_book_id'].map(stk_sec_map.get)

# Chronological order, heaviest weight first within a date.
merged = merged.sort_values(by=['date', 'weight'], ascending=[True, False])

# Per-stock simple return of `close` versus that stock's previous row.
merged['return'] = merged.groupby('order_book_id')['close'].pct_change()


In [10]:
# Number of most recent rows per stock used for each correlation estimate;
# the same number of leading dates is skipped before the first estimate.
CORR_WINDOW = 20
# Window of the rolling mean applied to the daily correlation series.
SMOOTH_WINDOW = 20

# Loop-invariant list of evaluation dates. unique() keeps order of appearance,
# so this assumes `merged` is sorted by date.
eval_dates = merged['date'].unique()[CORR_WINDOW:]

industry_corr = {}
# dropna(): a null sector label can never satisfy the equality filter below,
# so iterating over it would only produce an all-NaN column.
for industry in merged['GICS'].dropna().unique():
    # All rows of this sector. The sector label is derived from the stock id,
    # so pre-filtering by sector keeps every row of the stocks selected below.
    sector_rows = merged[merged['GICS'] == industry]
    # Rows where the stock carries an index weight (i.e. is a constituent).
    sector_members = sector_rows[sector_rows['weight'].notna()]

    # {evaluation date: mean pairwise correlation within the sector}
    corr_dict = {}
    for date_corr in eval_dates:
        stock_list = sector_members.loc[
            sector_members['date'] == date_corr, 'order_book_id'
        ].unique()

        # Fewer than two stocks means there is no pair to correlate.
        if len(stock_list) < 2:
            corr_dict[date_corr] = np.nan
            continue

        # Last CORR_WINDOW rows per stock up to the evaluation date, reshaped
        # to a date x stock matrix of returns.
        returns_wide = (
            sector_rows[sector_rows['order_book_id'].isin(stock_list)
                        & (sector_rows['date'] <= date_corr)]
            .groupby('order_book_id')
            .tail(CORR_WINDOW)
            .pivot(index='date', columns='order_book_id', values='return')
        )

        # Drop self-correlations by position (the diagonal), not by value:
        # matching on "== 1" also removes genuine perfectly correlated pairs
        # and misses diagonal entries that are not exactly 1.0.
        corr_matrix = returns_wide.corr()
        off_diagonal = corr_matrix.to_numpy(dtype=float)[
            ~np.eye(len(corr_matrix), dtype=bool)
        ]
        off_diagonal = off_diagonal[~np.isnan(off_diagonal)]

        corr_dict[date_corr] = off_diagonal.mean() if off_diagonal.size else np.nan

    industry_corr[industry] = corr_dict

# Rows are evaluation dates, columns are sectors.
industry_corr_df = pd.DataFrame(industry_corr)

industry_corr_df.to_csv(f'industry_corr_{index_short_code}.csv')


# Reload the file just written (the date index comes back as strings) and
# order the columns as in `sector_map`.
industry_corr_df = pd.read_csv(f'industry_corr_{index_short_code}.csv', index_col=0)
industry_corr_df = industry_corr_df[list(sector_map.keys())]

# Smooth each sector's series with a SMOOTH_WINDOW-row rolling mean; the first
# SMOOTH_WINDOW - 1 rows are NaN.
industry_corr_df = industry_corr_df.rolling(SMOOTH_WINDOW).mean()
industry_corr_df

Unnamed: 0,Energy,Materials,ConsumerDiscretionary,ConsumerStaples,HealthCare,Financials,RealEstate,InformationTechnology,TelecommunicationServices,Utilities,Industrials
2020-02-07,,,,,,,,,,,
2020-02-10,,,,,,,,,,,
2020-02-11,,,,,,,,,,,
2020-02-12,,,,,,,,,,,
2020-02-13,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2025-02-24,0.551424,0.359082,0.344063,0.431107,0.414060,0.460395,0.441858,0.468235,0.513580,0.427585,0.373851
2025-02-25,0.544169,0.344849,0.333769,0.423531,0.406627,0.439206,0.441933,0.459101,0.498507,0.414583,0.358215
2025-02-26,0.539919,0.331908,0.325289,0.419190,0.399967,0.419077,0.446676,0.451574,0.485109,0.402127,0.343273
2025-02-27,0.535957,0.318816,0.315914,0.415415,0.393105,0.400900,0.447133,0.443920,0.470637,0.391289,0.328710


In [None]:
# Line chart with one trace per column of the smoothed correlation frame.
colors = px.colors.qualitative.Plotly
fig = px.line(
    industry_corr_df,
    x=industry_corr_df.index,
    y=industry_corr_df.columns,
)

# Titles, font and a fixed 1200x500 canvas, applied in one layout update.
fig.update_layout(
    title="Industry Correlation",
    xaxis_title="Date",
    yaxis_title="Correlation",
    legend_title="Legend Title",
    font={
        "family": "Courier New, monospace",
        "size": 18,
        "color": "RebeccaPurple",
    },
    autosize=False,
    width=1200,
    height=500,
)

# Recolor the traces, cycling through the palette when there are more traces
# than colors.
for position in range(len(fig.data)):
    fig.data[position].line.color = colors[position % len(colors)]

fig.show()

# Static export of the figure.
fig.write_image("ind_corr(20d).png")

# Stacked area chart of each sector's share of total index weight per date.
# Only rows that carry an index weight are used.
daily_GICS_weight = (
    merged.loc[merged['weight'].notna()]
    .groupby(['date', 'GICS'])['weight']
    .sum()
    .reset_index()
)
# Normalise so that the sector weights of each date sum to 1.
date_totals = daily_GICS_weight.groupby('date')['weight'].transform('sum')
daily_GICS_weight['weight'] = daily_GICS_weight['weight'] / date_totals

fig = px.area(daily_GICS_weight, x='date', y='weight', color='GICS', line_group='GICS')
fig.update_layout(
    title=f'{index_short_code} Sector Weight',
    xaxis_title='Date',
    yaxis_title='Weight',
)
fig.show()



# Stacked area chart of the number of weighted stocks in each sector per date.
# The result gets its own name and a descriptive column: storing counts in
# `daily_GICS_weight` would overwrite the weight frame with a different quantity.
daily_GICS_count = (
    merged[merged['weight'].notna()]
    .groupby(['date', 'GICS'])['order_book_id']
    .count()
    .reset_index(name='stock_count')
)

fig = px.area(daily_GICS_count, x='date', y='stock_count', color='GICS', line_group='GICS')
fig.update_layout(title=f'{index_short_code} Sector Stock Count', xaxis_title='Date', yaxis_title='Stock Count')
fig.show()

# Idea: financial-sector stock outperformance within the sector -> long/short.

In [None]:

# Number of most recent rows per stock used for each correlation estimate;
# the same number of leading dates is skipped before the first estimate.
CORR_WINDOW = 20

# Loop-invariant inputs. unique() keeps order of appearance, so this assumes
# `merged` is sorted by date.
eval_dates = merged['date'].unique()[CORR_WINDOW:]
# Rows where the stock carries an index weight (i.e. is a constituent).
constituent_rows = merged[merged['weight'].notna()]

# {evaluation date: mean pairwise correlation across all constituents}
corr_dict = {}
for date_corr in eval_dates:

    stock_list = constituent_rows.loc[
        constituent_rows['date'] == date_corr, 'order_book_id'
    ].unique()

    # Fewer than two stocks means there is no pair to correlate.
    if len(stock_list) < 2:
        corr_dict[date_corr] = np.nan
        continue

    # Last CORR_WINDOW rows per stock up to the evaluation date, reshaped to a
    # date x stock matrix of returns.
    returns_wide = (
        merged[merged['order_book_id'].isin(stock_list)
               & (merged['date'] <= date_corr)]
        .groupby('order_book_id')
        .tail(CORR_WINDOW)
        .pivot(index='date', columns='order_book_id', values='return')
    )

    # Drop self-correlations by position (the diagonal), not by value:
    # matching on "== 1" also removes genuine perfectly correlated pairs and
    # misses diagonal entries that are not exactly 1.0.
    corr_matrix = returns_wide.corr()
    off_diagonal = corr_matrix.to_numpy(dtype=float)[
        ~np.eye(len(corr_matrix), dtype=bool)
    ]
    off_diagonal = off_diagonal[~np.isnan(off_diagonal)]

    corr_dict[date_corr] = off_diagonal.mean() if off_diagonal.size else np.nan

corr_df = pd.DataFrame(list(corr_dict.items()), columns=['date', 'corr'])

corr_df.to_csv(f'all_corr_{index_short_code}.csv')

# Reload the file just written; parse_dates keeps `date` as datetimes so the
# reloaded frame matches the freshly computed one.
corr_df = pd.read_csv(f'all_corr_{index_short_code}.csv', index_col=0, parse_dates=['date'])


# `index_price` originates from a boolean-filtered frame; copying first makes
# sure the new columns are written to an independent frame (no
# SettingWithCopyWarning) and keeps this cell safe to re-run.
index_price = index_price.copy()
index_price['return'] = index_price['close'].pct_change()
# Cumulative growth factor of the index: running product of (1 + daily return).
# The column keeps its original spelling in case other cells reference it.
index_price['culmulative_return'] = (1 + index_price['return']).cumprod()

# One panel with two y-axes: correlation on the left, index growth on the right.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add trace for correlation
fig.add_trace(
    go.Scatter(x=corr_df['date'], y=corr_df['corr'], mode='lines', name='Correlation'),
    secondary_y=False,
)

# Add trace for cumulative return
fig.add_trace(
    go.Scatter(x=index_price['date'], y=index_price['culmulative_return'], mode='lines', name='Cumulative Return'),
    secondary_y=True,
)

# Update layout
fig.update_layout(
    title="Pairwise Correlation ",
    xaxis_title="Date",
    legend_title="Legend Title",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    autosize=False,
    width=1200,
    height=500,
)

# Set y-axes titles. The right axis shows a growth factor, not a percentage,
# so it is labelled accordingly.
fig.update_yaxes(title_text="Correlation", secondary_y=False)
fig.update_yaxes(title_text="Cumulative Return (growth of 1)", secondary_y=True)

fig.show()

# Save to png
fig.write_image("corr_cumreturn(20d).png")

# strategy 1: when correlation touches the top/bottom percentile, the index return is likely to reverse; buy those, short index futures
# strategy 2: for those sectors with less pairwise correlation, if some stocks follow the market, then the other stocks are likely to follow the reverse direction


# 2021-05 vs 2022-02: why is the pairwise correlation the same while the index return differs? When is the reversal effective?
# compare the 500, 1000 and 300 indices to see why they differ