In [None]:
## Production Stage (With New Feature Engineering)
Use our ensembled model to classify and label the dataset for the AI/ML model. Combined with ESS-grouping feature engineering, this production process provides robust signals to the optimization process.

import pandas as pd
import numpy as np
import string
import re
import pickle
import matplotlib.pyplot as plt
import ta

from sklearn.preprocessing import LabelEncoder
%matplotlib inline

# Widen the pandas display limits so large frames are shown in full in the notebook.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

%run import_library.ipynb

### Import RavenPack ID
Import SEDOL and RavenPack ID for the Chinese financial sector

# Load the SEDOL <-> RavenPack ID mapping and normalise its column names.
# standardize_col_names is not defined in this notebook (presumably loaded by
# import_library.ipynb).
rp_df = standardize_col_names(pd.read_excel('inputs/RavenPack_ID_Names.xlsx'))

# remove rows with no ravenpack id
has_rp_id = rp_df['rp_entity_id'].notnull()
rp_df = rp_df[has_rp_id].reset_index(drop=True)

# only keep HK Equity due to data constraints (not enough data points);
# per the original author's note this is meant to give a 1 to 1 match
# between rp_entity_id & sedol
is_hk_equity = rp_df['cdr_country'].str.contains(r'HK Equity')
rp_df = rp_df[is_hk_equity].reset_index(drop=True)
rp_df

# Load the pre-labelled RavenPack news dataset and keep the columns used downstream.
import datetime

# batch identifier in yymmdd form for today's run (not used further in this cell)
batch_id = datetime.date.today().strftime("%y%m%d")

news_columns = [
    'timestamp_tz',
    'cleaned_headline',
    'class',
    'entity_name',
    'rp_entity_id',
    'group',
    'event_sentiment_score',
]
df = pd.read_csv(f'outputs/china_news_sentiments.csv', encoding='latin-1')[news_columns]

# discard rows whose cleaned headline is missing, then renumber the rows
has_headline = df['cleaned_headline'].notna()
df = df[has_headline].reset_index(drop=True)
df

### Predict Historical Dataset
Machine learning model on NLP Documents

# FOR MACHINE LEARNING METHOD - Sentiment Analysis

# Treat empty headlines as missing and drop those rows.
# The replacement is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['cleaned_headline']`: that chained
# in-place call acts on a temporary Series, triggers a warning in pandas 2.x
# and does not modify `df` at all when Copy-on-Write is enabled.
df['cleaned_headline'] = df['cleaned_headline'].replace('', np.nan)
df = df.dropna(subset=['cleaned_headline']).reset_index(drop=True)

# Integer-encode the pre-labelled class column.
le = LabelEncoder()
df['class_label'] = le.fit_transform(df['class'])

# Vectorise Corpus
# tfidf_vect is not defined in this notebook (presumably a fitted vectoriser
# loaded by import_library.ipynb).
X_final_dtm = tfidf_vect.transform(df.cleaned_headline)

# Model Prediction
# stacked_model_svm is likewise expected to be loaded already; its predictions
# are stored per headline in `model_score`.
result = stacked_model_svm.predict(X_final_dtm)
df['model_score'] = result
df

## Feature Engineering for Event Group
- We will be creating specific ESS scores for certain important events based on the extreme values of the ESS.
- For instance, we will be assessing the top 10 and bottom 10 in terms of the average ESS scores over the entire 20 year duration.

# Add the event-group ESS features to the headline-level frame.
# event_feature_engineering is not defined in this notebook (presumably loaded
# by import_library.ipynb). NOTE(review): it presumably creates the
# mergers_/product_/earnings_/ratings_/regulatory_ score columns that the
# monthly aggregation below selects -- confirm against the helper.
df = event_feature_engineering(df)
df

### Productionalize the Companies
Process each firm by entity. Note: the data consists of 83 RavenPack IDs with 96 entity names, i.e., one RavenPack ID can have multiple entity names. By filtering for only Hong Kong-listed equities, we conduct sentiment analysis on these names to generalize to the financial markets and obtain a buy/hold decision for the sector.

# Unique RavenPack IDs of the HK-listed entities kept in rp_df
rp_entity_id_list = list(pd.unique(rp_df['rp_entity_id']))
len(rp_entity_id_list)

import copy
# conduct monthly aggregation of dataframe of china financial stocks for sentiment analysis

# Score columns carried into the monthly aggregation: the overall ESS / model
# scores plus one ESS / model pair per event group.
score_columns = [
    'event_sentiment_score', 'model_score',
    'mergers_event_sentiment_score', 'mergers_model_score',
    'product_event_sentiment_score', 'product_model_score',
    'earnings_event_sentiment_score', 'earnings_model_score',
    'ratings_event_sentiment_score', 'ratings_model_score',
    'regulatory_event_sentiment_score', 'regulatory_model_score',
]

df1 = df.copy()
ldf = []

for i in rp_entity_id_list:

    # segregating into individual companies; take an explicit copy so the
    # column assignment below acts on an independent frame instead of a slice
    # of df1 (avoids SettingWithCopyWarning / ambiguous writes)
    df = df1[df1['rp_entity_id'] == i].copy()
    df['timestamp_tz'] = pd.to_datetime(df['timestamp_tz'])  # parse timestamps

    # index the headlines by date, oldest first, keeping only the score columns
    df = df.rename(columns={'timestamp_tz': 'date'}).set_index('date')
    df = df.sort_index(ascending=True)
    df = df[score_columns]

    # monthly aggregation of the data points
    # (aggregate_period is not defined in this notebook)
    df = aggregate_period(df, 'M', 'mean')

    # conduct percentile ranking (event_percentile_rank is not defined here either)
    df = event_percentile_rank(df)
    df['rp_entity_id'] = i  # to reinstantiate id value
    ldf.append(df)

sentiment_df = pd.concat(ldf, axis=0)
sentiment_df.reset_index(inplace=True)

# Right join against the mapping: every (sedol, rp_entity_id) pair in rp_df is
# kept, so pairs without any sentiment rows come through with missing values.
sentiment_df = sentiment_df.merge(rp_df[['sedol', 'rp_entity_id']], on='rp_entity_id', how='right')
sentiment_df = sentiment_df.set_index('date')  # set the date
sentiment_df

## Import Price Data
Import price and volume for technical analysis

# Load daily price / volume / market-cap data and normalise the column names.
# NOTE(review): this is an absolute, user-specific path, unlike the relative
# inputs/ and outputs/ paths used elsewhere in the notebook.
price_df = standardize_col_names(
    pd.read_csv("C:/Users/tmp4lv/Documents/uCloud/px/px_vol_mktcap.csv", index_col=False)
)

# parse the dates and keep observations from 2000 onwards
price_df['date'] = pd.to_datetime(price_df['date'])
from_2000 = price_df['date'] >= "2000-01-01"
price_df = price_df[from_2000].reset_index(drop=True)

# cast sedol to strings to prevent a mismatch with rp_df
price_df['sedol'] = price_df['sedol'].astype(str)
price_df

Note: Some RavenPack IDs represent multiple SEDOLs, meaning one RP ID can map to multiple SEDOLs of the same company name.

# Attach the RavenPack ID to every price row; the right join keeps only SEDOLs
# that are present in rp_df.
sedol_map = rp_df[['sedol', 'rp_entity_id']]
price_df = price_df.merge(sedol_map, on='sedol', how='right')

# ignore data points that do not have a price, and rows with 0 volume
# (mostly due to it being chinese holiday hence lack of data)
has_price = price_df['px'].notnull()
has_volume = price_df['vol'] != 0
price_df = price_df[has_price & has_volume].set_index('date')  # date becomes the index
price_df

### Technical Analysis (Momentum Indicator)
Conducting potential technical analysis on pricing data.
1. Relative Strength Index (RSI)

### Relative Strength Index (RSI)
RSI is a technical indicator, and is intended to chart the current and historical strength or weakness of a stock or market based on the closing prices of a recent trading period. It compares the magnitude of recent gains and losses over a specified time period to measure speed and change of price movements of a security. In short, it generates overbought or oversold signals. Good for stable periods with minimal disruptions. Values above 70 == overbought or overvalued (slide below 70 means bearish). Values below 30 == oversold or undervalued (bullish signals). Feature Engineering: below 30 --> potential buy signal, above 70 --> potential sell signal

### Technical Analysis (Volume Indicator)
Conducting potential technical analysis on pricing data.
1. Ease of movement

### Ease of Movement
EVM is a volume based oscillator, indicating the ease with which the prices rise or fall taking into account the volume of the security. Example, price rise on low volume means prices advanced with relative ease, indicating little selling pressure. Positive EVM values imply that the market is moving higher with ease, while negative values indicate an easy decline. Purpose: used to confirm bullish or bearish trend. Increase in price with positive EVM confirms bullish trend, decrease in price with negative EVM confirms bearish trend.

### Technical Analysis (Volatility Indicator)
Conducting potential technical analysis on pricing data.
1. Bollinger Bands

### Bollinger Bands
Bollinger bands are often used to determine overbought and oversold conditions. Indicator focus on price and volatility (could be too biased). Rules: when the price breaks below the band, tend to bounce up, hence it is a buy strategy. when price breaks above the upper band, overbought and due for a pullback. Related to a mean reversion concept of price. FYI, the bands adapt to price expanding and contracting as volatility increases and decreases.

### Technical Analysis (Trend Indicator)
Conducting potential technical analysis on pricing data.
1. Average Directional Movement Index
2. Moving Average Convergence Divergence (MACD)

### Average Directional Movement Index (Trend Indicator)
ADX measures the strength of a trend. The higher the magnitude of ADX, the stronger the trend.

### Moving Average Convergence Divergence (MACD)
MACD shows the relationship between two exponential moving averages of a stock price. Comparing MACD line against signal line (ie, 9-day EMA). MACD Diff indicates that if the value is positive, it signals a bullish outlook and positive momentum. Else, negative indicates bearish outlook and negative momentum.

# Compute the technical indicators per SEDOL for look-back windows of 5, 14 and
# 50 observations, plus the MACD difference, then stack the per-SEDOL frames.
sedol_list = list(rp_df.sedol.unique())

ldf = []
for i in sedol_list:

    # segregating into individual companies; copy so the indicator columns are
    # added to an independent frame rather than to a slice of price_df
    df = price_df[(price_df['sedol'] == i)].copy()
    df = df.sort_index(ascending=True)  # sort according to the dates

    for j in [5, 14, 50]:

        # rsi - stored as a flag: 1 when RSI < 30, otherwise 0
        # (missing RSI values compare False and therefore become 0)
        df[f'rsi_{j}'] = (ta.momentum.rsi(df['px'], window=j) < 30).astype('int64')

        # ease of movement
        df[f'evm_{j}'] = ta.volume.ease_of_movement(df['px_high'], df['px_low'], df['vol'], window=j)

        # bollinger bands
        # bollinger high band - indicator shows if the band has been surpassed
        df[f'bol_hband_{j}'] = ta.volatility.bollinger_hband_indicator(df['px'], window=j)

        # bollinger low band - indicator shows if the band has been surpassed
        df[f'bol_lband_{j}'] = ta.volatility.bollinger_lband_indicator(df['px'], window=j)

        # bollinger band width - indicates volatility (falling --> lower volatility, increasing --> higher volatility)
        df[f'bol_wband_{j}'] = ta.volatility.bollinger_wband(df['px'], window=j)

        # buy signals generated from bollinger bands rules, 1 mean buy, 0 means hold
        df[f'bol_buy_{j}'] = (df[f'bol_lband_{j}'] > 0).astype('int64')

        # adx
        df[f'adx_{j}'] = ta.trend.adx(df['px_high'], df['px_low'], df['px'], window=j)

    # macd (library-default windows)
    df['macd_12_26'] = ta.trend.macd_diff(df['px'])

    ldf.append(df)

price_df = pd.concat(ldf, axis=0)
price_df

### Visualisations for Price Dataset

# visualise sample price dataset
import plotly.graph_objects as go

# SEDOL of the security to plot; defined once so the row filter and the chart
# title cannot drift apart
sample_sedol = "B154564"
temp_df = price_df[price_df['sedol'] == sample_sedol]

fig = go.Figure()
# NOTE(review): the trace is labelled "msci" although it plots a single SEDOL's
# price -- looks like a leftover label; confirm before changing.
fig.add_trace(go.Scatter(y=temp_df["px"], x=temp_df.index, name="msci"))
fig.update_layout(
    hovermode='x',
    title=f"Daily Time Series for {sample_sedol}",
    xaxis_title="Time Period",
    yaxis_title="Price",
    autosize=True,
)
fig.update_xaxes(rangeslider_visible=True, showgrid=True, gridwidth=1, gridcolor='#ECECEC', zeroline=True, zerolinecolor='lightgrey')
fig.update_yaxes(automargin=True)

### Calculate Periodic Returns + Combine with Technical Indicators
As an RP_ENTITY_ID can represent multiple SEDOLs, we focus on the SEDOL instead of the RP_ENTITY_ID when combining with the price and volume datasets. To calculate periodic returns:
- Compute returns from periodic prices (use the last daily price of each month)
- Use the mean of the daily trading volume

# conduct monthly aggregation of dataframe of china financial stocks for returns and volume averaging
# join with technical indicators as well
# (aggregate_period is not defined in this notebook)

ldf = []
for i in sedol_list:

    # segregating into individual companies
    price_df_filtered = price_df[(price_df['sedol'] == i)]

    if price_df_filtered.empty:
        print(f"=== Sedol {i} is empty ====")

    df = price_df_filtered.copy().sort_index(ascending=True)

    # sentiment rows of this sedol; the sedol column is dropped so the merge
    # below does not duplicate it. drop() returns a new frame, so the filtered
    # slice of sentiment_df is never mutated in place.
    sentiment_df_filtered = sentiment_df[(sentiment_df['sedol'] == i)].drop(columns=['sedol'])

    if sentiment_df_filtered.empty:
        print(f"=== Sedol {i} is empty for sentiment df ====")

    # period return from the last price of each period, plus the next period's
    # return as `returns_lead_1`
    df_month_returns = aggregate_period(df[['px']], 'M', 'last')
    df_month_returns['returns'] = df_month_returns['px']/df_month_returns['px'].shift(1) - 1
    df_month_returns.dropna(subset=['returns'], inplace=True)
    df_month_returns['returns_lead_1'] = df_month_returns['returns'].shift(-1)

    df_month_returns['sedol'] = i # assign sedol, to allow the merge function to work even without temp_sentiment
    df_month_vol = aggregate_period(df[['vol']], 'M', 'mean')

    # left joins: the returns frame is the base, so volume and sentiment are
    # aligned to the price dates instead of using sentiment as a base case
    df = df_month_returns.merge(df_month_vol, left_index=True, right_index=True, how='left').merge(sentiment_df_filtered, left_index=True, right_index=True, how='left')

    # imputation - mainly for new data points (not sure if its good)
    # NOTE(review): this also forward-fills the trailing NaN of
    # `returns_lead_1` and back-fills leading gaps -- confirm that is intended.
    df = df.ffill(axis=0).bfill(axis=0)

    # for technical indicators (ie, price_df)
    ldf_2 = []

    # `window` (not `i`) so the outer sedol loop variable is not overwritten
    for window in [5, 14, 50]:

        # sum for the indicator columns
        # NOTE(review): evm_* is summed here together with the 0/1 flags.
        df_sum = aggregate_period(price_df_filtered[[f'rsi_{window}', f'evm_{window}', f'bol_buy_{window}']], 'M', 'sum')

        # mean for continuous variables
        df_mean = aggregate_period(price_df_filtered[[f'bol_wband_{window}', f'adx_{window}']], 'M', 'mean')

        ldf_2.append(df_sum)
        ldf_2.append(df_mean)

    # mean for macd variable
    df_macd_mean = aggregate_period(price_df_filtered[['macd_12_26']], 'M', 'mean')
    ldf_2.append(df_macd_mean)

    # side-by-side frame of all aggregated indicators for this sedol
    tech_df = pd.concat(ldf_2, axis=1)
    df = df.merge(tech_df, left_index=True, right_index=True, how='left')

    ldf.append(df)

china_df = pd.concat(ldf, axis=0)
china_df

# check for any nan values: show every row with at least one missing value, ordered by date
# (`axis` is passed by keyword; pandas >= 2.0 rejects it as a positional argument)
china_df[china_df.isnull().any(axis=1)].sort_values('date')

# Apply macd_feature_engineering (not defined in this notebook) to china_df once
# per sedol and stack the resulting frames back into china_df.
ldf = [macd_feature_engineering(china_df, sedol) for sedol in sedol_list]

china_df = pd.concat(ldf, axis=0)
china_df

### Combine Datasets With News Volume Spikes

# Load the news volume spikes and index them by date.
df_news_vol_spikes = pd.read_csv("outputs/china_news_volume_spikes.csv", encoding='latin-1')

# convert to datetime objects, then use the dates as the index
df_news_vol_spikes['date'] = pd.to_datetime(df_news_vol_spikes['date'])
df_news_vol_spikes = df_news_vol_spikes.set_index('date')
df_news_vol_spikes

# left join with existing dataframe, one RavenPack entity at a time

ldf = []
for i in rp_entity_id_list:

    # news volume spikes of this entity, without the id column (already in china_df)
    entity_spikes = df_news_vol_spikes[df_news_vol_spikes['rp_entity_id'] == i]
    entity_spikes = entity_spikes.drop(columns=['rp_entity_id'])

    # rows of the combined dataset belonging to this entity
    entity_rows = china_df[china_df['rp_entity_id'] == i]

    # align on the date index, keeping every row of the combined dataset
    ldf.append(entity_rows.merge(entity_spikes, how='left', left_index=True, right_index=True))

final_df = pd.concat(ldf, axis=0)
final_df

### Macroeconomics Variables
Use China Data

macro_df = pd.read_csv('inputs/country_macro.csv')


def _prepare_macro(macro_df, country, column_names):
    """Build the macro frame of one country, indexed by month-end date.

    Args:
        macro_df: frame with an ``MSCI_COUNTRY`` column, a ``Periods`` column
            and the source columns named in ``column_names``.
        country: value of ``MSCI_COUNTRY`` to keep.
        column_names: mapping of source column -> output column; it must map
            ``'Periods'`` to ``'date'``. Only these columns are kept, in the
            mapping's order.

    Returns:
        The country's rows sorted by period, forward-filled, renamed, and
        indexed by ``date`` rolled forward to the month end.
    """
    # local import keeps the cell self-contained
    from pandas.tseries.offsets import MonthEnd

    # explicit copy: the column assignment below must not write into a slice of macro_df
    country_df = macro_df[macro_df['MSCI_COUNTRY'] == country].copy()

    # formatting: convert to datetime objects and order chronologically
    country_df['Periods'] = pd.to_datetime(country_df['Periods'])
    country_df = country_df.sort_values(by=['Periods'], ascending=True).reset_index(drop=True)

    country_df = country_df[list(column_names)].rename(columns=column_names)
    country_df = country_df.ffill().set_index('date')

    # roll each date forward to its month end (dates already on a month end are unchanged)
    country_df.index = country_df.index + MonthEnd(0)
    return country_df


# china macro dataframe (merged into final_df below); unemployment removed due to no data
cn_df = _prepare_macro(
    macro_df,
    "China",
    {"Periods": "date", 'GDP_GROWTH': "gdp_growth_cn", 'CPI_GROWTH': 'cpi_growth_cn'},
)

# us macro dataframe
us_df = _prepare_macro(
    macro_df,
    "United States",
    {
        "Periods": "date",
        'GDP_GROWTH': "gdp_growth_us",
        'CPI_GROWTH': 'cpi_growth_us',
        "UNEMPLOYMENT_RATE": 'unemployment_rate_us',
    },
)

# Merge with CN Market
# Both merges use the default inner join on the date index, so only dates
# present in final_df and in both macro frames are kept.
final_df = final_df.merge(cn_df, left_index=True, right_index=True)
final_df = final_df.merge(us_df, left_index=True, right_index=True)

# Count rows containing at least one null before and after dropping the rows
# without China CPI data. `axis` is passed by keyword because pandas >= 2.0
# rejects it as a positional argument.
null_rows_before = int(final_df.isnull().any(axis=1).sum())
print(f"=== Before: Checking for Null Values: {null_rows_before} qty of null values ===")
final_df.dropna(subset=['cpi_growth_cn'], inplace=True) # drop nan values from china cpi data
null_rows_after = int(final_df.isnull().any(axis=1).sum())
print(f"=== After: Checking for Null Values: {null_rows_after} qty of null values ===")
final_df

### Save to pickle dataframes files

# # save model to pickle
# import pickle

# # save the model to disk
# filename = 'outputs/finalised_df/df_cn_month.pickle'
# dbfile =  open(filename, 'wb')
# pickle.dump(final_df, dbfile)
# dbfile.close()

# End of Production Codes