# Import & Data Exploration 


In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize

data = pd.read_csv('data.csv')

data_info = data.info()
first_few_rows = data.head()

data_info, first_few_rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451113 entries, 0 to 451112
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ticker  451113 non-null  object 
 1   date    451113 non-null  object 
 2   last    451113 non-null  float64
 3   volume  451113 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 13.8+ MB


(None,
     ticker        date      last   volume
 0  1332 JT  2013-01-04  169.0987  1464100
 1  1332 JT  2013-01-07  166.3266  1783500
 2  1332 JT  2013-01-08  166.3266  1759800
 3  1332 JT  2013-01-09  165.4026   767800
 4  1332 JT  2013-01-10  167.2507  1503100)

In [31]:
print(f'Number of asset : {data["ticker"].nunique()}')
print(f'Number of dates : {data["date"].nunique()}')

Number of asset : 248
Number of dates : 2005


In [32]:
data["date"].unique()

array(['2013-01-04', '2013-01-07', '2013-01-08', ..., '2021-03-17',
       '2021-03-18', '2021-03-19'], dtype=object)

We have daily data from 2013-01-04 to 2021-03-19 of 248 assets, we will first analyse the shape of our data.

In [None]:
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by='date')

missing_values = data.isnull().sum()

unique_tickers = data['ticker'].unique()




for ticker in unique_tickers:
    fig, axes = plt.subplots(1,2, figsize=(15, 5))
    asset_data = data[data['ticker'] == ticker]
    
    axes[0].plot(asset_data['date'], asset_data['last'])
    axes[0].set_title(f'{ticker} - Price')
    axes[0].set_xlabel('Date')
    axes[0].set_ylabel('Price')

    axes[1].plot(asset_data['date'], asset_data['volume'])
    axes[1].set_title(f'{ticker} - Volume')
    axes[1].set_xlabel('Date')
    axes[1].set_ylabel('Volume')

    plt.tight_layout()
    plt.show()


In [None]:
for ticker in unique_tickers:
    
    fig, ax = plt.subplots(1,1, figsize=(15, 5))
    asset_data = data[data['ticker'] == ticker]

    returns = asset_data['last'].pct_change().dropna()
    sns.histplot(returns, ax=ax, kde=True)
    ax.set_title(f'{ticker} - Returns Distribution')
    ax.set_xlabel('Returns')

    plt.tight_layout()
    plt.show()

In [35]:
missing_values

ticker    0
date      0
last      0
volume    0
dtype: int64

From the plots, we can observe the following:

The price and volume data vary significantly across different assets but there should be correlation.
Our different assets have different lengths of data, we don't have from 2013 to 2021 for all assets.

The distribution of returns is approximately centered around zero for the large majority.

# Correlation Analysis

In [None]:
price_data = data.pivot(index='date', columns='ticker', values='last')
price_correlation = price_data.corr()

volume_data = data.pivot(index='date', columns='ticker', values='volume')
volume_correlation = volume_data.corr()


plt.figure(figsize=(12, 10))
sns.heatmap(price_correlation, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Price Correlation Matrix')
plt.show()

plt.figure(figsize=(12, 10))
sns.heatmap(volume_correlation, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Volume Correlation Matrix')
plt.show()

The prices let us suggest that there are clusters of different assets that behave the same way. The volume data give us the information that there are some assets that trade at high volumes at the same time but it is not clear.

# The Strategies

We will try  to present two type of work. The first one is a long-term strategy using asset allocation and the second one is a daily trading strategy that use both price and volume information. We will develop simple strategies that can be improved in many ways and also be shaped using our own risk aversion, investment horizon and the quantity of money we want to invest.

## Long-term allocation strategy 

 We will use historical price data to allocate weights to each asset in the portfolio. The goal is to maximize the portfolio's return while minimizing its risk. : mean-variance optimization approach. We do the Markovitz Portafolio optimisation. Other methods such as **'Equally-weighted
risk contributions portfolios'** can be used by changing the 'objective' function.

In [None]:
returns = data.pivot(index='date', columns='ticker', values='last').pct_change()

avg_returns = returns.mean()
cov_matrix = returns.cov()

def objective(weights, avg_returns, cov_matrix, risk_aversion):
    portfolio_return = np.dot(weights, avg_returns)
    portfolio_volatility = np.sqrt(np.dot(weights.T, np.dot(cov_matrix, weights)))
    return -portfolio_return + risk_aversion * portfolio_volatility

num_assets = len(unique_tickers)
initial_weights = np.ones(num_assets) / num_assets # Equally weighted portfolio

constraints = {'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1}

bounds = tuple((0, 1) for asset in range(num_assets))

risk_aversion = 1 # We take 1 as a first test.
optimal_weights = minimize(objective, initial_weights, args=(avg_returns, cov_matrix, risk_aversion),
                           method='SLSQP', bounds=bounds, constraints=constraints)['x']

optimal_return = np.dot(optimal_weights, avg_returns)
optimal_volatility = np.sqrt(np.dot(optimal_weights.T, np.dot(cov_matrix, optimal_weights)))

portfolio_values = (returns * optimal_weights).sum(axis=1).cumsum()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(portfolio_values.index, portfolio_values)
ax.set_title('Portfolio Values Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Portfolio Value')

plt.tight_layout()
plt.show()

optimal_weights, optimal_return, optimal_volatility


One potential improvement could be to include a risk-free rate in the optimization to calculate the Sharpe ratio and do a out-of-sample test to see if the strategy is robust using a risk avertion parameter tunning.

## Trading on the short-term : the use of signals 

### Price signals

We are going to plot some signals of our data to have a better idea. We will use the following signals:
1. Moving Average: 
   $ \text{SMA}(n) = \frac{\sum_{i=1}^n \text{Price}_i}{n}$
2. Relative Strength Index (RSI): 
   $ \text{RSI} = 100 - \left( \frac{100}{1 + \text{RS}} \right) $
   where $ \text{RS} = \frac{\text{Average Gain}}{\text{Average Loss}} $
3. Exponential Moving Average : 
   $\text{EMA}(n) = \alpha \cdot \text{Price}_{n} + (1 - \alpha) \cdot \text{EMA}_{n-1}$
   where $\alpha = \frac{2}{n + 1}$ is the smoothing factor and $n$ is the length of the EMA.
4. Moving Average Convergence Divergence (MACD): 
   $ \text{MACD} = \text{EMA}_{\text{fast}} - \text{EMA}_{\text{slow}} $
   and the signal line is a moving average of the MACD.

In [None]:


data['SMA_50'] = data.groupby('ticker')['last'].transform(lambda x: x.rolling(window=50).mean())
data['SMA_200'] = data.groupby('ticker')['last'].transform(lambda x: x.rolling(window=200).mean())

delta = data.groupby('ticker')['last'].transform(lambda x: x.diff())
gain = (delta.where(delta > 0, 0)).groupby(data['ticker']).transform(lambda x: x.rolling(window=15).mean())
loss = (-delta.where(delta < 0, 0)).groupby(data['ticker']).transform(lambda x: x.rolling(window=15).mean())
rs = gain / loss
data['RSI_15'] = 100 - (100 / (1 + rs))

# Calculate MACD
data['EMA_12'] = data.groupby('ticker')['last'].transform(lambda x: x.ewm(span=12, adjust=False).mean())
data['EMA_26'] = data.groupby('ticker')['last'].transform(lambda x: x.ewm(span=26, adjust=False).mean())
data['MACD'] = data['EMA_12'] - data['EMA_26']
data['MACD_signal'] = data.groupby('ticker')['MACD'].transform(lambda x: x.ewm(span=9, adjust=False).mean())


for sample_ticker in unique_tickers:
    sample_data = data[data['ticker'] == sample_ticker]

    fig, axes = plt.subplots(4, 1, figsize=(10, 15), sharex=True)

    axes[0].plot(sample_data['date'], sample_data['last'], label='Price')
    axes[0].plot(sample_data['date'], sample_data['SMA_50'], label='SMA 50')
    axes[0].plot(sample_data['date'], sample_data['SMA_200'], label='SMA 200')
    axes[0].set_title(f'{sample_ticker} - Price and Moving Averages')
    axes[0].legend()

    axes[1].plot(sample_data['date'], sample_data['EMA_12'], label='EMA 12')
    axes[1].plot(sample_data['date'], sample_data['EMA_26'], label='EMA 26')
    axes[1].set_title(f'{sample_ticker} - Exponential Moving Averages')
    axes[1].legend()
    
    axes[2].plot(sample_data['date'], sample_data['RSI_15'], label='RSI 15')
    axes[2].axhline(70, color='r', linestyle='--')
    axes[2].axhline(30, color='g', linestyle='--')
    axes[2].set_title(f'{sample_ticker} - RSI')
    axes[2].legend()

    axes[3].plot(sample_data['date'], sample_data['MACD'], label='MACD')
    axes[3].plot(sample_data['date'], sample_data['MACD_signal'], label='Signal Line')
    axes[3].set_title(f'{sample_ticker} - MACD')
    axes[3].legend()

    plt.tight_layout()
    plt.show()


### Signals with the volume


1. **Volume**:

2. **On-Balance Volume (OBV)**:
   $
   \text{OBV} = \text{OBV}_{\text{previous}} + \begin{cases}
     \text{Volume} & \text{if the price has increased} \\
     -\text{Volume} & \text{if the price has decreased} \\
     0 & \text{if the price is constant}
   \end{cases}
   $

3. **Weighted On-Balance Volume (WOBV)**:
   $
   \text{WOBV} = \text{WOBV}_{\text{previous}} + \text{Volume} \times \Delta\text{Price}
   $

4. **Percentage Change On-Balance Volume (PCOBV)**:
   $
   \text{PCOBV} = \text{PCOBV}_{\text{previous}} + \text{Volume} \times \frac{\Delta\text{Price}}{\text{Previous Price}}
   $

In [None]:
data['OBV'] = data.groupby('ticker')['volume'].transform(lambda x: (x * np.sign(x.diff())).cumsum())

data['Delta_Price'] = data.groupby('ticker')['last'].transform(lambda x: x.diff())
data['WOBV'] = data.groupby('ticker')['volume'].transform(lambda x: (x * data['Delta_Price']).cumsum())

data['Pct_Change_Price'] = data.groupby('ticker')['last'].transform(lambda x: x.pct_change())
data['PCOBV'] = data.groupby('ticker')['volume'].transform(lambda x: (x * data['Pct_Change_Price']).cumsum())

for sample_ticker in unique_tickers:
    sample_data = data[data['ticker'] == sample_ticker]

    fig, axes = plt.subplots(4, 1, figsize=(10, 20), sharex=True)

    axes[0].bar(sample_data['date'], sample_data['volume'], label='Volume')
    axes[0].set_title(f'{sample_ticker} - Volume')
    axes[0].legend()

    axes[1].plot(sample_data['date'], sample_data['OBV'], label='OBV')
    axes[1].set_title(f'{sample_ticker} - On-Balance Volume')
    axes[1].legend()

    axes[2].plot(sample_data['date'], sample_data['WOBV'], label='WOBV')
    axes[2].set_title(f'{sample_ticker} - Weighted On-Balance Volume')
    axes[2].legend()

    axes[3].plot(sample_data['date'], sample_data['PCOBV'], label='PCOBV')
    axes[3].set_title(f'{sample_ticker} - Percentage Change On-Balance Volume')
    axes[3].legend()

    plt.tight_layout()
    plt.show()

Too long to compute but we have an idea...


## Trend Confirmation Strategy 

Sure, here is the explanation in English for the "Trend Confirmation Strategy" using price and volume signals:

1. **Buy Signal:**
   - Short-term moving average (SMA_50) crosses above long-term moving average (SMA_200).
   - RSI is below 30 - classic oversold condition.
   - MACD is above the signal line.
   - Both OBV and WOBV are increasing and confirm the price trend.

2. **Sell Signal:**
   - Short-term moving average (SMA_50) crosses below long-term moving average (SMA_200).
   - RSI is above 70 - classic overbought condition.
   - MACD is below the signal line.
   - Both OBV and WOBV are decreasing and confirm the price trend.

This strategy combines the signals from both price and volume to make trading decisions. The buy and sell signals are generated based on the moving averages, RSI, and MACD for price, and OBV and WOBV for volume. The strategy is then backtested to calculate cumulative returns, and performance metrics such as total return, Sharpe ratio, and maximum drawdown are calculated to evaluate the strategy's performance.

In [None]:
data['Signal'] = 0
data['Position'] = 0

# Buy signals
data.loc[(data['SMA_50'] > data['SMA_200']) & (data['RSI_15'] < 30) & (data['MACD'] > data['MACD_signal']) & (data['OBV'] > data['OBV'].shift(1)) & (data['WOBV'] > data['WOBV'].shift(1)), 'Signal'] = 1

# Sell signals
data.loc[(data['SMA_50'] < data['SMA_200']) & (data['RSI_15'] > 70) & (data['MACD'] < data['MACD_signal']) & (data['OBV'] < data['OBV'].shift(1)) & (data['WOBV'] < data['WOBV'].shift(1)), 'Signal'] = -1

data['Position'] = data['Signal'].replace(to_replace=0, method='ffill')
data['Daily_Return'] = data.groupby('ticker')['last'].pct_change()
data['Strategy_Return'] = data['Position'].shift(1) * data['Daily_Return']

# Backtest the strategy
strategy_returns = data.groupby('ticker')['Strategy_Return'].cumsum()

for sample_ticker in unique_tickers:
    sample_data = data[data['ticker'] == sample_ticker]

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(sample_data['date'], sample_data['Strategy_Return'].cumsum())
    ax.set_title(f'{sample_ticker} - Strategy Returns')
    ax.set_xlabel('Date')
    ax.set_ylabel('Cumulative Returns')

    plt.tight_layout()
    plt.show()

total_return = data.groupby('ticker')['Strategy_Return'].sum()

sharpe_ratio = data.groupby('ticker')['Strategy_Return'].mean() / data.groupby('ticker')['Strategy_Return'].std() * np.sqrt(252)

drawdown = data.groupby('ticker').apply(lambda x: (x['Strategy_Return'].cumsum() - x['Strategy_Return'].cumsum().cummax()).min())

performance_metrics = pd.DataFrame({
    'Total Return': total_return,
    'Sharpe Ratio': sharpe_ratio,
    'Max Drawdown': drawdown
})

performance_metrics

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(10, 15))

axes[0].hist(performance_metrics['Total Return'], bins=30, color='skyblue', edgecolor='black')
axes[0].set_title('Distribution of Total Returns')
axes[0].set_xlabel('Total Return')
axes[0].set_ylabel('Frequency')

axes[1].hist(performance_metrics['Sharpe Ratio'], bins=30, color='lightgreen', edgecolor='black')
axes[1].set_title('Distribution of Sharpe Ratios')
axes[1].set_xlabel('Sharpe Ratio')
axes[1].set_ylabel('Frequency')

axes[2].hist(performance_metrics['Max Drawdown'], bins=30, color='salmon', edgecolor='black')
axes[2].set_title('Distribution of Maximum Drawdowns')
axes[2].set_xlabel('Max Drawdown')
axes[2].set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [56]:
total_return.sum()

83.7716331178849

In [53]:
# Identify best and worst performing tickers
best_total_return = performance_metrics['Total Return'].idxmax()
worst_total_return = performance_metrics['Total Return'].idxmin()

best_sharpe_ratio = performance_metrics['Sharpe Ratio'].idxmax()
worst_sharpe_ratio = performance_metrics['Sharpe Ratio'].idxmin()

best_max_drawdown = performance_metrics['Max Drawdown'].idxmax()
worst_max_drawdown = performance_metrics['Max Drawdown'].idxmin()

best_worst_performance = pd.DataFrame({
    'Best Total Return': [best_total_return, performance_metrics.loc[best_total_return, 'Total Return']],
    'Worst Total Return': [worst_total_return, performance_metrics.loc[worst_total_return, 'Total Return']],
    'Best Sharpe Ratio': [best_sharpe_ratio, performance_metrics.loc[best_sharpe_ratio, 'Sharpe Ratio']],
    'Worst Sharpe Ratio': [worst_sharpe_ratio, performance_metrics.loc[worst_sharpe_ratio, 'Sharpe Ratio']],
    'Best Max Drawdown': [best_max_drawdown, performance_metrics.loc[best_max_drawdown, 'Max Drawdown']],
    'Worst Max Drawdown': [worst_max_drawdown, performance_metrics.loc[worst_max_drawdown, 'Max Drawdown']],
}, index=['Ticker', 'Value'])

best_worst_performance


Unnamed: 0,Best Total Return,Worst Total Return,Best Sharpe Ratio,Worst Sharpe Ratio,Best Max Drawdown,Worst Max Drawdown
Ticker,7951 JT,9766 JT,5019 JT,9434 JT,3864 JT,4506 JT
Value,2.757156,-0.991722,1.426093,-1.954547,0.0,-1.628885


The main drawback of our strategy is that we never use correlation, clustering, or lead-lag effect on our different assets. 

We also never parametrise our strategy using risk aversion, investment horizon and the quantity of money we want to invest but we have a solid base for a trading strategy.

The next step would be first to parametrize each of our univariate signals, find the best parametrisation using out of sample backtesting and than add a new term for each of our asset that is linked to the other to take into account the correlation between assets.
Further improvements could include the use of machine learning algorithms or other advanced techniques to optimize the parameters and improve the strategy's performance.