In [None]:
import pandas as pd
import yfinance as yf
import datetime
import time
import requests
import io
import random
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from math import sqrt
import  pylab as pl
import numpy as np
from datetime import timedelta
from datetime import datetime
from dateutil.relativedelta import relativedelta
from pypfopt import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns

In [None]:
def cluster_returns(x, returns=None, clusters=None):
    """Split a return matrix into one sub-frame per cluster.

    Parameters
    ----------
    x : pandas.DataFrame
        Cluster assignment table with an "Asset" column (asset names matching
        the columns of the return matrix) and a "Cluster" column (label).
    returns : pandas.DataFrame, optional
        Return matrix with one column per asset. Defaults to the notebook
        global ``ret_test``.
    clusters : iterable, optional
        Cluster labels to extract, in the order the dictionary is built.
        Defaults to the sorted distinct labels found in ``x["Cluster"]``.

    Returns
    -------
    dict
        Maps each cluster label to the columns of ``returns`` belonging to it.

    Raises
    ------
    KeyError
        If an asset listed in ``x`` has no column in ``returns``.
    """
    if returns is None:
        returns = ret_test
    if clusters is None:
        clusters = sorted(x["Cluster"].unique())

    ret_dict = {}
    for label in clusters:
        # Select by asset name rather than by a positional boolean mask: the
        # mask only works when the return columns and the rows of `x` have
        # the same length and order, which breaks once an asset was dropped
        # from one of them.
        members = x.loc[x["Cluster"] == label, "Asset"].tolist()
        ret_dict[label] = returns.loc[:, members]

    return ret_dict

In [None]:
def equal_weights(x):
    """Compute the equally weighted portfolio return of every cluster.

    Parameters
    ----------
    x : dict
        Maps a cluster label to a DataFrame of member returns (rows are
        dates, columns are assets), i.e. the output of ``cluster_returns``.

    Returns
    -------
    pandas.DataFrame
        One row per date and one column per cluster. Columns are numbered
        0..k-1 following the iteration order of ``x``; the original labels
        are not kept. An empty dictionary yields an empty DataFrame.
    """
    if not x:
        # pd.concat refuses an empty list; keep the "empty in, empty out" contract
        return pd.DataFrame()

    # the row-wise mean over the member assets is the equal-weight portfolio return
    cluster_means = [members.mean(axis=1) for members in x.values()]

    # build the frame once (DataFrame.append no longer exists in pandas 2.x)
    ret_ew = pd.concat(cluster_means, axis=1, ignore_index=True)

    return ret_ew

In [None]:
def rolling_portfolio(x, window=252):
    """Estimate portfolio weights on rolling windows of cluster returns.

    For every window of ``window`` consecutive rows the weights are obtained
    from ``static_portfolio`` (maximum Sharpe ratio) and stored under the
    last date of that window.

    Parameters
    ----------
    x : pandas.DataFrame
        Cluster returns, one column per cluster (output of ``equal_weights``).
    window : int, optional
        Number of rows in each estimation window (default 252).

    Returns
    -------
    pandas.DataFrame
        One row of weights per window, indexed by the window's last date,
        with columns numbered 0..k-1 in the column order of ``x``. Empty when
        ``x`` has no more than ``window + 1`` rows.
    """
    weight_rows = []
    window_ends = []

    # NOTE(review): this bound is kept from the original code; it stops two
    # windows short of the last full window (len(x) - window + 1 would cover
    # all of them). Confirm whether that is intended.
    for first in range(len(x) - window - 1):
        rets_rolling = x.iloc[first:first + window, :]

        # same estimator as the static case, applied to this window only
        window_weights = static_portfolio(rets_rolling)

        weight_rows.append(list(window_weights.values()))
        window_ends.append(rets_rolling.index[-1])

    # build the frame once instead of appending row by row
    # (DataFrame.append was removed in pandas 2.0)
    pesi_df = pd.DataFrame(weight_rows, index=pd.Index(window_ends))

    return pesi_df

In [None]:
def static_portfolio(x):
    """Run a single (static) maximum-Sharpe optimization on cluster returns.

    Parameters
    ----------
    x : pandas.DataFrame
        Cluster returns, one column per cluster (output of ``equal_weights``).

    Returns
    -------
    The cleaned weights produced by ``EfficientFrontier.clean_weights()``,
    a mapping with one entry per column of ``x``.
    """
    # the estimators work on prices, so rebuild a price path from the returns
    prices = expected_returns.prices_from_returns(x)

    frontier = EfficientFrontier(
        expected_returns.mean_historical_return(prices),
        risk_models.sample_cov(prices),
    )

    # solve for the maximum Sharpe ratio portfolio; the cleaned weights are
    # read back from the optimizer afterwards
    frontier.max_sharpe()

    return frontier.clean_weights()

## Download data
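Historical returns for about 200 randomly chosen Nasdaq-listed stocks: 500 symbols are sampled at random, and only those with a complete price history over the sample period are kept.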

In [None]:
# first and last date passed to the price download
start, end = datetime(2010, 1, 1), datetime(2020, 12, 1)

In [None]:
# download symbols of all nasdaq components
url="https://pkgstore.datahub.io/core/nasdaq-listings/nasdaq-listed_csv/data/7665719fb51081ba0bd834fde71ce822/nasdaq-listed_csv.csv"

# a timeout keeps the cell from hanging forever on a dead connection
response = requests.get(url, timeout=30)
# stop on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()

s = response.content
companies = pd.read_csv(io.StringIO(s.decode('utf-8')))
symbols = companies['Symbol'].tolist()

In [None]:
# draw a reproducible random subset of the listed symbols
SAMPLE_SEED = 123
SAMPLE_SIZE = 500

random.seed(SAMPLE_SEED)
Symbols = random.sample(symbols, k=SAMPLE_SIZE)

In [None]:
# download stock prices
price_frames = []    # one price frame per symbol that returned data
failed_symbols = []  # symbols whose download raised an exception

for symbol in Symbols:
    try:
        stock = yf.download(symbol, start=start, end=end, progress=False)
    except Exception:
        # best effort: one failing ticker must not abort the whole download,
        # but keep track of it instead of dropping it silently
        failed_symbols.append(symbol)
        continue

    if len(stock) == 0:
        # nothing returned for this symbol
        continue

    price_frames.append(stock.assign(Name=symbol))

# concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration
stock_final = pd.concat(price_frames, sort=False) if price_frames else pd.DataFrame()

if failed_symbols:
    print("{} of {} downloads failed".format(len(failed_symbols), len(Symbols)))

In [None]:
# wide table of closing prices: one row per date, one column per ticker
close = stock_final.loc[:, ["Close", "Name"]]
close_wide = pd.pivot_table(close, index="Date", columns="Name", values="Close")

# keep only the tickers that have a price on every date
stock_price = close_wide.dropna(axis="columns")

In [None]:
# number of tickers left after dropping incomplete price histories
n_stocks = stock_price.shape[1]

In [None]:
# simple daily returns; the first row has no previous price and is dropped
pct_changes = stock_price.pct_change()
returns = pct_changes.iloc[1:]
returns.head()

In [None]:
# load the stored return matrix; this replaces any `returns` computed above
# NOTE(review): unpickling can execute arbitrary code - only load a file you produced yourself
RETURNS_PICKLE = 'returns.pkl'
returns = pd.read_pickle(RETURNS_PICKLE)

In [None]:
# import 3 month tbill
tbills = pd.read_pickle('DTB3.pkl')


def _to_daily_rate(raw):
    """Turn one DTB3 entry into a daily rate compounded over 252 days.

    Entries that cannot be parsed as a number become 0.0.
    """
    try:
        return (1 + float(raw))**(1/252) - 1
    except ValueError:
        return 0.0


# despite the "annualized" name, this is the daily equivalent of the quoted rate
# NOTE(review): the formula treats DTB3 as a decimal fraction; if the series is
# quoted in percent it has to be divided by 100 first - TODO confirm
annualized = [_to_daily_rate(raw) for raw in tbills['DTB3']]
tbills['DTB3A'] = annualized

# align the bill series with the stock returns on their common dates
tbills.index = pd.to_datetime(tbills.index).rename('Date')
merged = tbills.merge(returns, on='Date')
tbill_ann = merged['DTB3A'].to_frame()

In [None]:
# training and test set
cutoff = "2019-12-31"
cutoff_date = datetime.strptime(cutoff, '%Y-%m-%d')

# the test frame starts one year before the cutoff, so it overlaps the last
# year of the training frame
test_start = cutoff_date - relativedelta(years=1)

ret_train = returns.loc[returns.index <= cutoff]
ret_test = returns.loc[returns.index > test_start]

## K-Means

The k-means algorithm divides a set of $N$ samples $X$ into $K$ disjoint clusters $C$, each described by the mean $\mu_j$
of the samples in the cluster (the centroid). The algorithm aims to choose the centroids that minimise the within-cluster sum of squares:
$$
\sum_{i=1}^{N} \min_{\mu_j \in C} \lVert x_i - \mu_j \rVert^2
$$

In [None]:
# annualized mean return and annualized standard deviation of every asset
# (the second column is labelled "Variance" but holds std * sqrt(252))
mean_ret = 252 * ret_train.mean()
var_ret = sqrt(252) * ret_train.std()
rets_df = pd.DataFrame({"Returns": mean_ret, "Variance": var_ret})

In [None]:
# select the number of clusters from the elbow of the SSE curve
X = rets_df.values
k_values = range(2, 15)

random.seed(123)
# the stdlib seed above does not reach scikit-learn; the estimator needs its
# own random_state for the curve to be reproducible
sse = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=123)
    kmeans.fit(X)

    sse.append(kmeans.inertia_)  # SSE of the fit with k clusters

pl.plot(k_values, sse)
pl.title("Elbow Curve")
pl.xlabel('nr clusters')
pl.ylabel('SSE')
# mark the number of clusters used in the rest of the notebook
pl.axvline(x=5, c="k", linestyle='dashed')
pl.show()

In [None]:
# fit k-means with 5 clusters
X = rets_df.values
n_clusters = 5

# a fixed random_state makes the cluster labels reproducible across runs
kmeans = KMeans(n_clusters=n_clusters, random_state=123).fit(X)
centroids = kmeans.cluster_centers_

# assets in the (return, risk) plane, coloured by cluster label
pl.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap="rainbow")
pl.show()

In [None]:
# remove outlier (rerun previous chunk afterwards)
outlier = mean_ret.idxmax()

# Rebind instead of dropping in place and ignore a label that is already
# gone, so the cell can be re-run safely. Keyword arguments are used because
# a positional axis is not accepted by DataFrame.drop in pandas 2.x.
rets_df = rets_df.drop(index=outlier, errors="ignore")
returns = returns.drop(columns=outlier, errors="ignore")

# the train/test frames were sliced from `returns` earlier and are separate
# objects; drop the asset there too so their columns keep matching rets_df
ret_train = ret_train.drop(columns=outlier, errors="ignore")
ret_test = ret_test.drop(columns=outlier, errors="ignore")

In [None]:
# number of assets assigned to each cluster, as (label, count) rows
cluster_idx = np.array(kmeans.labels_)
unique, counts = np.unique(cluster_idx, return_counts=True)
frequencies = np.column_stack((unique, counts))
print(frequencies)

In [None]:
# table pairing every asset with the label of its cluster
asset = pd.DataFrame(rets_df.index)
labels = pd.DataFrame(cluster_idx)
cluster_list = (
    pd.concat([asset, labels], axis=1, ignore_index=True)
    .rename(columns={0: "Asset", 1: "Cluster"})
)

## Rolling portfolio optimization K-Means

In [None]:
# split the test-period return columns by cluster:
# ret_dict maps each cluster label to a DataFrame with the returns of its members
ret_dict = cluster_returns(cluster_list)

In [None]:
# average the member returns within each cluster (equal weights):
# ret_ew has one row per date and one column per cluster
ret_ew = equal_weights(ret_dict)

In [None]:
# optimal portfolio weights on 252 days rolling windows (daily portfolio rebalancing);
# every row of pesi_rol is indexed by the last date of its estimation window
pesi_rol = rolling_portfolio(ret_ew)

In [None]:
# keep only the return dates for which rolling weights exist
lgc = ret_ew.index.isin(pesi_rol.index)
ret_subs = ret_ew.loc[lgc]

In [None]:
# daily portfolio return: weight each cluster return and sum across clusters
# NOTE(review): each weight row is dated with the last day of its estimation
# window and is multiplied by the return of that same day - confirm that this
# same-day use is intended
weighted_rets = ret_subs.mul(pesi_rol)
rets_rol = weighted_rets.sum(axis="columns")

# cumulative return in percent
growth_rol = (1 + rets_rol).cumprod()
kmeans_rol = 100 * (growth_rol - 1)

In [None]:
# cumulative return of the daily rebalanced (rolling) portfolio
fig = plt.figure(figsize=(15, 7))
ax1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax1.plot(kmeans_rol)
ax1.set(xlabel='Date', ylabel="%", title="Portfolio Cumulative Returns")
plt.show()

## Static portfolio optimization K-Means

In [None]:
# static portfolio optimization: one set of weights estimated on the whole
# ret_ew sample (no rebalancing)
pesi_static = static_portfolio(ret_ew)

In [None]:
# daily portfolio return with the fixed weights (one weight per cluster column)
static_w = list(pesi_static.values())
weighted_rets = ret_ew.mul(static_w, axis="columns")
rets_static = weighted_rets.sum(axis="columns")

# restrict to the dates covered by the rolling implementation
lgc = rets_static.index.isin(rets_rol.index)
ret_fin = rets_static.loc[lgc]

# cumulative return in percent
growth_static = (1 + ret_fin).cumprod()
kmeans_static = 100 * (growth_static - 1)

In [None]:
# cumulative return of the static (fixed-weight) portfolio
fig = plt.figure(figsize=(15, 7))
ax1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax1.plot(kmeans_static)
ax1.set(xlabel='Date', ylabel="%", title="Portfolio Cumulative Returns")
plt.show()

In [None]:
# # (disabled) calculate portfolio weights for each asset: each cluster weight is split equally among its members
# kmeans_w = pd.DataFrame()

# names_assets = pd.Index([])

# for i in pesi_static.keys():
    
#     w = pesi_static[i]
#     pesi = [w*1/len(ret_dict[i].columns)]*len(ret_dict[i].columns)
#     kmeans_w = kmeans_w.append( pd.DataFrame(pesi) )
#     names_assets = names_assets.union( ret_dict[i].columns )

# kmeans_w.index = names_assets

In [None]:
# difference between the rolling and the static cumulative return (both in percent)
extraret = kmeans_rol - kmeans_static

fig = plt.figure(figsize=(15, 7))
ax1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax1.plot(extraret)
ax1.set(xlabel='Date', ylabel="%", title="Excess Return")
plt.show()