In [1]:
import pandas as pd
import yfinance as yf
import datetime
import time
import requests
import io
import random
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from math import sqrt
import  pylab as pl
import numpy as np
from datetime import timedelta
from datetime import datetime
from dateutil.relativedelta import relativedelta

## Download data
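Historical returns for about 200 Nasdaq stocks, chosen at random: 500 symbols are sampled, and only those with a complete closing-price history over the period are kept.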

In [None]:
# Sample period for the price download.
# `from datetime import datetime` in the import cell rebinds the name `datetime`
# to the class, so the class is called directly here (the module-qualified
# `datetime.datetime(...)` raises AttributeError on a fresh kernel).
start = datetime(2010, 1, 1)
end = datetime(2020, 12, 1)

In [None]:
# download symbols of all nasdaq components
url="https://pkgstore.datahub.io/core/nasdaq-listings/nasdaq-listed_csv/data/7665719fb51081ba0bd834fde71ce822/nasdaq-listed_csv.csv"
# Bound the wait and fail loudly on an HTTP error; otherwise an error page
# would be handed to read_csv and surface later as a confusing KeyError.
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content
companies = pd.read_csv(io.StringIO(s.decode('utf-8')))
# plain Python list of ticker symbols, in file order
symbols = companies['Symbol'].tolist()

In [None]:
# draw 500 tickers without replacement; the fixed seed makes the draw repeatable
random.seed(123)
Symbols = random.sample(population=symbols, k=500)

In [None]:
# download stock prices
# Collect one frame per ticker and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
frames = []
failed_symbols = []  # (ticker, error) pairs for downloads that raised

for i in Symbols:
    try:
        stock = yf.download(i, start=start, end=end, progress=False)

        # tickers with no data in the period are skipped
        if len(stock) == 0:
            continue

        stock['Name'] = i
        frames.append(stock)
    except Exception as exc:
        # best-effort download: one bad ticker must not abort the run,
        # but keep a record instead of discarding the error silently
        failed_symbols.append((i, repr(exc)))

# pd.concat raises on an empty list, so fall back to an empty frame
stock_final = pd.concat(frames, sort=False) if frames else pd.DataFrame()
print(f"downloaded {len(frames)} of {len(Symbols)} symbols; {len(failed_symbols)} raised errors")

In [None]:
# wide table of closing prices: one row per date, one column per ticker
close = stock_final.loc[:, ["Close", "Name"]]
close_wide = pd.pivot_table(close, values="Close", index="Date", columns="Name")
# keep only tickers that have a close on every date
stock_price = close_wide.dropna(axis="columns")

In [None]:
# number of tickers that survived the missing-data filter
n_stocks = stock_price.shape[1]

In [None]:
# stock returns
# row-over-row percentage change; the first row has no predecessor and is dropped
returns = stock_price.pct_change().iloc[1:]

# summary statistics
# returns.describe()

#save returns as pickle
returns.to_pickle('returns.pkl')
#open returns file from pickle
#returns = pd.read_pickle('returns.pkl')

# preview; kept as the last expression so the notebook actually renders it
returns.head()

In [None]:
# training set: every return observation dated on or before the cutoff
cutoff = "2019-12-31"
in_training_window = returns.index <= cutoff
ret_train = returns.loc[in_training_window]

In [9]:
# load the tbills series from CSV, using the first column as the index
tbills = pd.read_csv(filepath_or_buffer='DTB3.csv', index_col=0)
# cache it as a pickle file
tbills.to_pickle(path='DTB3.pkl')
# to reload from the cache instead of the CSV:
# tbills = pd.read_pickle('DTB3.pkl')

## K-Means

The k-means algorithm divides a set of $N$ samples $X$ into $K$ disjoint clusters $C$, each described by the mean $\mu_j$ 
of the samples in the cluster (the centroid). The k-means algorithm aims to choose centroids that minimise the within-cluster sum-of-squares:
$$
\sum_{i=1}^{N} \min_{\mu_j \in C} \lVert x_i - \mu_j \rVert^2
$$

In [None]:
# annualised features per asset (252 periods per year)
mean_ret = 252 * ret_train.mean()
# NOTE(review): despite the names `var_ret` / "Variance", this is the annualised
# standard deviation (std * sqrt(252)), not a variance. Names are kept because
# other cells may refer to them.
var_ret = sqrt(252) * ret_train.std()
rets_df = pd.DataFrame({"Returns": mean_ret, "Variance": var_ret})

In [None]:
# select optimal number of clusters by minimizing SSE
X = rets_df.values
cluster_range = range(2, 15)
sse = []
for k in cluster_range:
    # k-means starts from random centroids: a fixed random_state makes the
    # curve reproducible, and an explicit n_init keeps the number of restarts
    # the same across scikit-learn versions
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=123)
    kmeans.fit(X)

    sse.append(kmeans.inertia_)  # within-cluster sum of squares for this k

pl.plot(cluster_range, sse)
pl.title("Elbow Curve")
pl.xlabel('nr clusters')
pl.ylabel('SSE')
# dashed line marks the number of clusters used in the rest of the notebook
pl.axvline(x=5, c="k", linestyle='dashed')
pl.show()

In [None]:
# fit k-means with 5 clusters
X = rets_df.values
# fixed random_state so cluster labels are identical on every run;
# explicit n_init keeps the number of restarts stable across scikit-learn versions
kmeans = KMeans(n_clusters=5, n_init=10, random_state=123).fit(X)
centroids = kmeans.cluster_centers_

# one point per asset, coloured by assigned cluster
pl.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap="rainbow")
pl.xlabel(rets_df.columns[0])
pl.ylabel(rets_df.columns[1])
pl.show()

In [None]:
# remove outlier: the asset with the highest annualised mean return
outlier = mean_ret.idxmax()

# errors="ignore" makes the cell safe to re-run after the asset is already gone;
# keyword axes replace the positional `axis` argument removed in pandas 2.0
rets_df = rets_df.drop(index=outlier, errors="ignore")
returns = returns.drop(columns=outlier, errors="ignore")

# Refit on the reduced data here rather than relying on a manual re-run of the
# previous cell; otherwise kmeans.labels_ would still include the dropped asset
# and no longer line up with rets_df under Restart & Run All.
X = rets_df.values
kmeans = KMeans(n_clusters=5, n_init=10, random_state=123).fit(X)
centroids = kmeans.cluster_centers_

pl.scatter(X[:, 0], X[:, 1], c=kmeans.labels_, cmap="rainbow")
pl.show()

In [None]:
# count number of elements in each cluster
cluster_idx = np.array(kmeans.labels_)
# count over cluster_idx: the previous name `idx` was never defined (NameError)
(unique, counts) = np.unique(cluster_idx, return_counts=True)
# two-column array: cluster label, number of assets with that label
frequencies = np.asarray((unique, counts)).T
print(frequencies)

In [None]:
# pair each asset (row label of rets_df) with its cluster label, by position
asset = pd.DataFrame(rets_df.index)
labels_frame = pd.DataFrame(cluster_idx)
cluster_list = pd.concat([asset, labels_frame], axis="columns")
cluster_list.columns = ["Asset", "Cluster"]

## Portfolio optimization

In [None]:
# test set for rolling optimization
# the window opens one year before the training cutoff, so it overlaps the
# final year of the training period
test_start = datetime.strptime(cutoff, '%Y-%m-%d') - relativedelta(years=1)
ret_test = returns.loc[returns.index > test_start]

In [None]:
# map each cluster label to the returns columns of the assets in that cluster
ret_dict = dict()
for i in unique:
    # boolean mask over cluster_list rows, applied positionally to the columns of returns
    lgc = cluster_list["Cluster"].eq(i)
    ret_dict[i] = returns.loc[:, lgc.to_numpy()]

In [None]:
# summarise each cluster instead of dumping every full returns frame,
# which floods the output and bloats the saved notebook
for cluster_id, cluster_returns in ret_dict.items():
    n_rows, n_assets = cluster_returns.shape
    print(f"cluster {cluster_id}: {n_assets} assets x {n_rows} rows")

In [None]:
# compute equally weighted monthly portfolio returns on test set for each cluster