In [19]:
#conda activate AP1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from scipy.stats import skew, kurtosis

import pyfolio as pf
import empyrical as emp

In [20]:
df = pd.read_excel('data_nn.xlsx')
#df.to_pickle("data_nn.xlsx")

In [21]:
# Set the first column as the date index
df.set_index(df.columns[0], inplace=True)

# Convert the index to string and then to DatetimeIndex format
df.index = pd.to_datetime(df.index.astype(str))

# Filter the data for the last ten years
df_last_10_years = df.loc[df.index > "2020-01-02"]

# Apply rolling sum with a window of 252 and require at least 126 non-NaN values
df_rolling_sum = df_last_10_years.rolling(window=252, min_periods=int(252//2)).sum()

# Forward-fill NaN values, but limit this to a maximum of 5 consecutive fills
df_filled = df_last_10_years.ffill(limit=5)

# Drop any remaining NaN values that still exist after the forward-fill operation
df_cleaned = df_filled.dropna()

#return back original name to not interruppt code.
df_last_10_years = df_cleaned




In [75]:
def refactored_advanced_features(df_returns):
    """
    Refactored computation of advanced financial features to reduce DataFrame fragmentation.
    """
    skew = {}
    kurtosis = {}
    max_drawdown = {}
    volatility = {}
    vaR = {}
    momentum = {}
    avg_return = {}
    rsi = {}

        
        # 1. Skewness
    print("Skewness")
    for window in [20, 40, 60, 100, 180, 240, 360, 480]:
        skew[window] = df_returns.rolling(window).skew()

        # 2. Kurtosis
    print("Kurtosis")
    for window in [20, 40, 60, 100, 180, 240, 360, 480]:
        kurtosis[window]=df_returns.rolling(window).kurt()
    
    # 3. Maximum drawdown
    print("Maximum drawdown")
    for window in [20, 40, 60, 100, 180, 240, 360, 480]:
        max_drawdown[window] = df_returns.rolling(window).apply(emp.max_drawdown, raw=True)
    
    # 4. Volatility
    print("Volatility")
    for window in [20, 40, 60, 100, 180, 240, 360, 480]:
        volatility[window] = df_returns.rolling(window).std()*(252**0.5)
    
    # 5. Value at Risk
    print("Value at Risk")
    for window in [20, 40, 60, 100, 180, 240, 360, 480]:
        vaR[window] = df_returns.rolling(window).apply(emp.value_at_risk, raw=True)
    
    # 6. Momentum
    print("Momentum")
    for window in [20, 40, 60, 100, 180, 240, 360, 480]:
        momentum[window] = df_returns.rolling(window).sum() # ?

    print("Average Return")
    for window in [20, 40, 60, 100, 180, 240, 360, 480]:
        avg_return[window] = df_returns.rolling(window).mean()
    
    return skew, kurtosis, max_drawdown, volatility, vaR, momentum, avg_return

# This function reduces DataFrame fragmentation by constructing all columns and concatenating them at once.

# Läs tommys mex hur de gjorde reversal, sen implementera det. Fixa windows size till vad de hade i rapporten.
# skew[20].head() 

In [None]:
# Call the function and capture the output
skew, kurtosis, max_drawdown, volatility, vaR, momentum, avg_return = refactored_advanced_features(df_last_10_years)


In [107]:
# Reset the feature DataFrames list
features_df_list = []

# Create individual lists for each feature's DataFrame
skew_df_list = [] 
kurtosis_df_list = []
max_drawdown_df_list = []
volatility_df_list = []
vaR_df_list = []
momentum_df_list = []
avg_return_df_list = []

# Windows configuration
windows = [20, 40, 60, 100, 180, 240, 360, 480]

# Iterate through each feature dictionary and create a DataFrame
for feature_name, feature_dict in [('skew', skew), ('kurtosis', kurtosis), ('max_drawdown', max_drawdown), 
                                   ('volatility', volatility), ('vaR', vaR), ('momentum', momentum), ('avg_return', avg_return)]:
    # Only keep the windows that are present for each feature
    relevant_windows = windows if feature_name != 'kurtosis' else windows[:-1]
    feature_df = pd.concat({f'{feature_name}_{window}': feature_dict[window] for window in relevant_windows}, axis=1)
    
    # Append the individual DataFrame to the corresponding feature list
    if feature_name == 'skew':
        skew_df_list.append(feature_df)
    elif feature_name == 'kurtosis':
        kurtosis_df_list.append(feature_df)
    elif feature_name == 'max_drawdown':
        max_drawdown_df_list.append(feature_df)
    elif feature_name == 'volatility':
        volatility_df_list.append(feature_df)
    elif feature_name == 'vaR':
        vaR_df_list.append(feature_df)
    elif feature_name == 'momentum':
        momentum_df_list.append(feature_df)
    elif feature_name == 'avg_return':
        avg_return_df_list.append(feature_df)
    
    # Add the DataFrame to the main list
    features_df_list.append(feature_df)


# Concatenate all feature DataFrames into a single DataFrame
features_df = pd.concat(features_df_list, axis=1)

# Concatenate all feature DataFrames into a single DataFrame for each feature
if len(skew_df_list) > 1:
    skew_df = pd.concat(skew_df_list, axis=1)
if len(kurtosis_df_list) > 1:
    kurtosis_df = pd.concat(kurtosis_df_list, axis=1)
if len(max_drawdown_df_list) > 1:
    max_drawdown_df = pd.concat(max_drawdown_df_list, axis=1)
if len(volatility_df_list) > 1:
    volatility_df = pd.concat(volatility_df_list, axis=1)
if len(vaR_df_list) > 1:
    vaR_df = pd.concat(vaR_df_list, axis=1)
if len(momentum_df_list) > 1:
    momentum_df = pd.concat(momentum_df_list, axis=1)
if len(avg_return_df_list) > 1:
    avg_return_df = pd.concat(avg_return_df_list, axis=1)



# The individual lists for each feature now contain their respective DataFrames
# And features_df_list contains all the feature DataFrames
# Let's print the first item of each sublist to confirm
#print("Skew DataFrame:\n", skew_df_list[0].tail(), "\n")
#print("Kurtosis DataFrame:\n", kurtosis_df_list[0].tail(), "\n")
#print("Max Drawdown DataFrame:\n", max_drawdown_df_list[0].tail(), "\n")
#print("Volatility DataFrame:\n", volatility_df_list[0].tail(), "\n")
#print("VaR DataFrame:\n", vaR_df_list[0].tail(), "\n")
#print("Momentum DataFrame:\n", momentum_df_list[0].tail(), "\n")
#print("Average Return DataFrame:\n", avg_return_df_list[0].tail(), "\n")

# Print the last 5 rows of the combined DataFrame
features_df.tail()

Unnamed: 0_level_0,skew_20,skew_20,skew_20,skew_20,skew_20,skew_20,skew_20,skew_20,skew_20,skew_20,...,avg_return_480,avg_return_480,avg_return_480,avg_return_480,avg_return_480,avg_return_480,avg_return_480,avg_return_480,avg_return_480,avg_return_480
Unnamed: 0_level_1,Equities_0,Equities_1,Equities_2,Equities_3,Equities_4,Equities_5,Equities_6,Equities_7,Equities_8,Equities_9,...,Equity_Sector_1,Equity_Sector_2,Equity_Sector_3,Equity_Sector_4,Equity_Sector_5,Equity_Sector_6,Equity_Sector_7,Equity_Sector_8,Equity_Sector_9,Equity_Sector_10
Column1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2023-02-20,-0.240833,-0.198584,0.748803,-0.034554,0.183471,-0.216372,0.256436,-0.157259,1.524128,0.163826,...,0.000223,0.001289,9.3e-05,0.000232,8.7e-05,5e-06,1e-05,-8e-06,-0.00061,0.000152
2023-02-21,-0.086253,-0.0704,0.773535,0.011176,0.312154,-0.041571,0.221016,-0.162284,1.521845,-0.296421,...,0.000203,0.001335,8.9e-05,0.000194,6e-05,-2.6e-05,-4e-06,-7.4e-05,-0.000646,8.3e-05
2023-02-22,-0.128589,-0.062873,0.757504,-0.109515,0.415347,0.052482,0.191846,-0.47039,1.486008,-0.30171,...,0.000189,0.00129,5.6e-05,0.000162,2.6e-05,-5.1e-05,-2.9e-05,-0.000105,-0.000646,9.4e-05
2023-02-23,-0.023004,0.106261,0.83082,-0.006954,0.265489,0.092309,0.325509,-0.476984,1.454702,-0.274151,...,0.0002,0.001304,7.4e-05,0.000161,3e-05,-3e-05,2.3e-05,-0.000123,-0.000627,9.9e-05
2023-02-24,0.050596,0.138392,0.864532,0.148482,0.428164,0.081861,0.420396,-0.548086,1.506494,-0.256232,...,0.000202,0.001269,3.1e-05,0.000141,1e-06,-6.2e-05,2e-06,-0.000159,-0.000647,9.3e-05


In [59]:
def RSI(df_returns, window):
    """
    Computes the Relative Strength Index (RSI) for a given window.
    """
    df = df_returns.copy()
    df[df >= 0] = 1
    df[df < 0] = 0
    df = df.rolling(window).mean()*100
    return df

RSI skip for now

In [None]:
# Initialize an empty dictionary to store the last RSI value for each window
rsi_values = {}

# Calculate RSI for each window and store the last value
for window in [20, 40, 60, 100, 180, 240, 360, 480]:
    rsi_df = RSI(df_last_10_years, window)  # df_returns is your DataFrame with returns data
    last_rsi_value = rsi_df.iloc[-1]  # Get the last row of the RSI DataFrame
    rsi_values[window] = last_rsi_value  # Store it in the dictionary with the window as the key

# Print the last RSI value for a 20-day window
print("Last RSI value for 20-day window:")
print(rsi_values[20])




NN

In [109]:
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
# Define the neural network model to accommodate multiple output dimensions
class MultivariateNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MultivariateNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)  # Output dimension should match the number of columns
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)


# Using actual columns from the dataset to populate 'dataframes_to_concat'
# As per the uploaded dataset, 'Equities_0', 'Equities_1', and 'Equities_2' are used
# Assume that calculated_features_df is a DataFrame containing all the calculated features
#calculated_features = skew, kurtosis, max_drawdown, volatility, vaR, momentum, avg_return
calculated_features = feature_df
calculated_features_df = pd.DataFrame(calculated_features)
dataframes_to_concat = [calculated_features_df]

# Concatenate feature data frames 
features_df = pd.concat(dataframes_to_concat, axis=1)
features_df.fillna(features_df.mean(), inplace=True)


# Split the data into training and testing sets (considering all columns for y)
X_train, X_test, y_train, y_test = train_test_split(features_df, features_df, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"Debug: NaN values in training data: {np.isnan(X_train_scaled).sum()}")

# Converting to PyTorch tensor
X_train_tensor = torch.FloatTensor(X_train_scaled)
y_train_tensor = torch.FloatTensor(y_train.values)  # Multiple columns are included
X_test_tensor = torch.FloatTensor(X_test_scaled)

# Initialize the neural network model
output_dim = y_train.shape[1]  # Number of columns to predict
model = MultivariateNN(X_train.shape[1], output_dim)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

initial_weights = model.fc1.weight.data.numpy()
print(f"Debug: Initial weights are {initial_weights}")

# Added check for empty list before DataFrame concatenation

# Check if dataframes_to_concat is empty before attempting concatenation
if len(dataframes_to_concat) == 0:
    print("Warning: No DataFrames to concatenate.")
else:
    features_df = pd.concat(dataframes_to_concat, axis=1)

# Placeholder for Backward Neural Network
class BackwardNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(BackwardNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)
        
# # Initialize the backward neural network model
# backward_model = BackwardNN(X_train.shape[1], output_dim)

# # Training loop for backward neural network
# for epoch in range(epochs):
#     backward_model.train()
#     optimizer.zero_grad()
#     backward_outputs = backward_model(X_train_tensor)
#     backward_loss = criterion(backward_outputs, y_train_tensor)
#     backward_loss.backward()
#     optimizer.step()

# After training
first_layer_weights = model.fc1.weight.data.numpy()
feature_importance = np.abs(first_layer_weights).sum(axis=0)
first_layer_weights = model.fc1.weight.data.numpy()
if first_layer_weights.size == 0:
    print("Debug: The first layer weights are empty.")
else:
    print(f"Debug: The first layer weights have a shape of {first_layer_weights.shape}.")

# Debug Statement  Calculate and display feature importance
feature_importance = np.abs(first_layer_weights).sum(axis=0)
if feature_importance.size == 0:
    print("Debug: Feature importance calculation resulted in an empty array.")
else:
    print(f"Debug: Feature importance is {feature_importance}.")



Debug: NaN values in training data: 0
Debug: Initial weights are [[ 0.03483968  0.11833545  0.13017921 ... -0.02285792 -0.02690527
  -0.03993764]
 [-0.04002718 -0.01287511 -0.04542283 ... -0.0979039  -0.05549259
  -0.05549062]
 [ 0.02469679 -0.020651    0.03333227 ...  0.09545812  0.05978915
   0.07503959]
 ...
 [ 0.03913424  0.07382268  0.08822093 ...  0.05865164  0.03893404
   0.07861988]
 [ 0.02321641  0.04794021  0.08012177 ... -0.08153882 -0.02693151
  -0.01002786]
 [ 0.01723923 -0.04752985 -0.07679463 ... -0.01781033  0.01685979
   0.04035555]]
Debug: The first layer weights have a shape of (128, 448).
Debug: Feature importance is [5.3988805 5.3309426 5.6671066 5.918927  5.0234222 5.3156934 5.3874207
 5.713137  5.850659  4.9522004 5.368966  5.773518  5.519747  5.651442
 5.654879  5.343331  5.6079288 5.901164  4.548122  5.2049303 5.325871
 5.373878  5.4856596 5.1113086 5.8119903 5.819521  6.0100727 6.410671
 5.7977223 5.6980147 5.6739025 5.7952547 6.125892  5.9839005 5.392599
 5.9

Risk budgeting benchmark

In [None]:
import cvxpy as cp
from tqdm import tqdm


# Assuming df_last_10_years contains the daily returns
returns = df_last_10_years.drop(columns=df_last_10_years.columns[0])

# Initialize list to store optimal weights and returns for each t
all_weights = []
all_returns = []

# Number of assets
n_assets = len(returns.columns)

# Risk budget for each asset
b = np.ones(n_assets) / n_assets  # For example, equal risk budgeting

# Arbitrary constant for the constraint
c = 1  # Example value, adjust as needed

# Start from the 21st observation
for t in tqdm(range(54,len(returns) - 1,5)):
    # Data up to time t
    data_t = returns.iloc[:t+1].astype("float16")
    print(data_t.index[-1])
    # Covariance matrix of the returns
    cov_matrix_values = data_t.cov().values
    cov_matrix_values = (cov_matrix_values + cov_matrix_values.T)/2

    #cov_matrix_values += np.eye(cov_matrix_values.shape[0],cov_matrix_values.shape[1])

    #if min(eigs) <= 0:
    #    print(t)
    # Portfolio weights as a variable (y)
    y = cp.Variable(shape=n_assets)
    # Objective function: Minimize the square root of the portfolio variance
    objective = cp.Minimize(cp.sqrt(cp.quad_form(y, cp.psd_wrap(cov_matrix_values))))

    # Alternative solutions

    #objective = cp.Minimize(cp.quad_form(y, cp.psd_wrap(cov_matrix_values)))

    #cov_matrix_values = np.array(cov_matrix_values)
    #cov_matrix_values += np.eye(cov_matrix_values.shape[0],cov_matrix_values.shape[1])
    #L = np.linalg.cholesky(cov_matrix_values)
    #objective = cp.Minimize(cp.norm(L@y,2))

    # Constraints:
    # 1. The weights must sum to 1 (full investment)
    # 2. The risk budgeting constraint must be satisfied
    # 3. The weights must be non-negative
    constraints = [
        #cp.sum(y) == 1,        # incompatible with contstraint below. Normalize afterwards
        cp.sum(cp.multiply(b, cp.log(y))) >= c,
        y >= 1e-5 #strict inequalities are not allowed
    ]

    # Formulate the optimization problem
    problem = cp.Problem(objective, constraints)

    # Solve the problem using a suitable solver
    problem.solve(solver=cp.SCS,qcp=True, eps = 1e-5, max_iters  = 100) #kan ange hur många försöja man vill att solve kan göra. Kolla upp det. "Maxiteration". googla på risk budgeting med cvxpy.
    # Store the optimal weights for time t
    optimal_weights = y.value
    #optimal_weights /= np.linalg.norm(optimal_weights,1)
    all_weights.append(optimal_weights)

    #print(optimal_weights)

    #print(type(all_weights))
    #print(type(returns.iloc[t+1].values))
    
    # Calculate and store the portfolio return for time t+1 using weights from time t
    next_return = np.dot(returns.iloc[t+1].values, optimal_weights)
    all_returns.append(next_return)

# Convert lists to arrays for further analysis if needed
#all_weights = np.array(all_weights)
#all_returns = np.array(all_returns)

# Print the first 5 values of all_returns
#print("First 5 values of all_returns:")
#print(all_returns[:5])