In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import time
import lightgbm as lgb

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, BaseCrossValidator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, classification_report

import yfinance as yf

# Load the cleaned price data and flatten it into one long (ticker, date) table for modelling.
DATA_STORE = 'sp.h5'
# Open read-only: the default append mode would silently create an empty store on a wrong path.
with pd.HDFStore(DATA_STORE, mode='r') as store:
    data = store.get('data_clean')

# `data` is indexed by ticker and holds one price DataFrame per row in its 'prices' column.
# Collect the per-ticker frames and concatenate once; growing a frame inside a loop is quadratic.
ticker_frames = []
for ticker, prices in data['prices'].items():
    # Some frames carry 'date' as an index level instead of a column.
    if 'date' not in prices.columns:
        prices = prices.reset_index(level=['date'])
    ticker_frames.append(prices.assign(ticker=ticker))

data_flat = pd.concat(ticker_frames, ignore_index=True)
data_flat['date'] = pd.to_datetime(data_flat['date'])

# Keep Friday observations only and remove CPWR completely, then order every ticker
# chronologically so that "next row" means "next Friday of the same ticker".
friday_rows = data_flat['date'].dt.weekday == 4
not_cpwr = data_flat['ticker'] != 'CPWR'
data_flat = (
    data_flat.loc[friday_rows & not_cpwr]
    .sort_values(['ticker', 'date'])
    .reset_index(drop=True)
)

# Forward weekly return: next Friday's adjusted close relative to this Friday's.
# It is computed per ticker so the last row of one ticker is never compared with the first
# row of the next one, and after the Friday filter so it is a week-over-week change.
# If a Friday is missing for a ticker, the return spans up to the next available Friday.
next_close = data_flat.groupby('ticker')['adjusted_close'].shift(-1)
data_flat['weekly_return'] = next_close / data_flat['adjusted_close'] - 1

outperform_threshold = 0.015
underperform_threshold = -0.01

def classify_performance(weekly_return):
    """Map a return to a class label.

    Returns 1 if the return is above `outperform_threshold`, -1 if it is below
    `underperform_threshold`, and 0 otherwise (NaN also falls through to 0).
    """
    if weekly_return > outperform_threshold:
        return 1
    elif weekly_return < underperform_threshold:
        return -1
    else:
        return 0

data_flat['target'] = data_flat['weekly_return'].apply(classify_performance)
# Drops every row with a missing value in any column; this includes the last Friday of each
# ticker, whose forward return is undefined.
data_flat = data_flat.dropna()

In [None]:
# Walk-forward cross-validation with an expanding training window.
class WalkForwardCV(BaseCrossValidator):
    """Positional walk-forward splitter.

    Every fold trains on rows ``[0, train_end)`` and tests on the following
    ``test_period_length`` rows; ``train_end`` advances by ``test_period_length``
    from fold to fold. Splits are purely positional, so the caller is responsible
    for passing rows in chronological order.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    test_period_length : int
        Number of rows in each test fold.
    lookahead : int
        With ``lookahead=1`` the last fold ends at the last row. Larger values leave
        the final ``(lookahead - 1) * test_period_length`` rows out of every fold.
    """

    def __init__(self, n_splits=4, test_period_length=8, lookahead=1):
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.lookahead = lookahead

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` as integer position arrays.

        Raises
        ------
        ValueError
            If ``X`` has too few rows to leave a non-empty first training set.
        """
        n_samples = len(X)
        reserved = self.test_period_length * (self.n_splits + self.lookahead - 1)
        first_train_end = n_samples - reserved
        # Without this check a short input yields empty training sets and negative
        # test indices, which would silently select rows from the end of X.
        if first_train_end <= 0:
            raise ValueError(
                f"WalkForwardCV needs more than {reserved} samples for n_splits={self.n_splits}, "
                f"test_period_length={self.test_period_length}, lookahead={self.lookahead}; "
                f"got {n_samples}."
            )
        for i in range(self.n_splits):
            train_end = first_train_end + i * self.test_period_length
            test_end = train_end + self.test_period_length
            yield np.arange(0, train_end), np.arange(train_end, test_end)

In [None]:
# Load the constituents table from the store.
# Read-only: the default append mode would create an empty store if the path is wrong.
with pd.HDFStore(DATA_STORE, mode='r') as store:
    monthly_constituents = store.get('monatliche_bestandteile')

# Point-in-time index membership: the model may train on stocks that were not in the S&P 500,
# but investments should be limited to the stocks that were constituents at that time.
# Note: this function is defined a second time in a later cell; the later definition wins.
def get_sp500_tickers_on_date(date, monthly_constituents):
    """Return the constituents of the most recent snapshot at or before `date`.

    Parameters
    ----------
    date : anything accepted by ``pd.Timestamp``
        The point in time to look up.
    monthly_constituents : pd.DataFrame
        Must have a 'Date' column and a 'Constituents' column.

    Returns
    -------
    The 'Constituents' value of the latest row with ``Date <= date``, or an empty
    set if no such row exists. The result does not depend on the row order.
    """
    cutoff = pd.Timestamp(date)
    # Positional index, so idxmax() below can be used directly with .iloc.
    snapshot_dates = pd.to_datetime(monthly_constituents['Date']).reset_index(drop=True)
    eligible = snapshot_dates[snapshot_dates <= cutoff]
    if eligible.empty:
        return set()
    return monthly_constituents['Constituents'].iloc[eligible.idxmax()]


#data_flat['in_sp500'] = [ticker in get_sp500_tickers_on_date(date, monthly_constituents) for date, ticker in zip(data_flat['date'], data_flat['ticker'])]

In [None]:
# param_grid = {
#     'lgbm__learning_rate': [0.01, 0.05, 0.1],
#     'lgbm__n_estimators': [50, 100, 200],
#     'lgbm__num_leaves': [16, 31, 64],
#     'lgbm__max_depth': [-1, 8, 16],
#     'lgbm__min_child_samples': [20, 50, 100],
#     'lgbm__subsample': [0.6, 0.8, 1.0],
#     'lgbm__colsample_bytree': [0.6, 0.8, 1.0],
#     'lgbm__reg_alpha': [0.0, 0.1, 0.5],
#     'lgbm__reg_lambda': [0.0, 0.1, 0.5]
# }


In [None]:
# Split the data into training, validation and test sets, then tune a LightGBM classifier.
features = ['open', 'high', 'low', 'close', 'adjusted_close', 'volume', 'rsi', 'bb_low', 'bb_mid', 'bb_upper']
target = 'target'

# Walk-forward CV configuration. WalkForwardCV slices by row position, so these lengths are
# row counts, not weeks.
# NOTE(review): 10 folds of 3 rows each give very noisy fold scores - confirm these values.
n_splits = 10
train_period_length = 2   # not passed to WalkForwardCV (its training window is expanding)
test_period_length = 3    # rows per test fold
lookahead = 1

cv = WalkForwardCV(n_splits=n_splits, test_period_length=test_period_length, lookahead=lookahead)

# Date boundaries of the three data sets.
# NOTE(review): train_start_date is defined but not applied below, so training uses every row
# up to train_end_date (the financial crisis included). Confirm whether training should
# start in 2010 instead.
train_start_date = pd.Timestamp('2009-12-31')
train_end_date = pd.Timestamp('2010-12-31')
valid_end_date = pd.Timestamp('2011-12-31')
test_end_date = pd.Timestamp('2022-12-31')

test_start_date = pd.Timestamp('2012-01-01')

# The walk-forward folds are positional while data_flat is stacked ticker by ticker, so the
# training rows are put into chronological order first. The stable sort keeps the existing
# row order within each date.
train_data = data_flat[data_flat['date'] <= train_end_date].sort_values('date', kind='mergesort')
# Explicit copies: a later cell adds a column to test_data, which would otherwise be a write
# to a slice of data_flat.
valid_data = data_flat[(data_flat['date'] > train_end_date) & (data_flat['date'] <= valid_end_date)].copy()
test_data = data_flat[(data_flat['date'] > valid_end_date) & (data_flat['date'] >= test_start_date) & (data_flat['date'] <= test_end_date)].copy()

X_train = train_data[features]
y_train = train_data[target]
X_valid = valid_data[features]
y_valid = valid_data[target]
X_test = test_data[features]
y_test = test_data[target]

# Hyperparameter search
class_weights = [None]
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # LightGBM applies `subsample` only when `subsample_freq` is greater than 0. With the
    # default of 0, every `subsample` value in the grid would train the same model.
    ('lgbm', lgb.LGBMClassifier(random_state=42, subsample_freq=1)),
])

param_grid = {
    'lgbm__learning_rate': [0.01, 0.1],
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__num_leaves': [16, 31, 64],
    'lgbm__max_depth': [-1, 8, 16],
    'lgbm__min_child_samples': [20, 50, 100],
    'lgbm__subsample': [0.6, 0.8],
    'lgbm__colsample_bytree': [0.6, 0.8],
    'lgbm__reg_alpha': [0.0, 0.5],
    'lgbm__reg_lambda': [0.0, 0.5],
    'lgbm__class_weight': class_weights
}

# Grid search scored by accuracy
grid = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=cv, n_jobs=-1)

# Alternative scoring: macro F1 instead of accuracy
grid_f1 = GridSearchCV(pipeline, param_grid, scoring='f1_macro', cv=cv, n_jobs=-1)

# Time both searches together; perf_counter is a monotonic clock intended for measuring durations.
start_time = time.perf_counter()
grid.fit(X_train, y_train)
grid_f1.fit(X_train, y_train)
end_time = time.perf_counter()

print("Laufzeit der Hyperparameter-Optimierung: {:.2f} Minuten".format((end_time - start_time) / 60))

# Walk-forward results of the accuracy search
cv_results = pd.DataFrame(grid.cv_results_)

# Show the test scores of every fold, best parameter combinations first
for i in range(n_splits):
    split_test_score = f'split{i}_test_score'
    print(f"Testergebnisse für Split {i + 1}:")
    print(cv_results[[split_test_score]].sort_values(by=split_test_score, ascending=False))
    print("\n")

# Best hyperparameters and scores
print("Beste Hyperparameter: ", grid.best_params_)
print("Beste Modellgenauigkeit: {:.4f}".format(grid.best_score_))
print("Beste F1-Score (Macro) (Alternative): {:.4f}".format(grid_f1.best_score_))

# Evaluate on the validation data.
# NOTE(review): best_model comes from the F1 search, while later cells predict with `grid`
# (the accuracy search). Confirm which model is meant to be used.
best_model = grid_f1.best_estimator_
y_valid_pred = best_model.predict(X_valid)

print("Genauigkeit: ", accuracy_score(y_valid, y_valid_pred))
print("Klassifikationsbericht: ")
print(classification_report(y_valid, y_valid_pred))

# Evaluate on the test data
y_test_pred = best_model.predict(X_test)

print("Test-Genauigkeit: ", accuracy_score(y_test, y_test_pred))
print("Test-Klassifikationsbericht: ")
print(classification_report(y_test, y_test_pred))

In [None]:
# Feature importance of the model selected in the previous cell, smallest bar first
feature_importance = best_model.named_steps['lgbm'].feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(X_train.shape[1])

importance_fig, importance_ax = plt.subplots(figsize=(12, 6))
importance_ax.barh(bar_positions, feature_importance[sorted_idx])
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(X_train.columns[sorted_idx])
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Feature Importance für das LightGBM Model')
plt.show()

# Weekly returns with the predicted buy (green) and sell (red) signals on top
test_data['predicted'] = y_test_pred
predicted_up = test_data['predicted'] == 1
predicted_down = test_data['predicted'] == -1

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_data['date'], test_data['weekly_return'], label='True Returns')
ax.scatter(
    test_data['date'][predicted_up],
    test_data['weekly_return'][predicted_up],
    color='g',
    label='Outperform Prediction',
)
ax.scatter(
    test_data['date'][predicted_down],
    test_data['weekly_return'][predicted_down],
    color='r',
    label='Underperform Prediction',
)
ax.set_xlabel('Datum')
ax.set_ylabel('Weekly Return')
ax.legend()
ax.set_title('Weekly Returns und Model Predictions')
plt.show()

In [None]:
# Report on the validation set, then on the held-out test period

validation_accuracy = accuracy_score(y_valid, y_valid_pred)
print("Validierungsgenauigkeit: {:.2f}%".format(validation_accuracy * 100))

print("\nKlassifikationsbericht für Validierung:")
print(classification_report(y_valid, y_valid_pred))

# Rows strictly after the validation period. test_data is already restricted to that
# period, so this mask is a safeguard rather than a real filter.
mask = test_data['date'].gt(valid_end_date)
unknown_test_data = test_data.loc[mask]
unknown_X_test, unknown_y_test = unknown_test_data[features], unknown_test_data[target]

# Predictions of the accuracy-tuned search on the held-out period
y_test_pred = grid.predict(unknown_X_test)

test_accuracy = accuracy_score(unknown_y_test, y_test_pred)
print("Testgenauigkeit: {:.2f}%".format(test_accuracy * 100))

print("\nKlassifikationsbericht für Test:")
print(classification_report(unknown_y_test, y_test_pred))


In [None]:
import matplotlib.pyplot as plt

# LightGBM step of the best pipeline found by the accuracy grid search
best_lgbm = grid.best_estimator_.named_steps['lgbm']
feature_importances = best_lgbm.feature_importances_

# One row per feature, least important first so the largest bar ends up on top
importance_df = (
    pd.DataFrame({'importance': feature_importances, 'feature': features})
    .sort_values(by='importance', ascending=True)
)

# Horizontal bar chart of the raw importances
raw_importance_fig, raw_importance_ax = plt.subplots(figsize=(10, 6))
raw_importance_ax.barh(importance_df['feature'], importance_df['importance'], align='center')
raw_importance_ax.set_xlabel('Importance')
raw_importance_ax.set_ylabel('Feature')
raw_importance_ax.set_title('Feature Importance')
plt.show()


In [None]:
#Werte normalisiert für feature importance
import matplotlib.pyplot as plt
import pandas as pd

# LightGBM step of the best pipeline found by the accuracy grid search
best_lgbm = grid.best_estimator_.named_steps['lgbm']
feature_importances = best_lgbm.feature_importances_

# Scale the importances so that they add up to 1
normalized_importances = feature_importances / feature_importances.sum()

# One row per feature, least important first so the largest bar ends up on top
importance_df = (
    pd.DataFrame({'importance': normalized_importances, 'feature': features})
    .sort_values(by='importance', ascending=True)
)

# Horizontal bar chart of the normalised importances
norm_importance_fig, norm_importance_ax = plt.subplots(figsize=(10, 6))
norm_importance_ax.barh(importance_df['feature'], importance_df['importance'], align='center')
norm_importance_ax.set_xlabel('Importance')
norm_importance_ax.set_ylabel('Feature')
norm_importance_ax.set_title('LightGBM Feature Importance')
plt.show()


In [None]:
DATA_STORE = 'sp.h5'
# Read-only: the default append mode would create an empty store if the path is wrong.
# NOTE(review): nothing else in this notebook reads monthly_constituents_df; the backtest uses
# `monthly_constituents`, loaded earlier from a different key. Confirm which table is intended.
with pd.HDFStore(DATA_STORE, mode='r') as store:
    monthly_constituents_df = store.get('monthly_constituents')

In [None]:
def get_sp500_tickers_on_date(date, monthly_constituents):
    """Return the constituents of the most recent snapshot at or before `date`.

    Parameters
    ----------
    date : anything accepted by ``pd.Timestamp``
        The point in time to look up.
    monthly_constituents : pd.DataFrame
        Must have a 'Date' column and a 'Constituents' column.

    Returns
    -------
    The 'Constituents' value of the latest row with ``Date <= date``, or an empty
    set if no such row exists. The result does not depend on the row order, and the
    lookup is vectorised instead of iterating over every row.
    """
    cutoff = pd.Timestamp(date)
    # Positional index, so idxmax() below can be used directly with .iloc.
    snapshot_dates = pd.to_datetime(monthly_constituents['Date']).reset_index(drop=True)
    eligible = snapshot_dates[snapshot_dates <= cutoff]
    if eligible.empty:
        return set()
    return monthly_constituents['Constituents'].iloc[eligible.idxmax()]


In [None]:
def count_signals(y_test_pred):
    """Count the buy (1) and sell (-1) predictions.

    Returns a tuple ``(buy_signals, sell_signals)``; every other value is ignored.
    """
    tally = {1: 0, -1: 0}
    for prediction in y_test_pred:
        for label in tally:
            if prediction == label:
                tally[label] += 1
                break
    return tally[1], tally[-1]

buy_signals, sell_signals = count_signals(y_test_pred)

print(f"Anzahl der Kaufsignale: {buy_signals}")
print(f"Anzahl der Verkaufssignale: {sell_signals}")


# One row per prediction. The predictions are matched to the dates by position, so
# y_test_pred must be in the same row order as unknown_test_data.
signal_data = pd.DataFrame({'date': unknown_test_data['date'].to_numpy(), 'signal': y_test_pred})

# Count the signals per date. Reindexing to all three classes keeps the column layout stable
# when the model never predicts one of them; assigning a fixed list of four column names
# would then fail with a length mismatch.
signal_counts = (
    signal_data.groupby(['date', 'signal']).size()
    .unstack(fill_value=0)
    .reindex(columns=[-1, 0, 1], fill_value=0)
    .rename(columns={-1: 'sell', 0: 'hold', 1: 'buy'})
    .rename_axis(columns=None)
    .reset_index()
)

# Plot the number of buy and sell signals over time
plt.figure(figsize=(12, 6))
plt.plot(signal_counts['date'], signal_counts['buy'], label='Kaufsignale')
plt.plot(signal_counts['date'], signal_counts['sell'], label='Verkaufsignale')
plt.xlabel('Datum')
plt.ylabel('Anzahl der Signale')
plt.title('Anzahl der Kauf- und Verkaufssignale im Zeitverlauf')
plt.legend()
plt.show()


In [None]:
# Backtest: trade the model's weekly signals and compare the result with the S&P 500.
initial_cash = 10000000
max_stock_share = 0.05  # fraction of the available cash committed to a single purchase
price_data = data_flat.pivot_table(values='adjusted_close', index='date', columns='ticker')
# Holdings are valued at the last known price. Without the forward fill, a held ticker with
# no row on some date would drop out of the valuation and cause a spurious dip.
valuation_prices = price_data.ffill()

# One row per test date. Float columns, because cash changes by fractional amounts.
portfolio = pd.DataFrame(
    {'cash': 0.0, 'total_value': 0.0},
    index=pd.Index(test_data['date'].unique(), name='date').sort_values(),
)

unknown_test_data = unknown_test_data.sort_values(by=['date', 'ticker']).reset_index(drop=True)
# Predict on the sorted frame so that every signal belongs to its row. Predictions made
# before the sort are in the old row order and must not be indexed with the new positions.
y_test_pred = grid.predict(unknown_test_data[features])

rows_by_date = {
    day: rows
    for day, rows in (
        unknown_test_data[['date', 'ticker', 'adjusted_close']]
        .assign(signal=y_test_pred)
        .groupby('date')
    )
}

stock_positions = {}
trade_records = []
cash = float(initial_cash)

for date in portfolio.index:
    day_rows = rows_by_date.get(date)
    if day_rows is not None:
        # Index membership only depends on the date, so look it up once per date.
        index_members = get_sp500_tickers_on_date(date, monthly_constituents)

        for row in day_rows.itertuples(index=False):
            stock = row.ticker
            stock_value = row.adjusted_close
            signal = row.signal

            if np.isnan(stock_value) or stock_value <= 0:
                print(f"Missing price data für {stock} an {date}")
                continue

            if signal == 1 and stock in index_members:
                # Spend at most max_stock_share of the current cash; skip the trade when
                # that does not pay for a single share.
                shares_to_buy = (cash * max_stock_share) // stock_value
                if shares_to_buy > 0:
                    cash -= shares_to_buy * stock_value
                    stock_positions[stock] = stock_positions.get(stock, 0) + shares_to_buy
                    trade_records.append({'date': date, 'ticker': stock, 'shares': shares_to_buy, 'action': 'buy', 'price': stock_value})

            elif signal == -1 and stock_positions.get(stock, 0) > 0:  # alternatively also sell on signal 0
                shares_to_sell = stock_positions[stock]
                cash += shares_to_sell * stock_value
                stock_positions[stock] = 0
                trade_records.append({'date': date, 'ticker': stock, 'shares': shares_to_sell, 'action': 'sell', 'price': stock_value})

    # Value the holdings once per date, after all trades of that date.
    prices_today = valuation_prices.loc[date]
    total_stock_value = sum(
        shares * prices_today[ticker]
        for ticker, shares in stock_positions.items()
        if shares > 0 and not np.isnan(prices_today[ticker])
    )
    portfolio.loc[date, 'cash'] = cash
    portfolio.loc[date, 'total_value'] = cash + total_stock_value

# Build the trade log in one go; DataFrame.append was removed in pandas 2.0.
trades = pd.DataFrame(trade_records, columns=['date', 'ticker', 'shares', 'action', 'price'])

start_date = test_data['date'].min().strftime('%Y-%m-%d')
end_date = test_data['date'].max().strftime('%Y-%m-%d')

# auto_adjust=False keeps the 'Adj Close' column regardless of the installed yfinance default.
sp500_data = yf.download('^GSPC', start=start_date, end=end_date, progress=False, auto_adjust=False)['Adj Close']
if isinstance(sp500_data, pd.DataFrame):
    # Some yfinance versions return one column per ticker even for a single symbol.
    sp500_data = sp500_data.iloc[:, 0]
# Benchmark: the same starting capital invested in the index
sp500_data = sp500_data.pct_change().dropna()
sp500_data = (sp500_data + 1).cumprod() * initial_cash

plt.figure(figsize=(12, 6))
plt.plot(portfolio.index, portfolio['total_value'], label='Portfolio')
plt.plot(sp500_data.index, sp500_data, label='S&P 500')
plt.xlabel('Datum')
plt.ylabel('Wert')
plt.title('Portfolio Performance vs. S&P 500')
plt.legend()
plt.show()

In [None]:
# Performance in percent of the starting capital
portfolio['percentage_gain'] = portfolio['total_value'].div(initial_cash).sub(1).mul(100)
sp500_data_percentage = sp500_data.div(initial_cash).sub(1).mul(100)

gain_fig, gain_ax = plt.subplots(figsize=(12, 6))
gain_ax.plot(portfolio.index, portfolio['percentage_gain'], label='Portfolio')
gain_ax.plot(sp500_data.index, sp500_data_percentage, label='S&P 500')
gain_ax.set_xlabel('Datum')
gain_ax.set_ylabel('Gewinn in %')
gain_ax.set_title('Portfolio Performance vs. S&P 500')
gain_ax.legend()
plt.show()


In [None]:
# Outperformance versus the S&P 500 in percentage points (the two series are aligned on date)
portfolio['outperformance'] = portfolio['percentage_gain'].sub(sp500_data_percentage)

# Chart
outperformance_fig, outperformance_ax = plt.subplots(figsize=(12, 6))
outperformance_ax.plot(portfolio.index, portfolio['outperformance'], label='Outperformance')
outperformance_ax.set_xlabel('Datum')
outperformance_ax.set_ylabel('Outperformance in %')
outperformance_ax.set_title('Outperformance des LightGBM Portfolios gegenüber dem S&P 500')
outperformance_ax.legend()
outperformance_ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)  # reference line at 0%
plt.show()


In [None]:
import numpy as np

# The portfolio is valued once per week (Friday rows), the benchmark once per trading day.
# Each series is annualised with its own number of periods per year.
weeks_per_year = 52
trading_days_per_year = 252

portfolio['weekly_returns'] = portfolio['total_value'].pct_change()
sp500_daily_returns = sp500_data.pct_change()

# Sharpe ratio
risk_free_rate = 0.02  # assumed annual risk-free rate of 2%
portfolio_excess_returns = portfolio['weekly_returns'] - risk_free_rate / weeks_per_year
sp500_excess_returns = sp500_daily_returns - risk_free_rate / trading_days_per_year

portfolio_sharpe_ratio = np.sqrt(weeks_per_year) * portfolio_excess_returns.mean() / portfolio_excess_returns.std()
sp500_sharpe_ratio = np.sqrt(trading_days_per_year) * sp500_excess_returns.mean() / sp500_excess_returns.std()

# Chart
plt.figure(figsize=(8, 6))
plt.bar(['Portfolio', 'S&P 500'], [portfolio_sharpe_ratio, sp500_sharpe_ratio])
plt.xlabel('Investment')
plt.ylabel('Sharpe-Ratio')
plt.title('Sharpe-Ratio des Portfolios und des S&P 500')
plt.show()


In [None]:
# Position-weighted portfolio beta over the backtest period.

# Betas per (date, ticker), taken from the rows of the backtest period. reindex keeps the
# portfolio dates even when a date has no beta rows (all-NaN row instead of a KeyError).
beta_data = unknown_test_data.pivot_table(values='betas', index='date', columns='ticker')
beta_data = beta_data.reindex(portfolio.index)

# Keep a 'beta' column on unknown_test_data in which missing betas are replaced by 0.
missing_beta = unknown_test_data['betas'].isna()
for row in unknown_test_data.loc[missing_beta, ['ticker', 'date']].itertuples(index=False):
    print(f"Missing beta data for {row.ticker} on {row.date}")
unknown_test_data['beta'] = unknown_test_data['betas'].fillna(0.0)

# Rebuild the holdings for every date from the trade log. The final contents of
# `stock_positions` only describe the end of the backtest and cannot be used for earlier dates.
if trades.empty:
    holdings = pd.DataFrame(index=portfolio.index)
else:
    shares = trades['shares'].astype(float)
    trade_log = pd.DataFrame({
        'date': pd.to_datetime(trades['date']),
        'ticker': trades['ticker'],
        'signed_shares': shares.where(trades['action'] == 'buy', -shares),
    })
    holdings = (
        trade_log.groupby(['date', 'ticker'])['signed_shares'].sum()
        .unstack(fill_value=0.0)
        .reindex(portfolio.index)
        .fillna(0.0)
        .cumsum()
    )

held_tickers = holdings.columns
# Last known price and beta for dates on which a held ticker has no row of its own.
# A beta that is still unknown after the forward fill counts as 0.
held_prices = price_data.reindex(columns=held_tickers).ffill().reindex(portfolio.index)
held_betas = beta_data.reindex(columns=held_tickers).ffill().fillna(0.0)

position_values = (holdings * held_prices).fillna(0.0)
total_stock_value = position_values.sum(axis=1)
# Dates without any holdings get NaN weights and therefore a weighted beta of 0.
weights = position_values.div(total_stock_value.where(total_stock_value > 0), axis=0)
portfolio['weighted_beta'] = (weights * held_betas).sum(axis=1).fillna(0.0)

# Plot the weighted beta over the investment period
plt.figure(figsize=(12, 6))
plt.plot(portfolio.index, portfolio['weighted_beta'])
plt.xlabel('Date')
plt.ylabel('Gewichtetes Beta')
plt.title('Gewichtetes Beta im Zeitverlauf')
plt.show()

# Average weighted beta over the whole period (dates without holdings count as 0)
total_weighted_beta = portfolio['weighted_beta'].mean()
print(f"Gewichtetes Beta über gesamten Zeitraum der Investition: {total_weighted_beta:.2f}")


In [None]:
# Per-period returns of the portfolio (weekly rows) and of the S&P 500 (daily rows)
portfolio_returns = portfolio['total_value'].pct_change().dropna()
sp500_returns = sp500_data.pct_change().dropna()

# Mean per-period returns
average_portfolio_return = portfolio_returns.mean()
average_sp500_return = sp500_returns.mean()

# The risk-free rate is annual, so both mean returns are annualised before they enter the
# CAPM expression; mixing weekly, daily and annual figures would make alpha meaningless.
annual_portfolio_return = average_portfolio_return * 52
annual_sp500_return = average_sp500_return * 252

# Alpha of the portfolio: return in excess of what its beta exposure explains
risk_free_rate = 0.02
alpha = annual_portfolio_return - (risk_free_rate + total_weighted_beta * (annual_sp500_return - risk_free_rate))

print(f"Alpha des Portfolios: {alpha:.4f}")
