In [126]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import pandas_datareader as pdr
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, roc_curve, auc, RocCurveDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn import svm
from xgboost import XGBClassifier

import backtrader as bt
import joblib
import yfinance as yf

from sklearn.metrics import roc_auc_score



### Task 1:

In [4]:
class CustomCSVData(bt.feeds.GenericCSVData):
    # Define the parameters
    params = (
        ('datetime', 0),    # index of datetime column
        ('open', 1),        # index of open price column
        ('high', 2),        # index of high price column
        ('low', 3),         # index of low price column
        ('close', 4),       # index of close price column
        ('volume', 5),      # index of volume column
        ('openinterest', -1),# index of open interest column
        ('time', -1),       # additional parameter for time
        ('dtformat', '%Y-%m-%d'), # format of datetime
        ('timeframe', bt.TimeFrame.Days), # default timeframe
    )

    # Initialization method
    def __init__(self):
        super(CustomCSVData, self).__init__()

    # Class method for stock data
    @classmethod
    def aapl_reader(cls):
        cls.params.dtformat = '%Y-%m-%d'
        cls.params.datetime = 0
        cls.params.open = 1
        cls.params.high = 2
        cls.params.low = 3
        cls.params.close = 4
        cls.params.volume = 5
        cls.params.openinterest = -1    
        return cls

    # Class method for ERCOTDA data
    @classmethod
    def ercotda_reader(cls):
        cls.params.dtformat = '%Y-%m-%d'
        cls.params.timeframe = bt.TimeFrame.Minutes
        cls.params.datetime = 0
        cls.params.time = 1
        cls.params.close = 2
        
        # Set other columns to -1 as they are not relevant
        cls.params.open = cls.params.high = cls.params.low = cls.params.volume = cls.params.openinterest = -1
        return cls

    # Class method for XSHE data
    @classmethod
    def xshe_reader(cls):
        cls.params.dtformat = '%Y-%m-%d'
        cls.params.datetime = 0
        cls.params.open = 1
        cls.params.close = 2
        cls.params.high = 3
        cls.params.low = 4
        cls.params.volume = 5
        cls.params.money = 6
        cls.params.avg = 7
        cls.params.openinterest = -1
        
        return cls


In [99]:
# Function to handle missing data and normalize
def preprocess_data(file_path, fill_method='ffill', scaler_type='standard'):
    # Load data
    data = pd.read_csv(file_path)
    
    # Filling missing values
    if fill_method == 'ffill':
        data.fillna(method='ffill', inplace=True)  # Forward fill
    elif fill_method == 'bfill':
        data.fillna(method='bfill', inplace=True)  # Backward fill
    else:
        raise ValueError("fill_method must be 'ffill' or 'bfill'")
    
    # Removing outliers - simple method by excluding any data beyond 3 standard deviations
    data = data[(np.abs(stats.zscore(data.select_dtypes(include=[np.number]))) < 3).all(axis=1)]
    
    # Normalizing data
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("scaler_type must be 'standard' or 'minmax'")
    
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
    
    return data


In [101]:
# Function to add features to the DataFrame (as defined previously)
def add_features(df):
    df['daily_return'] = df['Close'].pct_change() * 100
    for window in [10, 20, 50, 200]:
        df[f'moving_avg_{window}'] = df['Close'].rolling(window=window).mean()
    df['ema_20'] = df['Close'].ewm(span=20, adjust=False).mean()
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['rsi_14'] = 100 - (100 / (1 + rs))
    exp1 = df['Close'].ewm(span=12, adjust=False).mean()
    exp2 = df['Close'].ewm(span=26, adjust=False).mean()
    df['macd'] = exp1 - exp2
    df['signal_line'] = df['macd'].ewm(span=9, adjust=False).mean()
    df.dropna(inplace=True)
    return df

### Task 2:

In [28]:

def process(data, fill_method, scaler_type):
    # Fill missing values
    if fill_method == 'forward':
        data.fillna(method='ffill', inplace=True)
    elif fill_method == 'backward':
        data.fillna(method='bfill', inplace=True)


    # Removing outliers
    numeric_cols = data.select_dtypes(include=['number'])
    data = data[(np.abs(stats.zscore(numeric_cols)) < 3).all(axis=1)]

    # Normalizing data
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        raise ValueError("Invalid scaler type. Choose 'standard' or 'minmax'.")

    data[numeric_cols.columns] = scaler.fit_transform(data[numeric_cols.columns])

    # Return processed data
    return data


### Task 3:

In [85]:
tickers_df = pd.read_csv('tickers.csv')

# Extract the list of tickers
tickers = tickers_df['Ticker'].tolist()

# Define the start and end dates for the data collection
start_date = '2000-01-01'
end_date = '2021-11-12'

# Create a dictionary to store the DataFrame for each ticker
stock_dataframes = {}

# Fetch and store the data for each ticker in its own DataFrame
for ticker in tickers:
    df = yf.download(ticker, start=start_date, end=end_date)
    stock_dataframes[ticker] = df

# Now, each ticker's data is stored in its own DataFrame within the stock_dataframes dictionary.

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [80]:
def new_features(df):
    df['daily_return'] = df['Close'].pct_change() * 100
    for window in [10, 20, 50, 200]:
        df[f'moving_avg_{window}'] = df['Close'].rolling(window=window).mean()
    df['ema_20'] = df['Close'].ewm(span=20, adjust=False).mean()
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['rsi_14'] = 100 - (100 / (1 + rs))
    exp1 = df['Close'].ewm(span=12, adjust=False).mean()
    exp2 = df['Close'].ewm(span=26, adjust=False).mean()
    df['macd'] = exp1 - exp2
    df['signal_line'] = df['macd'].ewm(span=9, adjust=False).mean()
    df.dropna(inplace=True)
    return df

In [109]:
# Concatenate all DataFrames in the dictionary
all_tickers_df = pd.concat(stock_dataframes.values())
all_tickers_df.reset_index(drop=False, inplace=True)

# Ensure that 'Date' column is a datetime type
all_tickers_df['Date'] = pd.to_datetime(all_tickers_df['Date'])
# Set 'Date' as the index of the DataFrame
all_tickers_df.set_index('Date', inplace=True)
# Apply the feature engineering function to the concatenated DataFrame
all_tickers_df = new_features(all_tickers_df)
all_tickers_df['target'] = (all_tickers_df['daily_return'] > 0).astype(int)

In [107]:
display(all_tickers_df)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,daily_return,moving_avg_10,moving_avg_20,moving_avg_50,moving_avg_200,ema_20,rsi_14,macd,signal_line,target
199,2000-10-16,2.937500,3.125000,2.937500,3.000000,2.421956,43200,2.127660,3.61875,4.384375,4.60625,5.942500,4.077878,5.128205,-0.523280,-0.322303,1
200,2000-10-17,3.062500,3.125000,3.000000,3.125000,2.522873,17500,4.166667,3.48125,4.275000,4.58625,5.922813,3.987128,10.000000,-0.531362,-0.364115,1
201,2000-10-18,3.125000,3.125000,2.937500,2.937500,2.371500,34500,-6.000000,3.34375,4.156250,4.56625,5.902812,3.887163,7.142857,-0.546597,-0.400612,0
202,2000-10-19,2.937500,3.000000,2.875000,3.000000,2.421956,15000,2.127660,3.23125,4.053125,4.55000,5.882812,3.802671,9.302326,-0.547318,-0.429953,1
203,2000-10-20,2.937500,3.000000,2.937500,3.000000,2.421956,8300,0.000000,3.15000,3.950000,4.53375,5.862813,3.726227,10.526316,-0.541646,-0.452291,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48716,2021-11-05,49.900002,50.279999,48.860001,50.200001,44.331879,8701700,1.393663,48.27800,48.375500,48.68980,50.455550,48.428039,62.396679,0.173834,-0.044430,1
48717,2021-11-08,50.279999,50.980000,49.720001,49.900002,44.066948,6720900,-0.597608,48.34600,48.499500,48.71820,50.465950,48.568226,63.179935,0.266625,0.017781,0
48718,2021-11-09,49.950001,50.070000,49.020000,49.099998,43.360466,4719200,-1.603212,48.42500,48.583500,48.72760,50.465500,48.618871,54.677198,0.272467,0.068718,0
48719,2021-11-10,49.169998,49.740002,48.880001,48.970001,43.245659,4537400,-0.264760,48.61000,48.669000,48.69200,50.454450,48.652312,48.780501,0.263570,0.107688,0


### Task 4 & 5:

In [96]:
# Prepare the features and target for model training
X = all_tickers_df[['moving_avg_10', 'moving_avg_20', 'moving_avg_50', 'ema_20', 'rsi_14', 'macd', 'signal_line']].values
y = (all_tickers_df['daily_return'] > 0).astype(int).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
#standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)

joblib.dump(scaler, 'combined_scaler.pkl')
joblib.dump(xgb_model, 'combined_xgb_model.pkl')


xgb_predictions = xgb_model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, xgb_predictions)
print(f"Model Accuracy: {accuracy:.4f}")

# Calculate precision
precision = precision_score(y_test, xgb_predictions)
print(f"Model Precision: {precision:.4f}")

# Make predictions on the test set
xgb_predictions = xgb_model.predict(X_test_scaled)

# Calculate the probabilities for the positive class
xgb_probs = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Calculate ROC curve and ROC AUC
fpr, tpr, thresholds = roc_curve(y_test, xgb_probs)
auc = roc_auc_score(y_test, xgb_probs)

print(f"Model AUC: {auc:.4f}")



Model Accuracy: 0.5913
Model Precision: 0.5901
Model AUC: 0.6241


In [97]:
# Load the scaler and model for demonstration
scaler = joblib.load('combined_scaler.pkl')
xgb_model = joblib.load('combined_xgb_model.pkl')

# Define the Backtrader strategy using the trained XGBoost model
class XGBStrategy_combined(bt.Strategy):
    params = (('model', xgb_model), ('scaler', scaler),)
    
    def __init__(self):
        # Define indicators as part of the strategy
        self.sma10 = bt.indicators.SimpleMovingAverage(self.datas[0], period=10)
        self.sma20 = bt.indicators.SimpleMovingAverage(self.datas[0], period=20)
        self.sma50 = bt.indicators.SimpleMovingAverage(self.datas[0], period=50)
        self.ema20 = bt.indicators.ExponentialMovingAverage(self.datas[0], period=20)
        self.rsi = bt.indicators.RelativeStrengthIndex(self.datas[0], period=14)
        self.macd = bt.indicators.MACD(self.datas[0])
        self.signal_line = self.macd.signal

        # To keep track of pending orders
        self.order = None

    def next(self):
        # Skip the first lookback period
        if len(self) < 50:
            return

        # Prepare the features for the current bar in the same way we did for training
        features = np.array([
            (self.datas[0].close[0] - self.sma10[0]) / self.sma10[0],
            (self.datas[0].close[0] - self.sma20[0]) / self.sma20[0],
            (self.datas[0].close[0] - self.sma50[0]) / self.sma50[0],
            (self.datas[0].close[0] - self.ema20[0]) / self.ema20[0],
            self.rsi[0],
            self.macd.macd[0],  # MACD line value
            self.macd.signal[0],  # Signal line value
        ]).reshape(1, -1)  # Reshape for a single sample

        # Scale the features
        features_scaled = self.params.scaler.transform(features)

        # Make the prediction
        prediction = self.params.model.predict(features_scaled)

        # Execute orders based on the prediction
        if prediction > 0.5 and not self.position:
            self.buy()
        elif prediction < 0.5 and self.position:

            self.sell()

In [129]:
%matplotlib inline
# Instantiate Cerebro engine
cerebro = bt.Cerebro()

# Create a data feed from the pandas dataframe and add it to Cerebro
data_feed = bt.feeds.PandasData(dataname=all_tickers_df)
cerebro.adddata(data_feed)

# Add the strategy to Cerebro
cerebro.addstrategy(XGBStrategy_combined)

# Set the initial cash
initial_cash = 100000.0
cerebro.broker.setcash(initial_cash)

# Set the commission if any
cerebro.broker.setcommission(commission=0.001)
plt.rcParams['figure.figsize'] = [20, 10]
cerebro.addsizer(bt.sizers.FixedSize, stake=1000)

# Add analyzers for Sharpe Ratio and Drawdown
cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe_ratio')
cerebro.addanalyzer(bt.analyzers.DrawDown, _name='draw_down')


results = cerebro.run()
cerebro.plot()

# Retrieve the analyzers
sharpe_ratio_analyzer = results[0].analyzers.sharpe_ratio.get_analysis()
drawdown_analyzer = results[0].analyzers.draw_down.get_analysis()

final_value = cerebro.broker.getvalue()
# Calculate the net return
net_return = (final_value - initial_cash) / initial_cash * 100

# Print the results
print(f'Initial Portfolio Value: ${initial_cash:,.2f}')
print(f'Final Portfolio Value: ${final_value:,.2f}')
print(f'Net Return: {net_return:.2f}%')

# Print the results
print(f"Sharpe Ratio: {sharpe_ratio_analyzer['sharperatio']}")
print(f"Max Drawdown: {drawdown_analyzer['max']['drawdown']}%")
print(f"Max Drawdown Length: {drawdown_analyzer['max']['len']}")

<IPython.core.display.Javascript object>

Initial Portfolio Value: $100,000.00
Final Portfolio Value: $25,880.25
Net Return: -74.12%
Sharpe Ratio: -0.3026688973279419
Max Drawdown: 84.7900587639801%
Max Drawdown Length: 23521


In [None]:
import pandas as pd
import os

In [124]:
data_dir = './stock_dfs/'
stock_data = {}

for file_name in os.listdir(data_dir):
    if file_name.endswith('.csv'):
        ticker = file_name.split('.')[0]
        # df = pd.read_csv(os.path.join(data_dir, file_name))
        print(os.path.join(data_dir, file_name))
        stock_data[ticker] = preprocess_data(os.path.join(data_dir, file_name), 'bfill', 'standard')

def add_features(df):
    df['daily_return'] = df['close'].pct_change() * 100
    for window in [10, 20, 50, 200]:
        df[f'moving_avg_{window}'] = df['close'].rolling(window=window).mean()
    df['ema_20'] = df['close'].ewm(span=20, adjust=False).mean()
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['rsi_14'] = 100 - (100 / (1 + rs))
    exp1 = df['close'].ewm(span=12, adjust=False).mean()
    exp2 = df['close'].ewm(span=26, adjust=False).mean()
    df['macd'] = exp1 - exp2
    df['signal_line'] = df['macd'].ewm(span=9, adjust=False).mean()
    df.dropna(inplace=True)
    return df

for ticker, df in stock_data.items():
    stock_data[ticker] = add_features(df)


./stock_dfs/DAL.csv
./stock_dfs/DE.csv
./stock_dfs/DFS.csv
./stock_dfs/DG.csv
./stock_dfs/DGX.csv
./stock_dfs/DHI.csv
./stock_dfs/DHR.csv
./stock_dfs/DIS.csv
./stock_dfs/DISCA.csv
./stock_dfs/DISCK.csv
./stock_dfs/DISH.csv
./stock_dfs/DLR.csv
./stock_dfs/DLTR.csv
./stock_dfs/DOV.csv
./stock_dfs/DRE.csv
./stock_dfs/DRI.csv
./stock_dfs/DTE.csv
./stock_dfs/DUK.csv
./stock_dfs/DVA.csv
./stock_dfs/DVN.csv
./stock_dfs/DWDP.csv
./stock_dfs/DXC.csv
./stock_dfs/EA.csv
./stock_dfs/EBAY.csv
./stock_dfs/ECL.csv
./stock_dfs/ED.csv
./stock_dfs/EFX.csv
./stock_dfs/EIX.csv
./stock_dfs/EL.csv
./stock_dfs/EMN.csv
./stock_dfs/EMR.csv
./stock_dfs/EOG.csv
./stock_dfs/EQIX.csv
./stock_dfs/EQR.csv
./stock_dfs/EQT.csv
./stock_dfs/ES.csv
./stock_dfs/ESRX.csv
./stock_dfs/ESS.csv
./stock_dfs/ETFC.csv
./stock_dfs/ETN.csv
./stock_dfs/ETR.csv
./stock_dfs/EVRG.csv
./stock_dfs/EW.csv
./stock_dfs/EXC.csv
./stock_dfs/EXPD.csv
./stock_dfs/EXPE.csv
./stock_dfs/EXR.csv
./stock_dfs/F.csv
./stock_dfs/FAST.csv
./stock_dfs/FB

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.metrics import precision_score
import backtrader as bt
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.metrics import accuracy_score, precision_score, roc_curve, auc, RocCurveDisplay


sharpe_ratios = {} 
draw_down_ratios = {}

for ticker, df in stock_data.items():
    print(ticker)
    def evaluate_model_performance(y_true, y_pred):
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        return accuracy, precision

    def evaluate_performance(y_test, predictions, model_name):
        # Calculate accuracy and precision
        accuracy = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions)
        # Calculate the ROC curve and AUC
        fpr, tpr, _ = roc_curve(y_test, predictions)
        roc_auc = auc(fpr, tpr)
        # Display the ROC curve
        # RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=model_name).plot()
        # Print the evaluation metrics
        # print(f"{model_name} Accuracy: {accuracy:.4f}")
        # print(f"{model_name} Precision: {precision:.4f}")
        # print(f"{model_name} ROC AUC: {roc_auc:.4f}")

    df['date'] = pd.to_datetime(df['date'])
    # Set 'Date' as the index of the DataFrame
    df.set_index('date', inplace=True)
    # stock_data['EBAY']
    df.sort_index(inplace=True)
    # Create a binary classification target: 1 if the price will rise the next day, 0 otherwise
    df['target'] = (df['daily_return'] > 0).astype(int)
    # Select features and the target
    X = df[['moving_avg_10', 'moving_avg_20', 'moving_avg_50', 'ema_20', 'rsi_14', 'macd', 'signal_line']]
    y = df['target']
    # Split the data into a training set and a testing set
    
    
    X_train, X_test, y_train, y_test

    min_samples_for_split = 42  # You can adjust this threshold as needed

    if len(X) >= min_samples_for_split:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
    else:
        # Handle the case where there's not enough data
        # You might choose to skip this dataset or adjust the test size
        print(f"Not enough data for ticker {ticker}. Only {len(X)} samples available.")
        continue  # Skip to the next iteration of the loop

    # Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Train an SVM model
    svm_model = svm.SVC()
    svm_model.fit(X_train_scaled, y_train)
    # Make predictions with the SVM model
    svm_predictions = svm_model.predict(X_test_scaled)
    # Evaluate the SVM model
    svm_accuracy = accuracy_score(y_test, svm_predictions)
    svm_class_report = classification_report(y_test, svm_predictions)
    # Assuming y_test and predictions are already defined
    svm_precision = precision_score(y_test, svm_predictions)
    # print(f"SVM Accuracy: {svm_accuracy:.4f}, SVM Precision: {svm_precision:.4f}")
    # Evaluate SVM performance
    evaluate_performance(y_test, svm_predictions, 'SVM')
    # Here we prepare the features (X) and the target (y) for model training
    X = df[['moving_avg_10', 'moving_avg_20', 'moving_avg_50', 'ema_20', 'rsi_14', 'macd', 'signal_line']].values
    y = (df['daily_return'] > 0).astype(int).values  # Target: 1 if return > 0, else 0
    # Split the data into training and testing sets
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Train the SVM model
    svm_model = svm.SVC()
    svm_model.fit(X_train_scaled, y_train)
    # Save the scaler and model (do this in your local environment)
    joblib.dump(scaler, 'a_scaler.pkl')
    joblib.dump(svm_model, 'a_model.pkl')
    # Load the scaler and model (for demonstration; in practice, use the saved files)
    scaler = joblib.load('a_scaler.pkl')
    svm_model = joblib.load('a_model.pkl')
    # Define the Backtrader strategy using the trained SVM model
    class svm_a(bt.Strategy):
        params = (('model', svm_model), ('scaler', scaler),)
        
        def __init__(self):
            # Define indicators as part of the strategy
            self.sma10 = bt.indicators.SimpleMovingAverage(self.datas[0], period=10)
            self.sma20 = bt.indicators.SimpleMovingAverage(self.datas[0], period=20)
            self.sma50 = bt.indicators.SimpleMovingAverage(self.datas[0], period=50)
            self.ema20 = bt.indicators.ExponentialMovingAverage(self.datas[0], period=20)
            self.rsi = bt.indicators.RelativeStrengthIndex(self.datas[0], period=14)
            self.macd = bt.indicators.MACD(self.datas[0])
            self.signal_line = self.macd.signal

            # To keep track of pending orders
            self.order = None

        def next(self):
            # Skip the first lookback period
            if len(self) < 50:
                return

            # Prepare the features for the current bar in the same way we did for training
            features = np.array([
                (self.datas[0].close[0] - self.sma10[0]) / self.sma10[0],
                (self.datas[0].close[0] - self.sma20[0]) / self.sma20[0],
                (self.datas[0].close[0] - self.sma50[0]) / self.sma50[0],
                (self.datas[0].close[0] - self.ema20[0]) / self.ema20[0],
                self.rsi[0],
                self.macd.macd[0],  # MACD line value
                self.macd.signal[0],  # Signal line value
            ]).reshape(1, -1)  # Reshape for a single sample

            # Scale the features
            features_scaled = self.params.scaler.transform(features)

            # Make the prediction
            prediction = self.params.model.predict(features_scaled)

            # Execute orders based on the prediction
            if prediction > 0.5 and not self.position:
                self.buy()
            elif prediction < 0.5 and self.position:
                self.sell()



    # Instantiate Cerebro engine
    cerebro = bt.Cerebro()
    # Create a data feed from the pandas dataframe and add it to Cerebro
    data_feed = bt.feeds.PandasData(dataname=df)
    cerebro.adddata(data_feed)
    # Add the strategy to Cerebro
    cerebro.addstrategy(svm_a)
    # Set the initial cash
    initial_cash = 100000.0
    cerebro.broker.setcash(initial_cash)
    # Set the commission if any
    cerebro.broker.setcommission(commission=0.001)
    plt.rcParams['figure.figsize'] = [20, 10]
    cerebro.addsizer(bt.sizers.FixedSize, stake=1000)
    # Add analyzers for Sharpe Ratio and Drawdown
    cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe_ratio')
    cerebro.addanalyzer(bt.analyzers.DrawDown, _name='draw_down')
    # Run the strategy
    cerebro.run()
    # cerebro.plot()
    results = cerebro.run()
    # Retrieve the analyzers
    sharpe_ratio_analyzer = results[0].analyzers.sharpe_ratio.get_analysis()
    drawdown_analyzer = results[0].analyzers.draw_down.get_analysis()
    final_value = cerebro.broker.getvalue()
    # Calculate the net return
    net_return = (final_value - initial_cash) / initial_cash * 100



    # Store the Sharpe Ratio in the dictionary
    sharpe_ratios[ticker] = sharpe_ratio_analyzer['sharperatio']
    draw_down_ratios[ticker] = drawdown_analyzer['max']['drawdown']
    

    # # Print the results
    # print(f'Initial Portfolio Value: ${initial_cash:,.2f}')
    # print(f'Final Portfolio Value: ${final_value:,.2f}')
    # print(f'Net Return: {net_return:.2f}%')
    # # Print the results
    # print(f"Sharpe Ratio: {sharpe_ratio_analyzer['sharperatio']}")
    # print(f"Max Drawdown: {drawdown_analyzer['max']['drawdown']}%")
    # print(f"Max Drawdown Length: {drawdown_analyzer['max']['len']}")

DAL
DE
DFS
DG
DGX
DHI
DHR
DIS
DISCA
DISCK
DISH
DLR
DLTR
DOV
DRE
DRI
DTE
DUK
DVA
DVN
DWDP
DXC
EA
EBAY
ECL
ED
EFX
EIX
EL
EMN
EMR
EOG
EQIX
EQR
EQT
ES
ESRX
ESS
ETFC
ETN
ETR
EVRG
Not enough data for ticker EVRG. Only 0 samples available.
EW
EXC
EXPD
EXPE
EXR
F
FAST
FB
FBHS
FCX
FDX
FE
FFIV
FIS
FISV
FITB
FL
FLIR
FLR
FLS
FLT
FMC
FOX
FOXA
FRT
FTI
FTNT
FTV
GD
GE
GILD
GIS
GLW
GM
GOOG
GOOGL
GPC
GPN
GPS
GRMN
GS
GT
GWW
HAL
HAS
HBAN
HBI
HCA
HCP
HD
HES
HFC
HIG
HII
HLT
HOG
HOLX
HON
HP
HPE
HPQ
HRB
HRL
HRS
HSIC
HST
HSY
HUM
IBM
ICE
IDXX
IFF
ILMN
INCY
INFO
INTC
INTU
IP
IPG
IPGP
IQV
IR
IRM
ISRG
IT
ITW
IVZ
JBHT
JCI
JEC
JEF
JNJ
JNPR
JPM
JWN
K
KEY
KHC
KIM
KLAC
KMB
KMI
KMX
KO
KORS
KR
KSS
KSU
L
LB
LEG
LEN
LH
LIN
Not enough data for ticker LIN. Only 0 samples available.
LKQ
LLL
LLY
LMT
LNC
LNT
LOW
LRCX
LUV
LYB
M
MA
MAA
MAC
MAR
MAS
MAT
MCD
MCHP
MCK
MCO
MDLZ
MDT
MET
MGM
MHK
MKC
MLM
MMC
MMM
MNST
MO
MOS
MPC
MRK
MRO
MS
MSCI
MSFT
MSI
MTB
MTD
MU
MYL
NBL
NCLH
NDAQ
NEE
NEM
NFLX
NFX
NI
NKE
NKTR
NLSN
NOC
NOV
NRG
NSC
NTAP
NT

In [115]:
# Filter out entries where Sharpe Ratio is None
filtered_sharpe_ratios = {ticker: ratio for ticker, ratio in sharpe_ratios.items() if ratio is not None}

# Sort the filtered dictionary by Sharpe Ratio values in descending order
sorted_sharpe_ratios = sorted(filtered_sharpe_ratios.items(), key=lambda x: x[1], reverse=True)

# Get the top 10 stocks with the highest Sharpe Ratios
top_10_sharpe_ratios = sorted_sharpe_ratios[:10]

# Print the top 10 stocks with their Sharpe Ratios
print("Top 10 Stocks by Sharpe Ratio:")
for ticker, sharpe_ratio in top_10_sharpe_ratios:
    print(f"Ticker: {ticker}, Sharpe Ratio: {sharpe_ratio}")


Top 10 Stocks by Sharpe Ratio:
Ticker: SYF, Sharpe Ratio: 0.2041691287526498
Ticker: HSY, Sharpe Ratio: 0.009250157970300545
Ticker: ORCL, Sharpe Ratio: -0.06709399721612418
Ticker: ILMN, Sharpe Ratio: -0.1028269086941033
Ticker: PX, Sharpe Ratio: -0.17024492465525812
Ticker: GM, Sharpe Ratio: -0.17743098680666766
Ticker: PG, Sharpe Ratio: -0.19477687720277076
Ticker: UAL, Sharpe Ratio: -0.2089494859093226
Ticker: VFC, Sharpe Ratio: -0.21358750998461273
Ticker: VZ, Sharpe Ratio: -0.2855441966175649


In [116]:
#Sort the filtered dictionary by Max Draw Down values in descending order
sorted_dd_ratios = sorted(draw_down_ratios.items(), key=lambda x: x[1], reverse=True)

# Get the top 10 stocks with the highest Sharpe Ratios
top_10_dd_ratios = sorted_dd_ratios[:10]

# Print the top 10 stocks with their Sharpe Ratios
print("Top 10 Stocks by Max Draw Down Ratio:")
for ticker, dd_ratio in top_10_dd_ratios:
    print(f"Ticker: {ticker}, Sharpe Ratio: {dd_ratio}")

Top 10 Stocks by Max Draw Down Ratio:
Ticker: UHS, Sharpe Ratio: 6.506938820833079
Ticker: F, Sharpe Ratio: 6.20161359117406
Ticker: PSA, Sharpe Ratio: 5.537210135763775
Ticker: SBUX, Sharpe Ratio: 5.3708881968896485
Ticker: OMC, Sharpe Ratio: 4.679513882816147
Ticker: FL, Sharpe Ratio: 4.661836753518724
Ticker: NLSN, Sharpe Ratio: 4.659732765558306
Ticker: IPG, Sharpe Ratio: 4.6516896486503265
Ticker: PPL, Sharpe Ratio: 4.60958215006288
Ticker: WHR, Sharpe Ratio: 4.542365759624178


In [125]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.metrics import precision_score
import backtrader as bt
import pandas as pd
from sklearn import svm
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.metrics import accuracy_score, precision_score, roc_curve, auc, RocCurveDisplay


sharpe_ratios = {} 
draw_down_ratios = {}

for ticker, df in stock_data.items():
    print(ticker)
    def evaluate_model_performance(y_true, y_pred):
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        return accuracy, precision

    def evaluate_performance(y_test, predictions, model_name):
        # Calculate accuracy and precision
        accuracy = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions)
        # Calculate the ROC curve and AUC
        fpr, tpr, _ = roc_curve(y_test, predictions)
        roc_auc = auc(fpr, tpr)
        # Display the ROC curve
        # RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=model_name).plot()
        # Print the evaluation metrics
        # print(f"{model_name} Accuracy: {accuracy:.4f}")
        # print(f"{model_name} Precision: {precision:.4f}")
        # print(f"{model_name} ROC AUC: {roc_auc:.4f}")

    df['date'] = pd.to_datetime(df['date'])
    # Set 'Date' as the index of the DataFrame
    df.set_index('date', inplace=True)
    # stock_data['EBAY']
    df.sort_index(inplace=True)
    # Create a binary classification target: 1 if the price will rise the next day, 0 otherwise
    df['target'] = (df['daily_return'] > 0).astype(int)
    # Select features and the target
    X = df[['moving_avg_10', 'moving_avg_20', 'moving_avg_50', 'ema_20', 'rsi_14', 'macd', 'signal_line']]
    y = df['target']
    # Split the data into a training set and a testing set
    
    
    X_train, X_test, y_train, y_test

    min_samples_for_split = 42  # You can adjust this threshold as needed

    if len(X) >= min_samples_for_split:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
    else:
        # Handle the case where there's not enough data
        # You might choose to skip this dataset or adjust the test size
        print(f"Not enough data for ticker {ticker}. Only {len(X)} samples available.")
        continue  # Skip to the next iteration of the loop

    # Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Train the XGBoost model
    xgb_model = XGBClassifier()
    xgb_model.fit(X_train_scaled, y_train)

    # Make predictions with the XGBoost model
    xgb_predictions = xgb_model.predict(X_test_scaled)
    # Evaluate the XGBoost model
    xgb_accuracy = accuracy_score(y_test, xgb_predictions)
    xgb_precision = precision_score(y_test, xgb_predictions)
   
    # Evaluate XGBoost performance
    evaluate_performance(y_test, xgb_predictions, 'XGBoost')

    # Here we prepare the features (X) and the target (y) for model training
    X = df[['moving_avg_10', 'moving_avg_20', 'moving_avg_50', 'ema_20', 'rsi_14', 'macd', 'signal_line']].values
    y = (df['daily_return'] > 0).astype(int).values  # Target: 1 if return > 0, else 0
    # Split the data into training and testing sets
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Standardize the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save the scaler and XGBoost model
    joblib.dump(scaler, 'xgb_scaler.pkl')
    joblib.dump(xgb_model, 'xgb_model.pkl')
    # Load the scaler and model for the strategy
    scaler = joblib.load('xgb_scaler.pkl')
    xgb_model = joblib.load('xgb_model.pkl')
    # Define the Backtrader strategy using the trained SVM model
    class XGBStrategy(bt.Strategy):
        params = (('model', xgb_model), ('scaler', scaler),)
        
        def __init__(self):
            # Define indicators as part of the strategy
            self.sma10 = bt.indicators.SimpleMovingAverage(self.datas[0], period=10)
            self.sma20 = bt.indicators.SimpleMovingAverage(self.datas[0], period=20)
            self.sma50 = bt.indicators.SimpleMovingAverage(self.datas[0], period=50)
            self.ema20 = bt.indicators.ExponentialMovingAverage(self.datas[0], period=20)
            self.rsi = bt.indicators.RelativeStrengthIndex(self.datas[0], period=14)
            self.macd = bt.indicators.MACD(self.datas[0])
            self.signal_line = self.macd.signal

            # To keep track of pending orders
            self.order = None

        def next(self):
            # Skip the first lookback period
            if len(self) < 50:
                return

            # Prepare the features for the current bar in the same way we did for training
            features = np.array([
                (self.datas[0].close[0] - self.sma10[0]) / self.sma10[0],
                (self.datas[0].close[0] - self.sma20[0]) / self.sma20[0],
                (self.datas[0].close[0] - self.sma50[0]) / self.sma50[0],
                (self.datas[0].close[0] - self.ema20[0]) / self.ema20[0],
                self.rsi[0],
                self.macd.macd[0],  # MACD line value
                self.macd.signal[0],  # Signal line value
            ]).reshape(1, -1)  # Reshape for a single sample

            # Scale the features
            features_scaled = self.params.scaler.transform(features)

            # Make the prediction
            prediction = self.params.model.predict(features_scaled)

            # Execute orders based on the prediction
            if prediction > 0.5 and not self.position:
                self.buy()
            elif prediction < 0.5 and self.position:
                self.sell()



    # Instantiate Cerebro engine
    cerebro = bt.Cerebro()
    # Create a data feed from the pandas dataframe and add it to Cerebro
    data_feed = bt.feeds.PandasData(dataname=df)
    cerebro.adddata(data_feed)
    # Add the strategy to Cerebro
    cerebro.addstrategy(XGBStrategy)
    # Set the initial cash
    initial_cash = 100000.0
    cerebro.broker.setcash(initial_cash)
    # Set the commission if any
    cerebro.broker.setcommission(commission=0.001)
    plt.rcParams['figure.figsize'] = [20, 10]
    cerebro.addsizer(bt.sizers.FixedSize, stake=1000)
    # Add analyzers for Sharpe Ratio and Drawdown
    cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe_ratio')
    cerebro.addanalyzer(bt.analyzers.DrawDown, _name='draw_down')
    # Run the strategy
    cerebro.run()
    # cerebro.plot()
    results = cerebro.run()
    # Retrieve the analyzers
    sharpe_ratio_analyzer = results[0].analyzers.sharpe_ratio.get_analysis()
    drawdown_analyzer = results[0].analyzers.draw_down.get_analysis()
    final_value = cerebro.broker.getvalue()
    # Calculate the net return
    net_return = (final_value - initial_cash) / initial_cash * 100



    # Store the Sharpe Ratio in the dictionary
    sharpe_ratios[ticker] = sharpe_ratio_analyzer['sharperatio']
    draw_down_ratios[ticker] = drawdown_analyzer['max']['drawdown']
    

    # # Print the results
    # print(f'Initial Portfolio Value: ${initial_cash:,.2f}')
    # print(f'Final Portfolio Value: ${final_value:,.2f}')
    # print(f'Net Return: {net_return:.2f}%')
    # # Print the results
    # print(f"Sharpe Ratio: {sharpe_ratio_analyzer['sharperatio']}")
    # print(f"Max Drawdown: {drawdown_analyzer['max']['drawdown']}%")
    # print(f"Max Drawdown Length: {drawdown_analyzer['max']['len']}")

DAL
DE
DFS
DG
DGX
DHI
DHR
DIS
DISCA
DISCK
DISH
DLR
DLTR
DOV
DRE
DRI
DTE
DUK
DVA
DVN
DWDP
DXC
EA
EBAY
ECL
ED
EFX
EIX
EL
EMN
EMR
EOG
EQIX
EQR
EQT
ES
ESRX
ESS
ETFC
ETN
ETR
EVRG
Not enough data for ticker EVRG. Only 0 samples available.
EW
EXC
EXPD
EXPE
EXR
F
FAST
FB
FBHS
FCX
FDX
FE
FFIV
FIS
FISV
FITB
FL
FLIR
FLR
FLS
FLT
FMC
FOX
FOXA
FRT
FTI
FTNT
FTV
GD
GE
GILD
GIS
GLW
GM
GOOG
GOOGL
GPC
GPN
GPS
GRMN
GS
GT
GWW
HAL
HAS
HBAN
HBI
HCA
HCP
HD
HES
HFC
HIG
HII
HLT
HOG
HOLX
HON
HP
HPE
HPQ
HRB
HRL
HRS
HSIC
HST
HSY
HUM
IBM
ICE
IDXX
IFF
ILMN
INCY
INFO
INTC
INTU
IP
IPG
IPGP
IQV
IR
IRM
ISRG
IT
ITW
IVZ
JBHT
JCI
JEC
JEF
JNJ
JNPR
JPM
JWN
K
KEY
KHC
KIM
KLAC
KMB
KMI
KMX
KO
KORS
KR
KSS
KSU
L
LB
LEG
LEN
LH
LIN
Not enough data for ticker LIN. Only 0 samples available.
LKQ
LLL
LLY
LMT
LNC
LNT
LOW
LRCX
LUV
LYB
M
MA
MAA
MAC
MAR
MAS
MAT
MCD
MCHP
MCK
MCO
MDLZ
MDT
MET
MGM
MHK
MKC
MLM
MMC
MMM
MNST
MO
MOS
MPC
MRK
MRO
MS
MSCI
MSFT
MSI
MTB
MTD
MU
MYL
NBL
NCLH
NDAQ
NEE
NEM
NFLX
NFX
NI
NKE
NKTR
NLSN
NOC
NOV
NRG
NSC
NTAP
NT