1. Python packages

In [1]:
from data.data_handler import DataHandler
import numpy as np
import matplotlib.pyplot as plt

2. Load data

In [2]:
# Get the dataset
dh = DataHandler()
df = dh.get_stock_dataset(stock_ticket="PFE")
df = df.sort_values(by=['Date'])

In [3]:
# Change the index to the date
df.index = df['Date']
df = df.drop(['Date'], axis=1)
df.drop(df.tail(1).index,inplace=True)
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1972-06-01,0.000000,0.815346,0.802993,0.815346,0.174965,2458771,-1
1972-06-02,0.815346,0.817817,0.802993,0.805463,0.172844,1613885,-1
1972-06-05,0.805463,0.807934,0.798051,0.802993,0.172314,2585251,-1
1972-06-06,0.802993,0.825229,0.800522,0.820288,0.176026,2347469,-1
1972-06-07,0.820288,0.820288,0.807934,0.820288,0.176026,1032077,-1
...,...,...,...,...,...,...,...
2023-04-21,40.090000,40.299999,39.910000,40.209999,40.209999,19220900,-1
2023-04-24,40.189999,40.200001,39.709999,39.910000,39.910000,17633700,-1
2023-04-25,39.750000,39.919998,39.279999,39.330002,39.330002,24492400,-1
2023-04-26,39.160000,39.189999,38.400002,38.630001,38.630001,22401400,1


In [4]:
# Create the target columns
df['Open-Close'] = df.Open - df.Close
df['High-Low'] = df.High - df.Low

X = df[['Open-Close', 'High-Low']].to_numpy()
X

array([[-0.815346  ,  0.012353  ],
       [ 0.00988299,  0.01482397],
       [ 0.00247002,  0.00988299],
       ...,
       [ 0.41999817,  0.63999939],
       [ 0.52999878,  0.7899971 ],
       [-0.12000275,  0.52000046]])

In [5]:
# Target variables
y = np.where(df['Close'].shift(-1) > df['Close'], 1, 0)
y


array([0, 0, 1, ..., 0, 1, 0])

3. Split data

In [6]:
# Split the data set into train and test sets
split_percentage = 0.8
split = int(split_percentage*len(df))

# Train data set
X_train = X[:split]
y_train = y[:split]

# Test data set
X_test = X[split:]
y_test = y[split:]

print('x_train.shape = ', X_train.shape)
print('y_train.shape = ', y_train.shape)
print('x_test.shape = ', X_test.shape)
print('y_test.shape = ', y_test.shape)

x_train.shape =  (10269, 2)
y_train.shape =  (10269,)
x_test.shape =  (2568, 2)
y_test.shape =  (2568,)


4. Normalize data

In [7]:
# Implement a min-max scaler
class MinMaxScaler:
    def __init__(self, feature_range=(0, 1)):
        self.feature_range = feature_range

    def fit(self, X):
        self.min_ = np.min(X, axis=0)
        self.max_ = np.max(X, axis=0)
        return self

    def transform(self, X):
        X_std = (X - self.min_) / (self.max_ - self.min_)
        X_scaled = X_std * (self.feature_range[1] - self.feature_range[0]) + self.feature_range[0]
        return X_scaled

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)
    
    def inverse_transform(self, X):
        X_std = (X - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        X_scaled = X_std * (self.max_ - self.min_) + self.min_
        return X_scaled

# Normalize the dataset
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

5. Build model

In [8]:
# Implement the SVM class
class SVM:
    def __init__(self, learning_rate=0.01, lambda_param=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.num_iterations = num_iterations
    
    def fit(self, X, y):
        # Initialize parameters
        m, n = X.shape
        self.w = np.zeros(n)
        self.b = 0
        
        # Gradient descent
        for iteration in range(self.num_iterations):
            # Compute gradient
            margin = y * (np.dot(X, self.w) + self.b)
            misclassified_indices = np.where(margin < 1)[0]
            d_w = self.w - self.lambda_param * np.sum(y[misclassified_indices].reshape(-1, 1) * X[misclassified_indices], axis=0)
            d_b = - self.lambda_param * np.sum(y[misclassified_indices])
            
            # Update parameters
            self.w -= self.learning_rate * d_w
            self.b -= self.learning_rate * d_b
    
    def predict(self, X):
        return np.sign(np.dot(X, self.w) + self.b)


In [9]:
# Train the SVM classifier
svm = SVM()
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Compute accuracy
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy}')
print(y_pred, y_test)

Accuracy: 0.4941588785046729
[1. 1. 1. ... 1. 1. 1.] [1 1 0 ... 0 1 0]


In [10]:
# Define number of folds for k-fold cross-validation
k = 5

# Shuffle data
np.random.seed(42)
indices = np.random.permutation(len(X))
X = X[indices]
y = y[indices]

# Split data into k folds
X_folds = np.array_split(X, k)
y_folds = np.array_split(y, k)

# Loop over folds
scores = []
for i in range(k):
    # Prepare training and testing data for this fold
    X_train = np.concatenate([X_folds[j] for j in range(k) if j != i])
    y_train = np.concatenate([y_folds[j] for j in range(k) if j != i])
    X_test = X_folds[i]
    y_test = y_folds[i]
    
    # Train SVM model on training data
    svm = SVM()
    svm.fit(X_train, y_train)
    
    # Make predictions on test data
    y_pred = svm.predict(X_test)
    
    # Compute f1 score for this fold
    tp = np.sum((y_pred == 1) & (y_test == 1))
    fp = np.sum((y_pred == 1) & (y_test == -1))
    fn = np.sum((y_pred == -1) & (y_test == 1))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
    scores.append(f1)

# Compute mean f1 score over all folds
mean_f1 = np.mean(scores)
print(f'Mean f1 score: {mean_f1}')


Mean f1 score: 1.0
