In [1]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler



In [2]:
# Read the META.csv file from the Resources folder into a Pandas DataFrame
# Set the `Date` column as the DateTimeIndex
META_df = pd.read_csv(
    Path("../Resources/META.csv"), 
    index_col="Date",
    parse_dates=True, 
    infer_datetime_format=True
)

# Review the DataFrame
display(META_df.head())
display(META_df.tail())

  META_df = pd.read_csv(


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-19,141.210007,144.910004,132.5,133.240005,133.240005,57404900
2018-12-20,130.699997,135.570007,130.0,133.399994,133.399994,40297900
2018-12-21,133.389999,134.899994,123.419998,124.949997,124.949997,56901500
2018-12-24,123.099998,129.740005,123.019997,124.059998,124.059998,22066000
2018-12-26,126.0,134.240005,125.889999,134.179993,134.179993,39723400


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-12-12,324.600006,334.470001,324.559998,334.220001,334.220001,18485500
2023-12-13,333.929993,338.369995,332.640015,334.73999,334.73999,16353300
2023-12-14,333.850006,334.700012,328.640015,333.170013,333.170013,19607300
2023-12-15,331.98999,338.660004,331.220001,334.920013,334.920013,30001600
2023-12-18,337.480011,347.559998,337.019989,344.619995,344.619995,18993900


In [3]:
# Filter the date index and close columns
META_df = META_df.loc[:,["Close"]]
# Drop all NaN values from the DataFrame
META_df = META_df.dropna()

# Review the DataFrame
display(META_df.head())
display(META_df.tail())

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2018-12-19,133.240005
2018-12-20,133.399994
2018-12-21,124.949997
2018-12-24,124.059998
2018-12-26,134.179993


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-12-12,334.220001
2023-12-13,334.73999
2023-12-14,333.170013
2023-12-15,334.920013
2023-12-18,344.619995


In [4]:
def calculate_bollinger_bands(META_df, window=40, num_std_dev=2):
    META_df['Rolling Mean'] = META_df['Close'].rolling(window=window).mean()
    META_df['Upper Band'] = META_df['Rolling Mean'] + (META_df['Close'].rolling(window=window).std() * num_std_dev)
    META_df['Lower Band'] = META_df['Rolling Mean'] - (META_df['Close'].rolling(window=window).std() * num_std_dev)
    return META_df


In [5]:
print(calculate_bollinger_bands)

<function calculate_bollinger_bands at 0x0000025D8A7980E0>


In [6]:
def plot_bollinger_bands(META_df):
    plt.figure(figsize=(12, 6))
    plt.plot(META_df['Close'], label='Close Price', color='blue')
    plt.plot(META_df['Rolling Mean'], label='Rolling Mean', color='black')
    plt.plot(META_df['Upper Band'], label='Upper Band', color='red', linestyle='dashed')
    plt.plot(META_df['Lower Band'], label='Lower Band', color='green', linestyle='dashed')
    plt.fill_between(META_df.index, META_df['Upper Band'], META_df['Lower Band'], color='gray', alpha=0.2)
    plt.title('Bollinger Bands Strategy')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.show()


In [7]:
def bollinger_band_strategy(META_df):
    signals = pd.DataFrame(index=META_df.index)
    signals['Signal'] = 0.0

    # Buy Signal
    signals['Signal'][META_df['Close'] < META_df['Lower Band']] = 1.0

    # Sell Signal
    signals['Signal'][META_df['Close'] > META_df['Upper Band']] = -1.0

    return signals


In [8]:
 # Calculate Bollinger Bands
stock_data = calculate_bollinger_bands(META_df)
stock_data

Unnamed: 0_level_0,Close,Rolling Mean,Upper Band,Lower Band
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-12-19,133.240005,,,
2018-12-20,133.399994,,,
2018-12-21,124.949997,,,
2018-12-24,124.059998,,,
2018-12-26,134.179993,,,
...,...,...,...,...
2023-12-12,334.220001,321.878500,348.114412,295.642589
2023-12-13,334.739990,322.147000,348.690014,295.603986
2023-12-14,333.170013,322.552000,349.264770,295.839231
2023-12-15,334.920013,323.104751,349.905377,296.304124


In [9]:
stock_data = stock_data.dropna()
stock_data

Unnamed: 0_level_0,Close,Rolling Mean,Upper Band,Lower Band
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-02-15,162.500000,148.354750,175.812597,120.896902
2019-02-19,162.289993,149.080999,176.435254,121.726745
2019-02-20,162.559998,149.809999,177.003551,122.616447
2019-02-21,160.039993,150.687249,176.834506,124.539992
2019-02-22,161.889999,151.632999,176.536061,126.729938
...,...,...,...,...
2023-12-12,334.220001,321.878500,348.114412,295.642589
2023-12-13,334.739990,322.147000,348.690014,295.603986
2023-12-14,333.170013,322.552000,349.264770,295.839231
2023-12-15,334.920013,323.104751,349.905377,296.304124


In [10]:
signals = bollinger_band_strategy(META_df) 
signals

Unnamed: 0_level_0,Signal
Date,Unnamed: 1_level_1
2018-12-19,0.0
2018-12-20,0.0
2018-12-21,0.0
2018-12-24,0.0
2018-12-26,0.0
...,...
2023-12-12,0.0
2023-12-13,0.0
2023-12-14,0.0
2023-12-15,0.0


In [11]:
stock_data["Signals"] = (bollinger_band_strategy(META_df)) 
stock_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_data["Signals"] = (bollinger_band_strategy(META_df))


Unnamed: 0_level_0,Close,Rolling Mean,Upper Band,Lower Band,Signals
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-02-15,162.500000,148.354750,175.812597,120.896902,0.0
2019-02-19,162.289993,149.080999,176.435254,121.726745,0.0
2019-02-20,162.559998,149.809999,177.003551,122.616447,0.0
2019-02-21,160.039993,150.687249,176.834506,124.539992,0.0
2019-02-22,161.889999,151.632999,176.536061,126.729938,0.0
...,...,...,...,...,...
2023-12-12,334.220001,321.878500,348.114412,295.642589,0.0
2023-12-13,334.739990,322.147000,348.690014,295.603986,0.0
2023-12-14,333.170013,322.552000,349.264770,295.839231,0.0
2023-12-15,334.920013,323.104751,349.905377,296.304124,0.0


In [12]:
stock_data['Signals'].value_counts()

Signals
 0.0    1051
-1.0     101
 1.0      67
Name: count, dtype: int64

In [14]:
# Split the data into X (features) and y (lables)

# The y variable should focus on the Default column
y = stock_data['Signals']

# The X variable should include all features except the Default column
X = stock_data.drop(columns=['Signals'])


In [15]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Scale the data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [17]:
# Count the distinct values in the original labels data
y_train.value_counts()

Signals
 0.0    788
-1.0     75
 1.0     51
Name: count, dtype: int64

In [18]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)


In [19]:
y_train = y_train.dropna()

In [20]:
y_train

Date
2019-10-11    0.0
2020-01-09   -1.0
2022-09-21    1.0
2019-11-01    0.0
2019-11-18    0.0
             ... 
2021-12-16    0.0
2022-09-20    0.0
2023-06-26    0.0
2020-01-23    0.0
2023-05-04    0.0
Name: Signals, Length: 914, dtype: float64

In [21]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [22]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [23]:
# Print the confusion matrix for RandomForest on the original data
confusion_matrix(y_test, rf_predictions)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[ 12,  14,   0],
       [  3, 258,   2],
       [  0,  11,   5]], dtype=int64)

In [25]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE model instance
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

# Fit the SMOTE model to the training data
X_resampled, y_resampled = smote_sampler.fit_resample(X_train, y_train)

# Fit the RandomForestClassifier on the resampled data
model_resampled_rf = RandomForestClassifier()
model_resampled_rf.fit(X_resampled, y_resampled)

# Generate predictions based on the resampled data model
rf_resampled_predictions = model_resampled_rf.predict(X_test)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype

In [26]:
# Print the confusion matrix for RandomForest on the resampled data
confusion_matrix(y_test, rf_resampled_predictions)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[ 21,   5,   0],
       [ 10, 252,   1],
       [  0,   3,  13]], dtype=int64)

In [30]:
print(classification_report(y_test, rf_predictions))

              precision    recall  f1-score   support

        -1.0       0.80      0.46      0.59        26
         0.0       0.91      0.98      0.95       263
         1.0       0.71      0.31      0.43        16

    accuracy                           0.90       305
   macro avg       0.81      0.59      0.66       305
weighted avg       0.89      0.90      0.89       305



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
