In [1]:
# Import necessary libraries
import yfinance as yf  
import pandas as pd  
import numpy as np
 

# Machine Learning
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from prophet import Prophet
from sklearn.svm import SVR

# Additional
from math import sqrt  

print("Dependencies imported successfully.")


Dependencies imported successfully.


In [42]:
def fetch_data(ticker, start_date="2020-01-01", end_date="2023-01-01"):
    """
    Fetches historical stock data for the specified ticker from Yahoo Finance.
    
    Parameters:
    ticker (str): The stock symbol for which to download the data.
    start_date (str): The start date for the data download (format: YYYY-MM-DD).
    end_date (str): The end date for the data download (format: YYYY-MM-DD).
    
    Returns:
    pandas.DataFrame: A DataFrame containing the historical stock data.
    """
    data = yf.download(ticker, start=start_date, end=end_date)
    # Calculate daily returns as a feature
    data['Daily Return'] = data['Adj Close'].pct_change()
    # Remove any NaN values that might have been introduced
    data.dropna(inplace=True)
    return data

def prepare_data(data, target_column='Daily Return', test_size=0.2, random_state=42):
    """
    Prepares the data for modeling, splitting into features and target, and then into training and testing sets.
    
    Parameters:
    data (pandas.DataFrame): The stock data.
    target_column (str): The column to be used as the target variable.
    test_size (float): The proportion of the dataset to include in the test split.
    random_state (int): Controls the shuffling applied to the data before applying the split.
    
    Returns:
    tuple: A tuple containing the training and testing datasets (X_train, X_test, y_train, y_test).
    """
    # Using previous day's return as the feature to predict today's return
    X = data[[target_column]].shift().dropna()
    y = data[target_column][1:]
    
    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Feature scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test

print("Data fetching and preparation functions defined successfully.")


Data fetching and preparation functions defined successfully.


In [44]:
# create a dictionary to store the results of each model for use when comparing and visualizing the models.
regression_results =[]
# define a function that loops through the models, fits each model to the training data, 
# makes predictions on the testing set, and evaluates the model's performance with RMSE and R² metrics.
def test_regression_model(model, X_train, X_test, y_train, y_test, parameter_hint, parameter_value, range_hint):
    """
    Fits a regression model to the training data, makes predictions on the testing set,
    and evaluates the model's performance with RMSE and R² metrics.
    
    Parameters:
    model (estimator): The regression model to test.
    X_train (array-like): The training features.
    X_test (array-like): The testing features.
    y_train (array-like): The training target variable.
    y_test (array-like): The testing target variable.
    parameter_hint (str): Description of the model parameter to adjust.
    parameter_value (str): The current value of the parameter.
    range_hint (str): Suggested range for the parameter value.
    
    Prints the model's performance metrics (RMSE and R²).
    """
    # Fit the model to the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the testing set
    predictions = model.predict(X_test)
    
    # Evaluate the model's performance
    rmse = sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    
    print(f"{model.__class__.__name__} Performance:")
    print(f"RMSE: {rmse}")
    print(f"R²: {r2}")
    print("____________________________________________")

def run_regression_tests(data):
    """
    Prepares the data, tests multiple regression models, prints their performance,
    and provides parameter hints with current values and suggested ranges.
    
    Parameters:
    data (pandas.DataFrame): The stock data prepared for regression analysis.
    """
    # Prepare the data
    X_train, X_test, y_train, y_test = prepare_data(data)
    
    # Define the models to test along with parameter hints
    models = [
        (LinearRegression(), "fit_intercept", "True/False", "Include/Exclude the intercept in the model"),
        (KNeighborsRegressor(), "n_neighbors", "5", "1 to 20"),
        (RandomForestRegressor(), "n_estimators", "100", "10 to 1000"),
        (ExtraTreesRegressor(), "n_estimators", "100", "10 to 1000"),
        (AdaBoostRegressor(), "n_estimators", "50", "10 to 500"),
        (SVR(), "C and epsilon", "C=1.0, epsilon=0.2", "C: 0.01 to 100, epsilon: 0.01 to 1"),
    ]
    
    # Test each model and print performance
    for model, parameter_hint, parameter_value, range_hint in models:
        test_regression_model(model, X_train, X_test, y_train, y_test, parameter_hint, parameter_value, range_hint)
        
    # List to store the results of regression models
    results = []

    # Test each model and store results
    for model, parameter_hint, parameter_value, range_hint in models:
        predictions = model.predict(X_test)
        rmse = sqrt(mean_squared_error(y_test, predictions))
        r2 = r2_score(y_test, predictions)
        result = {
            'model_name': model.__class__.__name__,
            'parameter_hint': parameter_hint,
            'parameter_value': parameter_value,
            'range_hint': range_hint,
            'rmse': rmse,
            'r_squared': r2
        }
        results.append(result)
    # Append results to regression_results
    regression_results.append(results)
    # Print parameter hints and ranges
    print("Parameter Hints\n================================")
    for result in results:
        print(f"# Change the value of {result['parameter_hint']} to adjust the model's score: {result['parameter_value']}")
        print(f"\"{result['parameter_hint']}\" range can be between: {result['range_hint']}\n")
    print("================================")

    # Print parameter hints and ranges
    print("Parameter Hints\n================================")
    for model, parameter_hint, parameter_value, range_hint in models:
        print(f"# Change the value of {parameter_hint} to adjust the model's score: {model.__class__.__name__}({parameter_value})")
        print(f"\"{parameter_hint}\" range can be between: {range_hint}\n")
    print("================================")

# Example usage
ticker = 'SPY'
data = fetch_data(ticker)
run_regression_tests(data)



[*********************100%%**********************]  1 of 1 completed

LinearRegression Performance:
RMSE: 0.016927563443294432
R²: 0.003016436394273736
____________________________________________
KNeighborsRegressor Performance:
RMSE: 0.01919580410643575
R²: -0.2820699374737581
____________________________________________





RandomForestRegressor Performance:
RMSE: 0.02103713014323497
R²: -0.5398275496763705
____________________________________________
ExtraTreesRegressor Performance:
RMSE: 0.023413912437426382
R²: -0.9074232757599321
____________________________________________
AdaBoostRegressor Performance:
RMSE: 0.019713405162654393
R²: -0.35214228335274167
____________________________________________
SVR Performance:
RMSE: 0.021693314060227124
R²: -0.6373853697348746
____________________________________________
Parameter Hints
# Change the value of fit_intercept to adjust the model's score: True/False
"fit_intercept" range can be between: Include/Exclude the intercept in the model

# Change the value of n_neighbors to adjust the model's score: 5
"n_neighbors" range can be between: 1 to 20

# Change the value of n_estimators to adjust the model's score: 100
"n_estimators" range can be between: 10 to 1000

# Change the value of n_estimators to adjust the model's score: 100
"n_estimators" range can be bet

In [47]:
regression_results


[[{'model_name': 'LinearRegression',
   'parameter_hint': 'fit_intercept',
   'parameter_value': 'True/False',
   'range_hint': 'Include/Exclude the intercept in the model',
   'rmse': 0.016927563443294432,
   'r_squared': 0.003016436394273736},
  {'model_name': 'KNeighborsRegressor',
   'parameter_hint': 'n_neighbors',
   'parameter_value': '5',
   'range_hint': '1 to 20',
   'rmse': 0.01919580410643575,
   'r_squared': -0.2820699374737581},
  {'model_name': 'RandomForestRegressor',
   'parameter_hint': 'n_estimators',
   'parameter_value': '100',
   'range_hint': '10 to 1000',
   'rmse': 0.02103713014323497,
   'r_squared': -0.5398275496763705},
  {'model_name': 'ExtraTreesRegressor',
   'parameter_hint': 'n_estimators',
   'parameter_value': '100',
   'range_hint': '10 to 1000',
   'rmse': 0.023413912437426382,
   'r_squared': -0.9074232757599321},
  {'model_name': 'AdaBoostRegressor',
   'parameter_hint': 'n_estimators',
   'parameter_value': '50',
   'range_hint': '10 to 500',
   

# DATA VISUALIZATION WITH PLOTY

In [48]:
import plotly.graph_objects as go

def visualize_regression_results(regression_results):
    """
    Creates Plotly visualizations of regression model performance metrics.
    
    Parameters:
    regression_results (list): A list containing dictionaries of regression model results.
    """
    # Create lists to store model names and performance metrics
    model_names = []
    rmse_values = []
    r_squared_values = []

    # Extract model names and performance metrics from regression results
    for result in regression_results[0]:
        model_names.append(result['model_name'])
        rmse_values.append(result['rmse'])
        r_squared_values.append(result['r_squared'])

    # Create bar plot for RMSE values
    fig_rmse = go.Figure(data=[go.Bar(x=model_names, y=rmse_values)])
    fig_rmse.update_layout(title="Root Mean Squared Error (RMSE) for Regression Models",
                           xaxis_title="Regression Models",
                           yaxis_title="RMSE")

    # Create bar plot for R-squared values
    fig_r_squared = go.Figure(data=[go.Bar(x=model_names, y=r_squared_values)])
    fig_r_squared.update_layout(title="R-squared (R²) for Regression Models",
                                xaxis_title="Regression Models",
                                yaxis_title="R²")

    # Display the plots
    fig_rmse.show()
    fig_r_squared.show()

# Example usage
visualize_regression_results(regression_results)


In [49]:
import plotly.graph_objs as go
import plotly.express as px

# Extracting data from regression_results
results = [
    {'model_name': 'LinearRegression', 'parameter_hint': 'fit_intercept', 'parameter_value': 'True/False',
     'range_hint': 'Include/Exclude the intercept in the model', 'rmse': 0.016927563443294432,
     'r_squared': 0.003016436394273736},
    {'model_name': 'KNeighborsRegressor', 'parameter_hint': 'n_neighbors', 'parameter_value': '5',
     'range_hint': '1 to 20', 'rmse': 0.01919580410643575, 'r_squared': -0.2820699374737581},
    {'model_name': 'RandomForestRegressor', 'parameter_hint': 'n_estimators', 'parameter_value': '100',
     'range_hint': '10 to 1000', 'rmse': 0.02103713014323497, 'r_squared': -0.5398275496763705},
    {'model_name': 'ExtraTreesRegressor', 'parameter_hint': 'n_estimators', 'parameter_value': '100',
     'range_hint': '10 to 1000', 'rmse': 0.023413912437426382, 'r_squared': -0.9074232757599321},
    {'model_name': 'AdaBoostRegressor', 'parameter_hint': 'n_estimators', 'parameter_value': '50',
     'range_hint': '10 to 500', 'rmse': 0.019713405162654393, 'r_squared': -0.35214228335274167},
    {'model_name': 'SVR', 'parameter_hint': 'C and epsilon', 'parameter_value': 'C=1.0, epsilon=0.2',
     'range_hint': 'C: 0.01 to 100, epsilon: 0.01 to 1', 'rmse': 0.021693314060227124,
     'r_squared': -0.6373853697348746}
]

# Create DataFrame for visualization
df_results = pd.DataFrame(results)

# Plot RMSE and R-squared
fig = go.Figure()
fig.add_trace(go.Bar(x=df_results['model_name'], y=df_results['rmse'], name='RMSE', marker_color='royalblue'))
fig.add_trace(go.Bar(x=df_results['model_name'], y=df_results['r_squared'], name='R-squared', marker_color='darkorange'))
fig.update_layout(title='Model Performance Comparison',
                  xaxis_title='Model',
                  yaxis_title='Value',
                  barmode='group')
fig.show()


the provided regression results, we have the performance metrics (RMSE and R-squared) for different regression models applied to the dataset. Let me explain each metric:

RMSE (Root Mean Squared Error):

RMSE measures the average deviation of the predicted values from the actual values.
Lower RMSE values indicate better fit of the model to the data.
It represents the standard deviation of the residuals, which are the differences between predicted and observed values.
R-squared (Coefficient of Determination):

R-squared represents the proportion of the variance in the dependent variable (target) that is predictable from the independent variables (features).
It ranges from 0 to 1, where 0 indicates that the model does not explain any variability in the target variable, and 1 indicates that the model perfectly explains the variability.
Higher R-squared values indicate better fit of the model to the data.
Now, interpreting the specific results:

For each regression model (e.g., Linear Regression, KNeighborsRegressor, RandomForestRegressor, etc.), we have RMSE and R-squared values.
Lower RMSE values and higher R-squared values indicate better model performance.
We compare these metrics across different models to determine which model performs best for our dataset.
In the Plotly visualization, you can see the RMSE and R-squared values side by side for each model, making it easy to compare their performance visually.