In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("processed_stocks_data.csv")

In [3]:
df.drop("Volatility",axis=1,inplace=True)

In [4]:
df.head()

Unnamed: 0,Stock,Date,Open,High,Low,Close,Volume,Log Return
0,AAPL,2025-03-04,241.79,244.0272,234.68,235.93,100982047,0.0
1,AAPL,2025-02-28,229.99,250.0,225.7,241.84,862317019,0.024741
2,AAPL,2025-01-31,248.93,249.1,219.38,236.0,1200291603,-0.024445
3,AAPL,2024-12-31,237.27,260.1,237.16,250.42,977942014,0.059308
4,AAPL,2024-11-29,220.965,237.81,219.71,237.33,891640714,-0.053688


In [5]:
import numpy as np
from pycaret.regression import *


In [6]:
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

In [7]:
df = pd.get_dummies(df, columns=['Stock'])

In [8]:
df.drop(columns=['Date'], inplace=True)

In [9]:
target = "Log Return"
features = [col for col in df.columns if col != target]

In [10]:
exp = setup(df, target=target, session_id=42,  verbose=False)

In [11]:
best_model = compare_models(sort="RMSE")

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.0912,0.0391,0.1745,-0.0128,0.1146,1.4094,0.025
dummy,Dummy Regressor,0.0917,0.0392,0.1747,-0.0171,0.1212,1.1056,0.022
omp,Orthogonal Matching Pursuit,0.0917,0.0392,0.1747,-0.0179,0.1212,1.1014,0.023
lasso,Lasso Regression,0.0939,0.0388,0.1752,-0.0379,0.1131,1.624,0.025
llar,Lasso Least Angle Regression,0.0939,0.0388,0.1752,-0.0379,0.1131,1.624,0.023
en,Elastic Net,0.0943,0.0391,0.1758,-0.0441,0.1129,1.6577,0.025
br,Bayesian Ridge,0.0963,0.0419,0.1811,-0.0959,0.1138,1.9111,0.022
ridge,Ridge Regression,0.098,0.0425,0.1825,-0.1196,0.1129,2.1704,0.025
lr,Linear Regression,0.0981,0.0425,0.1826,-0.1198,0.1129,2.1722,1.134
lightgbm,Light Gradient Boosting Machine,0.1036,0.0415,0.1844,-0.2078,0.107,3.4717,0.226


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [12]:
final_model = finalize_model(best_model)

In [13]:
print(f"Best Model Selected: {best_model}")

Best Model Selected: HuberRegressor()


In [14]:
save_model(final_model, "best_stock_model")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['Open', 'High', 'Low', 'Close',
                                              'Volume', 'Year', 'Month', 'Day'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('clean_column_names',
                  TransformerWrapper(transformer=CleanColumnNames())),
                 ('actual_estimator', HuberRegressor())]),
 'best_stock_model.pkl')

In [15]:
from pycaret.regression import *
import numpy as np

# Load the trained model
best_model = load_model("best_stock_model")

# 1️⃣ Evaluate the Model
print("Model Evaluation Metrics:")
evaluate_model(best_model)

# 2️⃣ Prepare New Data for Prediction
latest_row = {
    "Stock_AAPL": 1,  # Set 1 for the stock you want to predict, 0 for others
    "Stock_TSLA": 0,
    "Stock_AMZN": 0,
    "Stock_GOOGL": 0,
    "Stock_MSFT": 0,
    "Open": 241.79,
    "High": 244.02,
    "Low": 234.68,
    "Close": 235.93,
    "Volume": 100982047,
    "Year": 2025,
    "Month": 4,
    "Day": 1,  # Predicting next month
}

# Convert dictionary to DataFrame
latest_df = pd.DataFrame([latest_row])

# 3️⃣ Predict Log Return
predicted_df = predict_model(best_model, data=latest_df)
print(predicted_df.columns)  # Debugging step to check column names

# Use the correct column name (likely 'prediction' instead of 'Label')
predicted_log_return = predicted_df.iloc[:, -1][0]  # Get last column


# 4️⃣ Convert Log Return to Price
current_price = latest_row["Close"]
predicted_price = current_price * np.exp(predicted_log_return)

print(f"Predicted Log Return: {predicted_log_return:.5f}")
print(f"Predicted Stock Price for AAPL: ${predicted_price:.2f}")


Transformation Pipeline and Model Successfully Loaded
Model Evaluation Metrics:


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

Index(['Stock_AAPL', 'Stock_TSLA', 'Stock_AMZN', 'Stock_GOOGL', 'Stock_MSFT',
       'Open', 'High', 'Low', 'Close', 'Volume', 'Year', 'Month', 'Day',
       'prediction_label'],
      dtype='object')
Predicted Log Return: -0.01655
Predicted Stock Price for AAPL: $232.06


In [16]:
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,Log Return,Year,Month,Day,Stock_AAPL,Stock_AMZN,Stock_GOOGL,Stock_MSFT,Stock_TSLA
0,241.79,244.0272,234.68,235.93,100982047,0.0,2025,3,4,True,False,False,False,False
1,229.99,250.0,225.7,241.84,862317019,0.024741,2025,2,28,True,False,False,False,False
2,248.93,249.1,219.38,236.0,1200291603,-0.024445,2025,1,31,True,False,False,False,False
3,237.27,260.1,237.16,250.42,977942014,0.059308,2024,12,31,True,False,False,False,False
4,220.965,237.81,219.71,237.33,891640714,-0.053688,2024,11,29,True,False,False,False,False


In [17]:
import pandas as pd
import numpy as np
from pycaret.regression import predict_model

# Function to get predicted returns for all stocks
def predict_returns(latest_data, model):
    predicted_returns = {}
    
    for stock in ['AAPL', 'TSLA', 'AMZN', 'GOOGL', 'MSFT']:
        stock_col = f"Stock_{stock}"
        
        # Filter only the row where stock_col is True
        latest_row = latest_data[latest_data[stock_col] == True].copy()
        if latest_row.empty:
            continue
        
        # Predict log return
        predicted_log_return = predict_model(model, data=latest_row)['prediction_label'].values[0]
        predicted_returns[stock] = predicted_log_return
    
    return predicted_returns

# Function to allocate investment based on strategy
# Function to allocate investment while ignoring negative returns
def allocate_investment(predicted_returns, investment_amount=10000, diversify=True):
    # Filter out negative-return stocks
    positive_returns = {s: r for s, r in predicted_returns.items() if r > 0}
    
    if not positive_returns:
        return {"No suitable stock found": investment_amount}  # Handle case where all returns are negative
    
    sorted_returns = sorted(positive_returns.items(), key=lambda x: x[1], reverse=True)
    
    if diversify:
        top_stocks = sorted_returns[:3]  # Select top 3 stocks
        total_return = sum([r[1] for r in top_stocks])
        allocations = {s: (r / total_return) * investment_amount for s, r in top_stocks}
    else:
        top_stock = sorted_returns[0]  # Select highest return stock
        allocations = {top_stock[0]: investment_amount}
    
    return allocations


# Sample latest data (replace with actual latest row)
latest_data = pd.DataFrame({
    'Open': [240, 180, 3200, 2800, 300],
    'High': [245, 190, 3250, 2850, 310],
    'Low': [230, 170, 3150, 2750, 290],
    'Close': [235, 185, 3220, 2820, 305],
    'Volume': [10000000, 8000000, 5000000, 3000000, 6000000],
    'Year': [2025] * 5,
    'Month': [3] * 5,
    'Day': [4] * 5,
    'Stock_AAPL': [True, False, False, False, False],
    'Stock_TSLA': [False, True, False, False, False],
    'Stock_AMZN': [False, False, True, False, False],
    'Stock_GOOGL': [False, False, False, True, False],
    'Stock_MSFT': [False, False, False, False, True]
})

# Predict returns
predicted_returns = predict_returns(latest_data, best_model)

# Get investment strategies
single_stock_investment = allocate_investment(predicted_returns, diversify=False)
diversified_investment = allocate_investment(predicted_returns, diversify=True)

print("Single Stock Investment Strategy:", single_stock_investment)
print("Diversified Investment Strategy:", diversified_investment)


Single Stock Investment Strategy: {'AMZN': 10000}
Diversified Investment Strategy: {'AMZN': 5541.49813083917, 'GOOGL': 4458.501869160831}


In [18]:
import numpy as np
import pandas as pd
from pycaret.regression import predict_model

# Get actual vs predicted log returns
def evaluate_model(df, model):
    df = df.copy()
    
    # Predict log returns using the trained model
    df['Predicted_Log_Return'] = predict_model(model, data=df)['prediction_label']
    
    # Calculate error metrics
    df['Error'] = df['Predicted_Log_Return'] - df['Log Return']
    mae = np.mean(np.abs(df['Error']))
    mse = np.mean(df['Error'] ** 2)
    
    print(f"Mean Absolute Error (MAE): {mae:.5f}")
    print(f"Mean Squared Error (MSE): {mse:.5f}")
    
    # Display actual vs. predicted returns for last few months
    print(df[[ 'Year', 'Month', 'Log Return', 'Predicted_Log_Return']].tail(10))

# Run evaluation on historical data
evaluate_model(df, best_model)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,0.0905,0.0332,0.1822,0.0086,0.1193,1.4511


Mean Absolute Error (MAE): 0.09049
Mean Squared Error (MSE): 0.03320
      Year  Month  Log Return  Predicted_Log_Return
1326  2011      4   -0.088037             -0.021283
1327  2011      3    0.005420             -0.021275
1328  2011      2   -0.149776             -0.021350
1329  2011      1    0.008752             -0.021325
1330  2010     12    0.099827             -0.021112
1331  2010     11    0.282694             -0.021246
1332  2010     10   -0.480989             -0.021647
1333  2010      9   -0.067963             -0.021525
1334  2010      8   -0.046392             -0.021574
1335  2010      7    0.023339             -0.020983


In [19]:
def calculate_sharpe_ratio(df, predicted_returns):
    sharpe_ratios = {}
    risk_free_rate = 0.02  # Assume 2% annual risk-free rate

    for stock in predicted_returns.keys():
        stock_data = df[df[f'Stock_{stock}'] == True]  # Filter stock data
        volatility = stock_data['Log Return'].std()  # Standard deviation of log returns

        if volatility == 0:
            sharpe_ratios[stock] = -np.inf  # Avoid division by zero
        else:
            sharpe_ratios[stock] = (predicted_returns[stock] - risk_free_rate) / volatility

    return sharpe_ratios

# Compute Sharpe Ratio
sharpe_ratios = calculate_sharpe_ratio(df, predicted_returns)

# Select the best stock based on Sharpe Ratio
best_stock_sharpe = max(sharpe_ratios, key=sharpe_ratios.get)
print(f"Stock with highest risk-adjusted return: {best_stock_sharpe} (Sharpe Ratio: {sharpe_ratios[best_stock_sharpe]:.2f})")


Stock with highest risk-adjusted return: AMZN (Sharpe Ratio: 0.09)
