In [None]:
import pandas as pd
from hmmlearn.hmm import GaussianHMM
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

# Load dataset from csv file
csv_file = 'Z:\\Data Science\\stock price prediction\\notebook\\data\\nifty 50.csv' 
df = pd.read_csv(csv_file)

# Ensure required columns are present
required_columns = ['Open', 'High', 'Low', 'Close', 'Stock Name', 'Date']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The dataset must contain the following columns: {required_columns}")

# Extract independent features (Open, High, Low) and prepare for HMM
features = np.column_stack([
    df['Open'],    # Open price
    df['High'],    # High price
    df['Low']      # Low price
])

# Normalize the features using StandardScaler (feature scaling)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Dependent variable is the Close price
close_prices = df['Close']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, close_prices, test_size=0.2, shuffle=False)  # Maintain temporal order

# Initialize and fit the HMM model
n_states = 3  # Number of hidden states (adjustable based on domain knowledge)
hmm_model = GaussianHMM(n_components=n_states, covariance_type="diag", n_iter=1000, random_state=42)
hmm_model.fit(X_train)

# Predict hidden states for the entire dataset
hidden_states = hmm_model.predict(features_scaled)

# Add hidden states to the dataframe
df['Hidden State'] = hidden_states

# Step 2: Use hidden states as features to train a regression model
# We'll use the hidden state as an additional feature to predict Close
X_with_hidden_states = np.column_stack([features_scaled, hidden_states])

# Split into train and test sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_with_hidden_states, close_prices, test_size=0.2, shuffle=False)

# Initialize and fit a regression model (Linear Regression)
reg_model = LinearRegression()
reg_model.fit(X_train_reg, y_train_reg)

# Predict the Close value for the entire dataset using the regression model
predicted_close = reg_model.predict(X_with_hidden_states)

# Calculate the difference between actual Close and predicted Close
df['Predicted Close'] = predicted_close
df['Difference'] = df['Close'] - df['Predicted Close']

# Create a new dataframe to show actual value, predicted value, and the difference
result_df = df[['Stock Name', 'Date', 'Close', 'Predicted Close', 'Difference']]

# Calculate MSE and RMSE
mse = mean_squared_error(df['Close'], predicted_close)
rmse = sqrt(mse)

# Print the results and the error metrics
print(result_df)
print(f"\nMean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


                   Stock Name                 Date    Close  Predicted Close  \
0      Adani Enterprises Ltd.  2023-10-16 00:00:00  2429.35      2436.733211   
1      Adani Enterprises Ltd.  2023-10-13 00:00:00  2454.55      2441.770055   
2      Adani Enterprises Ltd.  2023-10-12 00:00:00  2506.35      2508.207343   
3      Adani Enterprises Ltd.  2023-10-11 00:00:00  2488.60      2496.459294   
4      Adani Enterprises Ltd.  2023-10-10 00:00:00  2498.30      2500.936389   
...                       ...                  ...      ...              ...   
49685              Wipro Ltd.  2019-10-24 00:00:00   249.60       250.366754   
49686              Wipro Ltd.  2019-10-23 00:00:00   254.35       251.913149   
49687              Wipro Ltd.  2019-10-22 00:00:00   253.55       251.700364   
49688              Wipro Ltd.  2019-10-18 00:00:00   248.90       249.097497   
49689              Wipro Ltd.  2019-10-17 00:00:00   247.85       247.923552   

       Difference  
0       -7.383211  

In [10]:
import pandas as pd
from hmmlearn.hmm import GaussianHMM
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

# Load dataset from csv file
csv_file = 'Z:\\Data Science\\stock price prediction\\notebook\\data\\nifty 50.csv' 
df = pd.read_csv(csv_file)

# Ensure required columns are present
required_columns = ['Open', 'High', 'Low', 'Close', 'Stock Name', 'Date']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"The dataset must contain the following columns: {required_columns}")

# Extract independent features (Open, High, Low) and prepare for HMM
features = np.column_stack([
    df['Open'],    # Open price
    df['High'],    # High price
    df['Low']      # Low price
])

# Normalize the features using StandardScaler (feature scaling)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Dependent variable is the Close price
close_prices = df['Close']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_scaled, close_prices, test_size=0.2, shuffle=False)  # Maintain temporal order

# Define hyperparameters for GaussianHMM
param_grid = {
    "n_components": [2, 3, 4],  # Number of hidden states
    "covariance_type": ['diag', 'full','tied', 'spherical'],  # Covariance types
    "n_iter": [500, 1000],  # Number of iterations
    "tol": [1e-3, 1e-4]  # Convergence threshold
}

best_rmse = float("inf")
best_params = None
best_model = None

# Grid Search to find the best parameters
for n_components in param_grid["n_components"]:
    for covariance_type in param_grid["covariance_type"]:
        for n_iter in param_grid["n_iter"]:
            for tol in param_grid["tol"]:
                try:
                    # Initialize and fit the HMM model
                    hmm_model = GaussianHMM(
                        n_components=n_components,
                        covariance_type=covariance_type,
                        n_iter=n_iter,
                        tol=tol,
                        random_state=42
                    )
                    hmm_model.fit(X_train)
                    
                    # Predict hidden states
                    hidden_states = hmm_model.predict(features_scaled)
                    
                    # Add hidden states as a feature
                    X_with_hidden_states = np.column_stack([features_scaled, hidden_states])
                    
                    # Split into train and test sets for regression
                    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_with_hidden_states, close_prices, test_size=0.2, shuffle=False)
                    
                    # Initialize and fit a regression model (Linear Regression)
                    reg_model = LinearRegression()
                    reg_model.fit(X_train_reg, y_train_reg)
                    
                    # Predict the Close value
                    predicted_close = reg_model.predict(X_with_hidden_states)
                    
                    # Calculate RMSE
                    mse = mean_squared_error(close_prices, predicted_close)
                    rmse = sqrt(mse)
                    
                    # Update best model if RMSE improves
                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_params = {
                            "n_components": n_components,
                            "covariance_type": covariance_type,
                            "n_iter": n_iter,
                            "tol": tol
                        }
                        best_model = hmm_model
                
                except Exception as e:
                    print(f"Error with params {n_components}, {covariance_type}, {n_iter}, {tol}: {e}")

# Output the best parameters and corresponding RMSE
print(f"Best Parameters: {best_params}")
print(f"Best RMSE: {best_rmse}")

# Predict Close values using the best HMM and Regression model
hidden_states_best = best_model.predict(features_scaled)
X_with_best_hidden_states = np.column_stack([features_scaled, hidden_states_best])

# Refit regression model with best parameters
reg_model_best = LinearRegression()
reg_model_best.fit(X_with_best_hidden_states, close_prices)
predicted_close_best = reg_model_best.predict(X_with_best_hidden_states)

# Add predictions and differences to the dataframe
df['Predicted Close'] = predicted_close_best
df['Difference'] = df['Close'] - df['Predicted Close']

# Create a new dataframe to show actual value, predicted value, and the difference
result_df = df[['Stock Name', 'Date', 'Close', 'Predicted Close', 'Difference']]

# Print the final result and metrics
print(result_df)
print(f"\nMean Squared Error (MSE): {mean_squared_error(df['Close'], predicted_close_best)}")
print(f"Root Mean Squared Error (RMSE): {sqrt(mean_squared_error(df['Close'], predicted_close_best))}")


Best Parameters: {'n_components': 2, 'covariance_type': 'tied', 'n_iter': 500, 'tol': 0.001}
Best RMSE: 21.50480714051076
                   Stock Name                 Date    Close  Predicted Close  \
0      Adani Enterprises Ltd.  2023-10-16 00:00:00  2429.35      2436.252019   
1      Adani Enterprises Ltd.  2023-10-13 00:00:00  2454.55      2441.961636   
2      Adani Enterprises Ltd.  2023-10-12 00:00:00  2506.35      2508.569896   
3      Adani Enterprises Ltd.  2023-10-11 00:00:00  2488.60      2496.652775   
4      Adani Enterprises Ltd.  2023-10-10 00:00:00  2498.30      2501.520273   
...                       ...                  ...      ...              ...   
49685              Wipro Ltd.  2019-10-24 00:00:00   249.60       250.376111   
49686              Wipro Ltd.  2019-10-23 00:00:00   254.35       252.715329   
49687              Wipro Ltd.  2019-10-22 00:00:00   253.55       251.735061   
49688              Wipro Ltd.  2019-10-18 00:00:00   248.90       249.908884  