In [12]:
import warnings
warnings.filterwarnings("ignore")
import os
import joblib
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [13]:
# Symbol whose price history is used when (re)training the model by default.
TRAINING_TICKER = "MSFT"
# Number of lagged closing prices (Close_Lag_1 .. Close_Lag_N) used as features.
N_DAYS_LOOKBACK = 15
# How many rows ahead of the feature row the target close is taken from.
PREDICT_SHIFT = 1
# Default file names the fitted model and scaler are written to / read from.
MODEL_FILE = "ridge_stock_model.pkl"
SCALER_FILE = "scaler.pkl"

# Apply matplotlib's 'bmh' style sheet to figures created after this point.
plt.style.use('bmh') 

In [14]:
def fetch_data(ticker, start_date="2015-01-01", end_date=None):
    """Download price history for ``ticker`` through ``yf.download``.

    Args:
        ticker: Symbol passed straight to ``yf.download``.
        start_date: Value forwarded as the download's ``start`` argument.
        end_date: Value forwarded as the download's ``end`` argument; a falsy
            value is reported as 'today' in the progress message.

    Returns:
        The downloaded DataFrame, with MultiIndex columns collapsed to their
        first level. An empty DataFrame is returned when nothing came back or
        when the download raised; the failure is printed rather than re-raised
        so interactive callers can carry on.
    """
    until = end_date or 'today'
    print(f"Fetching data for {ticker} from {start_date} to {until}...")

    try:
        prices = yf.download(ticker, start=start_date, end=end_date, progress=False)
        has_rows = prices is not None and not prices.empty
        if has_rows:
            # Keep only the first column level when the columns are a MultiIndex.
            if isinstance(prices.columns, pd.MultiIndex):
                prices.columns = prices.columns.get_level_values(0)
            return prices
        print(f"No data returned for {ticker}.")
    except Exception as err:
        print(f"Error fetching data for {ticker}: {err}")

    # Every failure path falls through to the same empty result.
    return pd.DataFrame()


In [15]:
def create_features(df, n_lookback=N_DAYS_LOOKBACK, predict_shift=PREDICT_SHIFT):
    """Derive model features and the prediction target from a price frame.

    The input is not modified. Rows are sorted by index, gaps are filled
    forward and then backward, and these columns are added from ``Close``:

    * ``SMA_10`` - 10-row rolling mean
    * ``EMA_5`` - exponentially weighted mean (span 5, ``adjust=False``)
    * ``Volatility_10`` - 10-row rolling standard deviation
    * ``Daily_Return`` - row-over-row percentage change
    * ``Close_Lag_1`` .. ``Close_Lag_{n_lookback}`` - shifted closes
    * ``Target`` - ``Close`` shifted back by ``predict_shift`` rows

    Args:
        df: Frame containing at least a ``Close`` column.
        n_lookback: Number of lag columns to generate.
        predict_shift: How many rows ahead the target is taken from.

    Returns:
        The enriched frame with every row containing a NaN removed, or an
        empty DataFrame when ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame()

    # sort_index()/ffill()/bfill() each return a new frame, so the caller's
    # data is left untouched.
    frame = df.sort_index().ffill().bfill()
    close = frame["Close"]

    frame["SMA_10"] = close.rolling(window=10).mean()
    frame["EMA_5"] = close.ewm(span=5, adjust=False).mean()
    frame["Volatility_10"] = close.rolling(window=10).std()
    frame["Daily_Return"] = close.pct_change()

    # Keyword order is preserved, so lag columns land in ascending order.
    lagged = {f"Close_Lag_{k}": close.shift(k) for k in range(1, n_lookback + 1)}
    frame = frame.assign(**lagged)

    frame["Target"] = close.shift(-predict_shift)

    # Rolling windows, lags and the target shift all leave NaNs at the edges.
    return frame.dropna()

In [16]:
def prepare_xy_from_df(df, n_lookback=None):
    """Split a feature frame into the model's design matrix and target.

    Args:
        df: Frame holding the engineered feature columns and ``Target``.
        n_lookback: Number of ``Close_Lag_*`` columns to select. ``None``
            (the default) uses the module-level ``N_DAYS_LOOKBACK``.

    Returns:
        ``(X, y)`` where ``X`` is a copy of the feature columns in a fixed
        order (indicators first, then lags 1..n) and ``y`` is a copy of the
        ``Target`` column.

    Raises:
        ValueError: If any feature column or the ``Target`` column is absent.
    """
    # Resolve the default at call time so a changed module constant is honoured.
    if n_lookback is None:
        n_lookback = N_DAYS_LOOKBACK

    feature_cols = [
        "SMA_10",
        "EMA_5",
        "Volatility_10",
        "Daily_Return",
    ] + [f"Close_Lag_{i}" for i in range(1, n_lookback + 1)]

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required feature columns: {missing}")

    # Report a missing target the same way as missing features instead of
    # letting the column lookup raise a bare KeyError.
    if "Target" not in df.columns:
        raise ValueError("Missing required target column: 'Target'")

    X = df[feature_cols].copy()
    y = df["Target"].copy()

    return X, y

In [17]:
def plot_training_results(y_test, y_pred, ticker):
    """Show a three-panel diagnostic figure for held-out predictions.

    The top panel overlays actual and predicted values against the index of
    ``y_test``; the bottom-left panel is a histogram of ``y_test - y_pred``;
    the bottom-right panel scatters actual against predicted values with a
    dashed identity line.

    Args:
        y_test: Actual values; must expose ``.index``, ``.values``, ``.min()``
            and ``.max()`` as used below.
        y_pred: Predicted values aligned with ``y_test``; must expose
            ``.min()`` and ``.max()``.
        ticker: Label appended to the top panel's title.
    """
    fig = plt.figure(figsize=(15, 10))
    layout = gridspec.GridSpec(2, 2, figure=fig)

    # --- Top row (full width): actual vs predicted over time ---
    series_ax = fig.add_subplot(layout[0, :])
    dates = y_test.index
    series_ax.plot(dates, y_test.values, label="Actual Price", color='#1f77b4', linewidth=2)
    series_ax.plot(dates, y_pred, label="Predicted Price", color='#ff7f0e', alpha=0.8, linestyle='--')
    series_ax.set_title(f"Model Performance: Actual vs Predicted (Test Set) - {ticker}", fontsize=14)
    series_ax.set_ylabel("Price (USD)")
    series_ax.legend()
    series_ax.grid(True, alpha=0.3)

    # --- Bottom left: distribution of prediction errors ---
    errors = y_test - y_pred
    hist_ax = fig.add_subplot(layout[1, 0])
    hist_ax.hist(errors, bins=30, color='purple', alpha=0.7, edgecolor='black')
    hist_ax.set_title("Residuals Distribution (Error Histogram)", fontsize=12)
    hist_ax.set_xlabel("Prediction Error (USD)")
    hist_ax.set_ylabel("Frequency")
    hist_ax.axvline(0, color='black', linestyle='--', linewidth=1)

    # --- Bottom right: actual vs predicted with the identity line ---
    scatter_ax = fig.add_subplot(layout[1, 1])
    scatter_ax.scatter(y_test, y_pred, alpha=0.5, color='green')

    lowest = min(y_test.min(), y_pred.min())
    highest = max(y_test.max(), y_pred.max())
    scatter_ax.plot([lowest, highest], [lowest, highest], 'r--', label='Perfect Fit')

    scatter_ax.set_title("Actual vs Predicted Scatter", fontsize=12)
    scatter_ax.set_xlabel("Actual Price")
    scatter_ax.set_ylabel("Predicted Price")
    scatter_ax.legend()

    plt.tight_layout()
    plt.show()

In [18]:
def train_model(X, y, model_file=MODEL_FILE, scaler_file=SCALER_FILE, ticker=None):
    """Fit a scaled Ridge regression, report hold-out metrics and persist it.

    The last 20% of rows (in the order given) are held out for evaluation;
    the split is not shuffled, so row order is preserved. A
    ``StandardScaler`` is fitted on the training portion only and reused for
    the hold-out rows.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``; must work with
            ``plot_training_results`` (it reads ``.index`` of the test slice).
        model_file: Path the fitted model is dumped to with ``joblib``.
        scaler_file: Path the fitted scaler is dumped to with ``joblib``.
        ticker: Label shown in the diagnostic plot title. ``None`` (default)
            falls back to the module-level ``TRAINING_TICKER``, which was the
            previously hard-wired label.

    Returns:
        ``(model, scaler)`` - the fitted ``Ridge`` and ``StandardScaler``.
    """
    print("\n--- Training model ---")

    # shuffle=False keeps the rows in order; random_state is only consulted
    # when shuffling, so it is omitted.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    # Fit scaling statistics on the training rows only to avoid leaking
    # hold-out information into the model.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = Ridge(alpha=1.0)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("Training completed.")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R2 Score: {r2:.4f}")

    joblib.dump(model, model_file)
    joblib.dump(scaler, scaler_file)
    print(f"Saved model to '{model_file}' and scaler to '{scaler_file}'.")

    # Label the plot with the symbol actually trained on when the caller
    # supplies it, rather than always using the module default.
    plot_label = TRAINING_TICKER if ticker is None else ticker
    plot_training_results(y_test, y_pred, plot_label)

    return model, scaler


In [19]:
def load_model_and_scaler(model_file=MODEL_FILE, scaler_file=SCALER_FILE):
    """Load a previously saved model/scaler pair from disk.

    Args:
        model_file: Path of the saved model.
        scaler_file: Path of the saved scaler.

    Returns:
        ``(model, scaler)`` when both files exist, otherwise ``(None, None)``.
    """
    both_present = os.path.exists(model_file) and os.path.exists(scaler_file)
    if both_present:
        # NOTE: joblib.load unpickles its input, which can execute arbitrary
        # code - only point these paths at files this notebook wrote itself.
        return joblib.load(model_file), joblib.load(scaler_file)
    return None, None


In [20]:
def plot_stock_analysis(df, ticker, predicted_price, last_date):
    """Plot recent price action, volume and volatility with the prediction.

    Only the last 90 rows of ``df`` are drawn. The predicted price is marked
    one business day after the final row, so a prediction made from a
    Friday lands on the following Monday rather than on a weekend. Market
    holidays are not taken into account.

    Args:
        df: Feature frame with ``Close``, ``SMA_10``, ``EMA_5``,
            ``Daily_Return``, ``Volume`` and ``Volatility_10`` columns and a
            datetime-like index.
        ticker: Symbol used in the figure title.
        predicted_price: Predicted close to mark on the price panel.
        last_date: Not used by the plot; kept so existing callers keep working.
    """
    subset = df.iloc[-90:].copy()
    if subset.empty:
        # Nothing to anchor the prediction marker to; avoid an IndexError.
        print(f"No rows to plot for {ticker}.")
        return

    fig = plt.figure(figsize=(14, 10))
    gs = gridspec.GridSpec(3, 1, height_ratios=[3, 1, 1], figure=fig)

    # --- Panel 1: close price, moving averages and the prediction marker ---
    ax1 = fig.add_subplot(gs[0])
    ax1.plot(subset.index, subset['Close'], label='Close Price', color='black', linewidth=1.5)
    ax1.plot(subset.index, subset['SMA_10'], label='SMA (10)', color='blue', alpha=0.6, linewidth=1)
    ax1.plot(subset.index, subset['EMA_5'], label='EMA (5)', color='orange', alpha=0.6, linewidth=1)

    final_date = subset.index[-1]
    # Next business day instead of next calendar day (skips Sat/Sun).
    next_date = final_date + pd.offsets.BDay(1)

    ax1.plot([final_date, next_date],
             [subset['Close'].iloc[-1], predicted_price],
             color='red', linestyle='--', alpha=0.7)

    ax1.scatter(next_date, predicted_price, color='red', s=150, marker='*',
                label=f'Prediction: ${predicted_price:.2f}', zorder=5)

    ax1.set_title(f"{ticker} Price Analysis & Prediction", fontsize=16)
    ax1.set_ylabel("Price (USD)")
    ax1.legend(loc='upper left')
    ax1.grid(True, alpha=0.3)

    # --- Panel 2: volume bars, coloured by the sign of the daily return ---
    ax2 = fig.add_subplot(gs[1], sharex=ax1)
    colors = ['green' if x >= 0 else 'red' for x in subset['Daily_Return']]
    ax2.bar(subset.index, subset['Volume'], color=colors, alpha=0.7, width=0.6)
    ax2.set_ylabel("Volume")
    ax2.grid(True, alpha=0.3)

    # --- Panel 3: rolling volatility ---
    ax3 = fig.add_subplot(gs[2], sharex=ax1)
    ax3.plot(subset.index, subset['Volatility_10'], color='purple', label='Volatility (10-day Std)')
    ax3.fill_between(subset.index, subset['Volatility_10'], color='purple', alpha=0.1)
    ax3.set_ylabel("Volatility")
    ax3.set_xlabel("Date")
    ax3.legend(loc='upper left')
    ax3.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [21]:
def predict_next_close_for_ticker(ticker, model, scaler, start_date="2020-01-01"):
    """Predict the close following the most recent row of data for ``ticker``.

    Data is fetched with ``fetch_data``, features are built with
    ``create_features``, and the features of the most recent row are scaled
    and passed to ``model``. A text report is printed and
    ``plot_stock_analysis`` is called with the feature frame.

    Args:
        ticker: Symbol to fetch and predict.
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``.
        start_date: Start of the history requested from ``fetch_data``.

    Returns:
        A dict with ``ticker``, ``last_date``, ``last_close`` and
        ``predicted_next_close``, or ``None`` when the model/scaler is
        missing, no data is available, or the features cannot be built.
    """
    if model is None or scaler is None:
        print("Model or scaler not provided.")
        return None

    df = fetch_data(ticker, start_date=start_date)
    if df.empty:
        print("No data available for prediction.")
        return None

    # create_features drops every row whose Target is NaN. With a positive
    # shift that removes the most recent row(s), so the "next close" would be
    # predicted from stale features and describe a day that is already known.
    # No target is needed for inference, so ask for a zero shift to keep the
    # latest row.
    df_feat = create_features(df, n_lookback=N_DAYS_LOOKBACK, predict_shift=0)

    if df_feat.empty:
        print("Not enough data to create features.")
        return None

    feature_cols = [
        "SMA_10",
        "EMA_5",
        "Volatility_10",
        "Daily_Return",
    ] + [f"Close_Lag_{i}" for i in range(1, N_DAYS_LOOKBACK + 1)]

    try:
        # Keep a one-row DataFrame (not a bare array) so the column names
        # travel with the data into the scaler.
        X_new = df_feat[feature_cols].iloc[[-1]]
    except KeyError as e:
        print(f"Missing columns in data: {e}")
        return None

    X_new_scaled = scaler.transform(X_new)
    predicted_price = float(model.predict(X_new_scaled)[0])

    # Report the close and date of the same row the features came from.
    last_row = df_feat.iloc[-1]
    current_close = float(last_row["Close"])
    last_date = df_feat.index[-1].strftime("%Y-%m-%d")

    print(f"\n{'='*40}")
    print(f"ANALYSIS REPORT: {ticker}")
    print(f"{'='*40}")
    print(f"Data Date:        {last_date}")
    print(f"Last Close:       ${current_close:.2f}")
    print(f"Predicted Close:  ${predicted_price:.2f}")

    change = predicted_price - current_close
    percent_change = (change / current_close) * 100 if current_close else 0
    print(f"Change:           {change:.2f} ({percent_change:.2f}%)")

    print(f"Recent Volatility:{last_row['Volatility_10']:.4f}")
    print(f"Recent Volume:    {int(last_row['Volume']):,}")
    print(f"{'='*40}")

    plot_stock_analysis(df_feat, ticker, predicted_price, last_date)

    result = {
        "ticker": ticker,
        "last_date": last_date,
        "last_close": current_close,
        "predicted_next_close": predicted_price,
    }

    return result


In [22]:
def training_pipeline(training_ticker=TRAINING_TICKER):
    """Fetch data, build features and train the model for one ticker.

    Args:
        training_ticker: Symbol whose history is downloaded for training.

    Returns:
        The ``(model, scaler)`` pair produced by ``train_model``, or
        ``(None, None)`` when no data is available or no rows survive
        feature creation.
    """
    print(f"\nStarting training pipeline on {training_ticker}...")

    raw_prices = fetch_data(training_ticker)
    if raw_prices.empty:
        print("Training failed: no training data.")
        return None, None

    features = create_features(
        raw_prices,
        n_lookback=N_DAYS_LOOKBACK,
        predict_shift=PREDICT_SHIFT,
    )
    if features.empty:
        print("Training failed: not enough rows after feature creation.")
        return None, None

    design_matrix, target = prepare_xy_from_df(features)
    fitted_model, fitted_scaler = train_model(design_matrix, target)
    return fitted_model, fitted_scaler

In [23]:
def main():
    """Run the interactive predict/train loop.

    A saved model/scaler pair is loaded if present; otherwise one is trained
    first. The loop then reads a line at a time: ``exit`` quits, ``train``
    retrains on ``TRAINING_TICKER``, and anything else is upper-cased and
    treated as a ticker to predict.
    """
    print("Welcome to the Stock Predictor.")
    print("This tool uses Ridge Regression to predict the next closing price.")

    model, scaler = load_model_and_scaler()

    if model is None or scaler is None:
        print("No existing model/scaler found. Training model now...")
        model, scaler = training_pipeline()
        if model is None or scaler is None:
            print("Training pipeline failed. Exiting.")
            return
    else:
        print(f"Found existing model '{MODEL_FILE}' and scaler '{SCALER_FILE}'. Loaded them.")

    while True:
        print("\n------------------------------------------------")
        print("Enter a ticker to predict (e.g., NVDA, AAPL, GOOGL)")
        print("Commands: 'train' (re-train model), 'exit' (quit)")
        print("------------------------------------------------")
        try:
            user_input = input("Ticker/Command: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session cleanly
            # instead of surfacing a traceback.
            print("\nExiting.")
            break

        if not user_input:
            continue

        cmd = user_input.lower()
        if cmd == "exit":
            print("Exiting.")
            break

        if cmd == "train":
            print(f"Retraining model on default ticker: {TRAINING_TICKER}")
            new_model, new_scaler = training_pipeline(TRAINING_TICKER)
            if new_model is None or new_scaler is None:
                # Keep the working model: a failed retrain must not wipe it
                # out, or every later prediction would fail.
                print("Retraining failed.")
            else:
                model, scaler = new_model, new_scaler
            continue

        ticker = user_input.upper()
        predict_next_close_for_ticker(ticker, model, scaler)

Welcome to the Stock Predictor.
This tool uses Ridge Regression to predict the next closing price.
No existing model/scaler found. Training model now...

Starting training pipeline on MSFT...
Fetching data for MSFT from 2015-01-01 to today...


NameError: name 'prepare_xy_from_df' is not defined