# Stock Price Prediction Using Machine Learning

This notebook uses the scikit-learn library for machine learning. Install it with: pip install scikit-learn

In [None]:
from typing import Any, Dict, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Get historical data
def get_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    """Download historical price data for a single ticker via yfinance.

    Args:
        ticker: Stock symbol to download, e.g. ``"MSFT"``.
        start_date: Start of the requested range, e.g. ``"2018-02-07"``.
        end_date: End of the requested range, e.g. ``"2025-02-07"``.

    Returns:
        The frame produced by ``yf.download`` with single-level column names
        (``"Close"`` etc.), so downstream code can index columns by name.
    """
    data = yf.download(ticker, start=start_date, end=end_date)

    # Newer yfinance releases return (field, ticker) MultiIndex columns even
    # for one ticker; keep only the field level so data["Close"] is a Series.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    return data

In [None]:
# Feature engineering
def engineer_features(data: pd.DataFrame) -> pd.DataFrame:
    """Derive model features and the next-day target from closing prices.

    Adds the columns ``Return``, ``SMA_5``, ``SMA_10``, ``Volatility_5``,
    ``Close_lag1``..``Close_lag3`` and ``Target`` to a copy of ``data``,
    prints the first 15 rows, then drops every row containing a missing value.

    Args:
        data: Price frame that contains a ``"Close"`` column.

    Returns:
        A new frame with the added columns and no NaN rows. The input frame
        is left unmodified.
    """
    df = data.copy()  # work on a copy so the caller's frame is untouched
    close = df["Close"]

    # Daily return: percentage change of the close versus the previous row.
    df["Return"] = close.pct_change()

    # Simple moving averages of the close over 5 and 10 rows.
    df["SMA_5"] = close.rolling(window=5).mean()
    df["SMA_10"] = close.rolling(window=10).mean()

    # Volatility: 5-row rolling standard deviation of the closing prices.
    df["Volatility_5"] = close.rolling(window=5).std()

    # Lag features: closing prices from 1, 2 and 3 rows earlier.
    for i in range(1, 4):
        df[f"Close_lag{i}"] = close.shift(i)

    # Target: the following row's closing price.
    df["Target"] = close.shift(-1)

    print("\nProcessed Data:\n")
    print(df.head(15))

    # Rolling windows, lags and the shifted target leave NaNs at both ends;
    # drop those rows so the model only sees complete samples.
    df.dropna(inplace=True)

    return df
    

In [None]:
# Prepare data for modelling
def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split an engineered frame into the feature matrix and target vector.

    Args:
        df: Frame containing the feature columns listed below and ``"Target"``.

    Returns:
        ``(x, y)`` where ``x`` holds the seven feature columns, in the order
        listed, and ``y`` is the ``"Target"`` column.
    """
    features = [
        "Close_lag1",
        "Close_lag2",
        "Close_lag3",
        "SMA_5",
        "SMA_10",
        "Volatility_5",
        "Return",
    ]
    x = df[features]
    y = df["Target"]
    return x, y

In [None]:
#Train-Test Split
def train_test_split_ts(
    x: pd.DataFrame, y: pd.Series, split: float = 0.8
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split features and target by position, keeping the original row order.

    The first ``split`` fraction of rows becomes the training set and the
    remainder the test set; rows are never shuffled.

    Args:
        x: Feature frame.
        y: Target series aligned row-for-row with ``x``.
        split: Fraction of rows (between 0 and 1) assigned to training.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``split`` is outside the range ``[0, 1]``.
    """
    if not 0.0 <= split <= 1.0:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    # Truncate so the boundary is always a whole row position.
    split_idx = int(len(x) * split)
    return (
        x.iloc[:split_idx],
        x.iloc[split_idx:],
        y.iloc[:split_idx],
        y.iloc[split_idx:],
    )
    

In [None]:
# Model Training
def train_model(x_train: pd.DataFrame, y_train: pd.Series, model_type: str = "lr") -> Any:
    """Create and fit a regression model of the requested type.

    Args:
        x_train: Training feature frame.
        y_train: Training target series.
        model_type: Model selector; only ``"lr"`` (``LinearRegression``) is
            currently supported.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``model_type`` is not a supported selector.
    """
    if model_type == "lr":
        model = LinearRegression()
    else:
        # Fail loudly: continuing would reach model.fit with no model bound.
        raise ValueError(f"Model type not available: {model_type!r}")
    model.fit(x_train, y_train)
    return model

In [None]:
# Model Evaluation
def evaluate_model(model: Any, x_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, Any]:
    """Score a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Test feature frame.
        y_test: True target values for ``x_test``.

    Returns:
        A dict with keys ``"mse"`` (mean squared error), ``"r2"`` (R² score)
        and ``"predictions"`` (the output of ``model.predict(x_test)``).
    """
    y_pred = model.predict(x_test)
    return {
        "mse": mean_squared_error(y_test, y_pred),
        "r2": r2_score(y_test, y_pred),
        "predictions": y_pred,
    }


In [None]:
# Plot Predictions
def plot_predictions(y_test: pd.Series, y_pred: np.ndarray, ticker: str, model_type: str) -> None:
    """Plot actual versus predicted prices over the test period and show it.

    Args:
        y_test: Actual prices; its index is used for the x-axis.
        y_pred: Predicted prices, one per entry of ``y_test``.
        ticker: Stock symbol shown in the plot title.
        model_type: Model name shown in the plot title.
    """
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, y_test, label="Actual Price", color="blue")
    plt.plot(y_test.index, y_pred, label="Predicted Price", color="orange")
    plt.title(f"{ticker} Actual vs Prediction Prices ({model_type})")
    plt.xlabel("Date")
    plt.ylabel("Price ($)")
    plt.legend()
    plt.grid(True)
    plt.xticks(rotation=45)  # slant tick labels so they do not overlap
    plt.tight_layout()
    plt.show()

In [None]:
# Predict next day price
def predict_next_day(model: Any, latest_data: pd.Series) -> float:
    """Predict a single price from one row of engineered features.

    Args:
        model: Fitted estimator exposing ``predict``.
        latest_data: Row (as a Series) containing at least the feature
            entries listed below.

    Returns:
        The model's prediction for that row as a float.
    """
    features = [
        "Close_lag1",
        "Close_lag2",
        "Close_lag3",
        "SMA_5",
        "SMA_10",
        "Volatility_5",
        "Return",
    ]
    # Build a one-row frame with named columns so a model that was fitted on
    # a DataFrame receives the same feature names and order at predict time.
    values = latest_data[features].to_numpy(dtype=float)
    prediction_features = pd.DataFrame([values], columns=features)
    return float(model.predict(prediction_features)[0])

In [None]:
# Main Function

def main():
    """Run the full pipeline for each ticker and print a summary.

    For every ticker: download prices, engineer features, split
    chronologically, train the model(s), evaluate and plot them on the test
    set, and record one prediction. After all tickers are processed, a single
    summary of metrics and predictions is printed.
    """
    start_date = "2018-02-07"
    end_date = "2025-02-07"
    stocks = ["TSLA", "META", "MA", "V", "NFLX", "AMZN", "GOOGL", "JPM", "MSFT"]
    prediction_date = "2025-02-18"

    results = {}

    for ticker in stocks:
        print(f"\nProcessing {ticker}...")

        # Get the stock data
        data = get_stock_data(ticker, start_date, end_date)
        if data is None or data.empty:
            # Nothing to train on; skip this ticker and carry on with the rest.
            print(f"No data returned for {ticker}; skipping.")
            continue

        # Create features from the data
        processed_data = engineer_features(data)

        # Prepare the data for our model
        x, y = prepare_data(processed_data)

        # Split chronologically; the data is not shuffled because order matters.
        x_train, x_test, y_train, y_test = train_test_split_ts(x, y)

        # Train models
        models = {
            "Linear Regression": train_model(x_train, y_train, model_type="lr"),
        }

        results[ticker] = {}
        for model_name, model in models.items():
            # Evaluate model on test data
            eval_results = evaluate_model(model, x_test, y_test)
            # Plot the "Predictions vs Actual Prices"
            plot_predictions(y_test, eval_results["predictions"], ticker, model_name)
            # Predict from the most recent engineered row.
            # NOTE(review): engineer_features drops the final downloaded row
            # (its Target is NaN), so iloc[-1] is the second-to-last day and
            # this predicts the last downloaded close, not the day after.
            # Confirm whether that is the intended "next day".
            next_day_price = predict_next_day(model, processed_data.iloc[-1])
            results[ticker][model_name] = {
                "metrics": eval_results,
                "prediction": next_day_price,
            }

    # Print the summary once, after every ticker is done. Distinct loop names
    # avoid clobbering the `ticker` / `models` variables used above.
    print("\nPrediction Summary for", prediction_date)
    print("-" * 50)
    for summary_ticker, ticker_results in results.items():
        print(f"\n{summary_ticker}:")
        for model_name, result in ticker_results.items():
            print(f" {model_name}:")
            print(f" MSE: {result['metrics']['mse']:.2f}")
            print(f" R2: {result['metrics']['r2']:.2f}")
            print(f" Predicted Price: ${result['prediction']:.2f}")
                      
        

In [None]:
# Run the pipeline only when executed directly (script or notebook cell), so
# importing this file does not trigger downloads and plots.
if __name__ == "__main__":
    main()