In [None]:
!pip install scikit-learn
!pip install yfinance

Collecting scikit-learn
  Using cached scikit_learn-1.7.2.tar.gz (7.2 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25l-

In [None]:
# --- Imports ---
# We import the necessary libraries for our analysis.
# yfinance: To download financial data from Yahoo Finance.
# pandas: For data manipulation and analysis.
# scikit-learn: For building and evaluating our machine learning model.
# numpy: For numerical operations, especially for calculating the root of our error metric.

import yfinance as yf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [10]:
# --- Step 1: Prepare the Data ---

# Download daily price history for Apple ('AAPL') between the given start and end dates.
# `auto_adjust=True` adjusts the Open, High, Low, and Close prices for corporate
# actions such as stock splits and dividends, so prices are comparable over time.
data = yf.download('AAPL', start='2022-01-01', end='2023-01-01', auto_adjust=True)

# yfinance typically reports a failed download (bad ticker, no network, rate limit)
# by returning an empty DataFrame rather than raising. Stop here with a clear
# message instead of letting a later cell fail with a confusing error.
if data.empty:
    raise ValueError(
        "yfinance returned no rows for 'AAPL'; "
        "check the network connection, ticker symbol and date range."
    )

# Our goal is to predict the *next* day's price.
# Shifting 'Close' up by one row pairs each day's features with the following
# trading day's closing price, stored as the target column 'Next_Close'.
data['Next_Close'] = data['Close'].shift(-1)

# The shift leaves a NaN in the last row (it has no "next day").
# The model cannot be fitted on missing values, so rows containing NaN are removed.
data = data.dropna()

# Define the features (X) the model learns from and the target (y) it predicts.
# Note: `auto_adjust=True` removes the separate 'Adj Close' column; 'Volume' is
# still part of the download. It is left out here so that the model uses only
# the price columns.
features = ['Open', 'High', 'Low', 'Close']
X = data[features]
y = data['Next_Close']  # The target is the single column we want to predict.

[*********************100%***********************]  1 of 1 completed


In [11]:
# --- Step 2: Split the Data ---

# Hold back 20% of the rows as a test set; the model is fitted on the remaining 80%
# and later scored on the held-back rows it has never seen.
# Stock prices form a time series, so `shuffle=False` keeps the rows in their
# original order: the training set is the earlier part of the data and the test
# set is the most recent part, which mimics predicting the future from the past.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)




In [12]:
# --- Step 3: Choose & Train a Model ---

# `LinearRegression` is a simple, fundamental model for regression tasks.
# Calling `fit` is where the model "learns" the relationship between the
# features (X_train) and the target (y_train). `fit` returns the fitted
# estimator, so the model is created and trained in a single expression.
model = LinearRegression().fit(X_train, y_train)

print("Model training complete!")




Model training complete!


In [13]:
# --- Step 4: Evaluate the Model ---

# Ask the trained model for predictions on the held-out test features.
predictions = model.predict(X_test)

# Score the predictions against the true target values in `y_test`.
# Mean Squared Error (MSE) is expressed in squared units (dollars-squared), so we
# take its square root to get the Root Mean Squared Error (RMSE), which is in the
# same units as the price (dollars). A lower RMSE indicates a better fit.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print(f"\nModel Evaluation on Test Data:")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")


# --- Step 5: Make a Prediction ---

# A concrete example: compare the final target value in the test set with the
# model's prediction for that same row.
last_actual = y_test.iloc[-1]
last_predicted = predictions[-1]

print(f"\nExample Prediction:")
print(f"Actual Price on Last Day of Test Set: ${last_actual:.2f}")
print(f"Model's Predicted Price for That Day:   ${last_predicted:.2f}")


Model Evaluation on Test Data:
Root Mean Squared Error (RMSE): $3.64

Example Prediction:
Actual Price on Last Day of Test Set: $128.00
Model's Predicted Price for That Day:   $128.95
