# Stock Price Prediction with Machine Learning

This notebook demonstrates how to build a machine learning model to predict stock prices using historical data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime, timedelta
import warnings
# Silence every Python warning for the rest of the session so library
# deprecation chatter does not clutter cell output.
warnings.filterwarnings(action='ignore')

# Notebook-wide plotting defaults: matplotlib style sheet plus seaborn color cycle
plt.style.use(style='seaborn-v0_8')
sns.set_palette(palette="husl")

## 1. Data Collection

We'll use the Yahoo Finance API (through the `yfinance` package) to fetch historical stock data.

In [None]:
# Ticker and date window for the download; END_DATE is today's date as YYYY-MM-DD
STOCK_SYMBOL = 'AAPL'  # Apple Inc.
START_DATE = '2020-01-01'
END_DATE = datetime.now().strftime('%Y-%m-%d')

# Fetch stock data
print(f"Fetching data for {STOCK_SYMBOL} from {START_DATE} to {END_DATE}")
stock_data = yf.download(STOCK_SYMBOL, start=START_DATE, end=END_DATE)

# Fail fast on an empty result (e.g. unknown ticker or a failed request) instead of
# letting the empty frame surface as an obscure error in a later cell.
if stock_data.empty:
    raise ValueError(
        f"No data returned for {STOCK_SYMBOL} between {START_DATE} and {END_DATE}"
    )

print(f"Data shape: {stock_data.shape}")
print(f"Date range: {stock_data.index.min()} to {stock_data.index.max()}")
stock_data.head()

## 2. Data Exploration

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
stock_data.info()
print("\nBasic Statistics:")
print(stock_data.describe())

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(stock_data.isnull().sum())

In [None]:
# 2x2 overview: price history, volume history, and the distributions of
# closing price and day-over-day returns.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle(f'{STOCK_SYMBOL} Stock Analysis', fontsize=16)
(price_ax, volume_ax), (price_hist_ax, returns_hist_ax) = axes

# Top-left: closing price through time
price_ax.plot(stock_data.index, stock_data['Close'], label='Close Price', linewidth=1)
price_ax.set_title('Close Price Over Time')
price_ax.set_ylabel('Price ($)')
price_ax.legend()

# Top-right: traded volume through time
volume_ax.plot(stock_data.index, stock_data['Volume'], color='orange', linewidth=1)
volume_ax.set_title('Trading Volume Over Time')
volume_ax.set_ylabel('Volume')

# Bottom-left: histogram of closing prices
price_hist_ax.hist(stock_data['Close'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
price_hist_ax.set_title('Close Price Distribution')
price_hist_ax.set_xlabel('Price ($)')
price_hist_ax.set_ylabel('Frequency')

# Bottom-right: histogram of daily percentage returns (first row has no prior close, so it is dropped)
daily_returns = stock_data['Close'].pct_change().dropna()
returns_hist_ax.hist(daily_returns, bins=50, alpha=0.7, color='lightgreen', edgecolor='black')
returns_hist_ax.set_title('Daily Returns Distribution')
returns_hist_ax.set_xlabel('Daily Return')
returns_hist_ax.set_ylabel('Frequency')

# Same light grid on every panel
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Feature Engineering

Create technical indicators and features for prediction.

In [None]:
def create_features(data):
    """Return a copy of ``data`` extended with technical-indicator columns.

    ``data`` must provide 'Open', 'High', 'Low', 'Close' and 'Volume' columns.
    If the columns are a MultiIndex, only the first level is kept. The input
    frame itself is not modified.

    Added columns, in order: price range/change features, 5/10/20/50-row
    moving averages of Close, Close-to-MA ratios, 5/20-row rolling standard
    deviations, a 14-row RSI, 20-row Bollinger bands with the relative band
    position, lagged Close/Volume values, and 'Target' (the following row's
    Close). Rows near either edge contain NaN wherever a window, lag or the
    forward shift reaches outside the data.
    """
    df = data.copy()

    # Keep only the first column level when the frame has MultiIndex columns
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    close = df['Close']

    # Intraday price features
    df['Price_Range'] = df['High'] - df['Low']
    df['Price_Change'] = close - df['Open']
    df['Price_Change_Pct'] = (df['Price_Change'] / df['Open']) * 100

    # Simple moving averages of the close
    for window in (5, 10, 20, 50):
        df[f'MA_{window}'] = close.rolling(window=window).mean()

    # Close relative to its short and medium moving average
    for window in (5, 20):
        df[f'Close_MA{window}_Ratio'] = close / df[f'MA_{window}']

    # Volatility as the rolling standard deviation of the close
    for window in (5, 20):
        df[f'Volatility_{window}'] = close.rolling(window=window).std()

    # RSI from 14-row simple averages of upward and downward moves
    change = close.diff()
    avg_gain = change.where(change > 0, 0).rolling(window=14).mean()
    avg_loss = (-change.where(change < 0, 0)).rolling(window=14).mean()
    relative_strength = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + relative_strength))

    # Bollinger bands: 20-row mean +/- two rolling standard deviations
    band_offset = 2 * close.rolling(window=20).std()
    df['BB_Middle'] = close.rolling(window=20).mean()
    df['BB_Upper'] = df['BB_Middle'] + band_offset
    df['BB_Lower'] = df['BB_Middle'] - band_offset
    df['BB_Position'] = (close - df['BB_Lower']) / (df['BB_Upper'] - df['BB_Lower'])

    # Past values of close and volume
    for lag in (1, 2, 3, 5, 10):
        df[f'Close_Lag_{lag}'] = close.shift(lag)
        df[f'Volume_Lag_{lag}'] = df['Volume'].shift(lag)

    # Prediction target: the close of the following row
    df['Target'] = close.shift(-1)

    return df

# Build the engineered feature table from the raw download and report what was added
df_features = create_features(stock_data)
feature_names = df_features.columns.tolist()
print(f"Features created. Shape: {df_features.shape}")
print(f"\nFeature columns: {feature_names}")

## 4. Data Preprocessing

In [None]:
# Keep only rows where every engineered column (and the target) is populated
df_clean = df_features.dropna()
print(f"After removing NaN values: {df_clean.shape}")

# Model inputs, assembled group by group (order matters: it is reused when
# pairing names with feature importances later on)
feature_columns = (
    ['Open', 'High', 'Low', 'Volume']
    + ['Price_Range', 'Price_Change', 'Price_Change_Pct']
    + [f'MA_{window}' for window in (5, 10, 20, 50)]
    + ['Close_MA5_Ratio', 'Close_MA20_Ratio']
    + ['Volatility_5', 'Volatility_20']
    + ['RSI', 'BB_Position']
    + [f'Close_Lag_{lag}' for lag in (1, 2, 3, 5, 10)]
    + [f'Volume_Lag_{lag}' for lag in (1, 2, 3, 5, 10)]
)

X = df_clean[feature_columns]
y = df_clean['Target']

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Chronological split: shuffle=False keeps the final 20% of rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2, random_state=42
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Standardize using statistics learned from the training rows only,
# then apply the same transform to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures scaled successfully!")

## 5. Model Training

Train multiple machine learning models and compare their performance.

In [None]:
# Candidate regressors, keyed by the display name used in the reports below
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

# Fit each model on the training rows and score it on the held-out rows
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Only the linear model is fed the standardized matrices; the forest gets raw features
    wants_scaled = name == 'Linear Regression'
    train_inputs = X_train_scaled if wants_scaled else X_train
    test_inputs = X_test_scaled if wants_scaled else X_test

    model.fit(train_inputs, y_train)
    y_pred = model.predict(test_inputs)

    # Error metrics on the test rows; RMSE is derived from MSE
    mse = mean_squared_error(y_test, y_pred)
    scores = {
        'MSE': mse,
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mse),
        'R²': r2_score(y_test, y_pred),
    }

    results[name] = {'model': model, 'predictions': y_pred, **scores}

    for label, value in scores.items():
        print(f"  {label}: {value:.4f}")
    print()

print("Model training completed!")

## 6. Model Evaluation and Visualization

In [None]:
# One row per trained model, one column per metric, in training order
metric_names = ['MSE', 'MAE', 'RMSE', 'R²']
metrics_df = pd.DataFrame(
    [
        {'Model': model_name, **{metric: scores[metric] for metric in metric_names}}
        for model_name, scores in results.items()
    ],
    columns=['Model', *metric_names],
)

print("Model Performance Comparison:")
print(metrics_df.round(4))

In [None]:
# 2x2 diagnostics: predictions through time, predicted-vs-actual per model,
# and the Random Forest residuals.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Model Predictions vs Actual Values', fontsize=16)
(series_ax, lr_ax), (rf_ax, residual_ax) = axes

# Top-left: actual closes and each model's predictions over the test dates
test_dates = df_clean.index[-len(y_test):]
series_ax.plot(test_dates, y_test.values, label='Actual', linewidth=2, alpha=0.8)
for name, result in results.items():
    series_ax.plot(test_dates, result['predictions'], label=f'{name} Prediction', linewidth=1, alpha=0.7)
series_ax.set_title('Time Series Prediction Comparison')
series_ax.set_ylabel('Stock Price ($)')
series_ax.legend()

# Top-right / bottom-left: predicted vs actual, with the y = x reference line in dashed red
lr_pred = results['Linear Regression']['predictions']
rf_pred = results['Random Forest']['predictions']
actual_span = [y_test.min(), y_test.max()]
for panel, model_label, predicted, dot_color in (
    (lr_ax, 'Linear Regression', lr_pred, 'blue'),
    (rf_ax, 'Random Forest', rf_pred, 'green'),
):
    panel.scatter(y_test, predicted, alpha=0.6, color=dot_color)
    panel.plot(actual_span, actual_span, 'r--', lw=2)
    panel.set_title(f'{model_label}: Predicted vs Actual')
    panel.set_xlabel('Actual Price ($)')
    panel.set_ylabel('Predicted Price ($)')

# Bottom-right: Random Forest residuals (actual minus predicted) against the prediction
rf_residuals = y_test.values - rf_pred
residual_ax.scatter(rf_pred, rf_residuals, alpha=0.6, color='red')
residual_ax.axhline(y=0, color='black', linestyle='--')
residual_ax.set_title('Random Forest: Residuals Plot')
residual_ax.set_xlabel('Predicted Price ($)')
residual_ax.set_ylabel('Residuals ($)')

# Same light grid on every panel
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Feature Importance Analysis

In [None]:
# Pair each input column with the fitted forest's importance score, highest first
rf_model = results['Random Forest']['model']
importance_table = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values('Importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(bar_positions, top_features['Importance'], color='skyblue')
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top_features['Feature'])
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Top 15 Feature Importances (Random Forest)')
importance_ax.invert_yaxis()  # put the most important feature at the top
importance_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

## 8. Make Future Predictions

In [None]:
# Use the best performing model (highest test-set R²) for the forward-looking prediction
best_model_name = metrics_df.loc[metrics_df['R²'].idxmax(), 'Model']
best_model = results[best_model_name]['model']

print(f"Best performing model: {best_model_name}")
print(f"R² Score: {results[best_model_name]['R²']:.4f}")

# Take the most recent row with a complete feature set from df_features rather than
# from X / df_clean: dropna() removed the latest trading day there because its Target
# (the following close) is still NaN, so X.iloc[-1:] would only re-predict a close
# that is already present in the data.
last_features = df_features[feature_columns].dropna().iloc[-1:]
if last_features.empty:
    raise ValueError("No row with a complete set of features is available for prediction")

# Close on the same date as the feature row, so the change is measured from that day
last_close = df_features.loc[last_features.index[-1], 'Close']
print(f"\nLast known closing price: ${last_close:.2f}")

# Make prediction (the linear model was trained on standardized inputs, the forest on raw ones)
if best_model_name == 'Linear Regression':
    last_features_scaled = scaler.transform(last_features)
    next_day_prediction = best_model.predict(last_features_scaled)[0]
else:
    next_day_prediction = best_model.predict(last_features)[0]

predicted_change = next_day_prediction - last_close
print(f"Predicted next day closing price: ${next_day_prediction:.2f}")
print(f"Predicted change: ${predicted_change:.2f}")
print(f"Predicted change (%): {(predicted_change / last_close * 100):.2f}%")

## 9. Model Summary and Conclusions

In [None]:
# Reference price for the summary: the close on the date of the feature row that
# produced next_day_prediction. Looking it up through last_features keeps the
# "current" price and the reported change on the same day as the prediction input.
reference_close = df_features.loc[last_features.index[-1], 'Close']
banner = "=" * 60

print(banner)
print("STOCK PRICE PREDICTION MODEL SUMMARY")
print(banner)
print(f"Stock Symbol: {STOCK_SYMBOL}")
print(f"Data Period: {START_DATE} to {END_DATE}")
print(f"Total Data Points: {len(df_clean)}")
print(f"Number of Features: {len(feature_columns)}")
print(f"Training Set Size: {len(X_train)}")
print(f"Test Set Size: {len(X_test)}")
print()
print("MODEL PERFORMANCE:")
print(metrics_df.to_string(index=False))
print()
print(f"Best Model: {best_model_name}")
print(f"Best R² Score: {results[best_model_name]['R²']:.4f}")
print(f"Best RMSE: ${results[best_model_name]['RMSE']:.2f}")
print()
print("NEXT DAY PREDICTION:")
print(f"Current Price: ${reference_close:.2f}")
print(f"Predicted Price: ${next_day_prediction:.2f}")
print(f"Predicted Change: ${next_day_prediction - reference_close:.2f}")
print()
print("IMPORTANT DISCLAIMERS:")
print("- This model is for educational purposes only")
print("- Past performance does not guarantee future results")
print("- Stock market predictions are inherently uncertain")
print("- Always consult with financial advisors before making investment decisions")
print(banner)