In [1]:
# Standard library
import math

# Core data / plotting stack
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Optional modelling back-ends.
# The ARIMA and LSTM cells are gated on ARIMA_AVAILABLE / LSTM_AVAILABLE, so
# both flags must be defined here whether or not the import succeeds.
# Only import what's available
try:
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.stattools import adfuller  # used by the stationarity check
    ARIMA_AVAILABLE = True
except ImportError:
    print("statsmodels not available, ARIMA will not work")
    ARIMA_AVAILABLE = False

try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense
    from tensorflow.keras.callbacks import EarlyStopping  # used when training the LSTM
    LSTM_AVAILABLE = True
except ImportError:
    print("tensorflow not available, LSTM will not work")
    LSTM_AVAILABLE = False

ModuleNotFoundError: No module named 'yfinance'

In [None]:
"""
Stock Price Prediction - Data Collection and Preprocessing
This script downloads historical stock data and prepares it for model training
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import os
import pickle

# Create output directories if they don't exist.
# 'results' is included because the forecast-comparison cell writes
# 'results/future_predictions_combined.csv'.
for _out_dir in ('plots', 'data', 'results'):
    os.makedirs(_out_dir, exist_ok=True)

# Configuration
TICKER = "AAPL"
START_DATE = "2018-01-01"
END_DATE = datetime.now().strftime('%Y-%m-%d')
SEQUENCE_LENGTH = 60  # 60 days of history for prediction
SPLIT_RATIO = 0.8     # chronological train fraction; read by the split, ARIMA and summary cells
FUTURE_DAYS = 30      # forecast horizon read by the ARIMA/LSTM cells
                      # NOTE(review): 30 is an assumed default - the original never defined it

print(f"Downloading data for {TICKER} from {START_DATE} to {END_DATE}")

# Download data and run the whole preprocessing pipeline.
# The try/except only adds a readable message: the exception is re-raised so a
# failed run is visible instead of leaving later cells with a missing or
# half-built `data`.
try:
    data = yf.download(TICKER, start=START_DATE, end=END_DATE)

    # NOTE(review): yfinance can return an empty frame instead of raising
    # (bad ticker, network failure) - fail here with a clear message.
    if data.empty:
        raise ValueError(
            f"No data returned for {TICKER} between {START_DATE} and {END_DATE}"
        )

    # NOTE(review): newer yfinance releases return (field, ticker) MultiIndex
    # columns even for one ticker; flatten so data['Close'] is a Series.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    print(f"Downloaded {len(data)} days of data")

    # Basic data exploration
    print("\nData Overview:")
    data.info()  # info() prints by itself and returns None
    print("\nDescriptive Statistics:")
    print(data.describe())

    # Check for missing values
    missing_values = data.isnull().sum()
    print("\nMissing Values:")
    print(missing_values)

    # Plot the closing prices
    plt.figure(figsize=(12, 6))
    plt.plot(data['Close'])
    plt.title(f'{TICKER} Stock Price')
    plt.xlabel('Date')
    plt.ylabel('Price (USD)')
    plt.grid(True)
    plt.savefig('plots/stock_price_plot.png')
    plt.close()

    # Add technical indicators
    def add_technical_indicators(df):
        """Return a copy of `df` with indicator columns derived from 'Close'.

        Adds MA20, MA50, 20d_std, upper_band, lower_band, RSI, EMA12, EMA26,
        MACD and Signal_Line. The input frame is left untouched.
        """
        print("Adding technical indicators...")
        df = df.copy()  # do not mutate the caller's frame

        # Moving averages
        df['MA20'] = df['Close'].rolling(window=20).mean()
        df['MA50'] = df['Close'].rolling(window=50).mean()

        # Bollinger Bands: MA20 +/- 2 rolling standard deviations
        df['20d_std'] = df['Close'].rolling(window=20).std()
        df['upper_band'] = df['MA20'] + (df['20d_std'] * 2)
        df['lower_band'] = df['MA20'] - (df['20d_std'] * 2)

        # RSI (14-day), simple rolling means of gains and losses
        delta = df['Close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        df['RSI'] = 100 - (100 / (1 + rs))

        # MACD: EMA12 - EMA26, with a 9-span EMA signal line
        df['EMA12'] = df['Close'].ewm(span=12, adjust=False).mean()
        df['EMA26'] = df['Close'].ewm(span=26, adjust=False).mean()
        df['MACD'] = df['EMA12'] - df['EMA26']
        df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

        return df

    data = add_technical_indicators(data)

    # Plot some technical indicators
    plt.figure(figsize=(12, 8))

    plt.subplot(3, 1, 1)
    plt.plot(data['Close'], label='Close Price')
    plt.plot(data['MA20'], label='20-day MA')
    plt.plot(data['MA50'], label='50-day MA')
    plt.title(f'{TICKER} Price and Moving Averages')
    plt.legend()
    plt.grid(True)

    plt.subplot(3, 1, 2)
    plt.plot(data['RSI'], label='RSI')
    plt.axhline(y=70, color='r', linestyle='--')
    plt.axhline(y=30, color='g', linestyle='--')
    plt.title('Relative Strength Index (RSI)')
    plt.legend()
    plt.grid(True)

    plt.subplot(3, 1, 3)
    plt.plot(data['MACD'], label='MACD')
    plt.plot(data['Signal_Line'], label='Signal Line')
    plt.title('MACD and Signal Line')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('plots/technical_indicators.png')
    plt.close()

    # Remove NaN values (the rolling-window warm-up rows)
    data = data.dropna()
    print(f"Data after removing NaN values: {len(data)} rows")

    # Save the preprocessed data
    data.to_csv('data/preprocessed_data.csv')

    # Feature selection ('Close' must stay first: it is the target column)
    features = ['Close', 'Volume', 'MA20', 'MA50', 'RSI', 'MACD', 'upper_band', 'lower_band']
    target = 'Close'

    # Work out the chronological split before scaling so the scaler only sees
    # training rows.
    split_ratio = 0.8
    n_sequences = len(data) - SEQUENCE_LENGTH
    if n_sequences <= 0:
        raise ValueError(
            f"Need more than {SEQUENCE_LENGTH} rows after cleaning, got {len(data)}"
        )
    split_index = int(split_ratio * n_sequences)
    # Training windows and their targets use rows [0, split_index + SEQUENCE_LENGTH).
    n_train_rows = split_index + SEQUENCE_LENGTH

    # Scale the features. Fit on training rows only: fitting on the full
    # history would leak the test period's min/max into training.
    scaler = MinMaxScaler()
    scaler.fit(data[features].iloc[:n_train_rows])
    scaled_data = scaler.transform(data[features])
    scaled_df = pd.DataFrame(scaled_data, columns=features, index=data.index)

    # Create sequences for LSTM
    def create_sequences(data, seq_length):
        """Build sliding windows of `seq_length` rows and next-row targets.

        Returns (X, y): X[i] is rows i .. i+seq_length-1 and y[i] is column 0
        ('Close') of row i+seq_length.
        """
        X, y = [], []
        for i in range(len(data) - seq_length):
            X.append(data[i:i+seq_length])
            y.append(data[i+seq_length, 0])  # 'Close' is the first column
        return np.array(X), np.array(y)

    # Create sequences
    X, y = create_sequences(scaled_data, SEQUENCE_LENGTH)
    print(f"Created {len(X)} sequences of length {SEQUENCE_LENGTH}")

    # Split data into training and testing sets (chronological, no shuffling)
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    print(f"Training data shape: {X_train.shape}, {y_train.shape}")
    print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

    # Save processed data
    np.save('data/X_train.npy', X_train)
    np.save('data/y_train.npy', y_train)
    np.save('data/X_test.npy', X_test)
    np.save('data/y_test.npy', y_test)

    # Also save the scaler for later use
    # NOTE(review): only unpickle this file from a trusted location.
    with open('data/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    print("Data preprocessing completed successfully!")
    print("Files saved in 'data' directory")

except Exception as e:
    print(f"An error occurred: {e}")
    raise  # surface the failure instead of silently continuing


In [None]:
# Download stock data
print(f"Downloading data for {TICKER} from {START_DATE} to {END_DATE}...")
data = yf.download(TICKER, start=START_DATE, end=END_DATE)

# NOTE(review): yfinance can return an empty frame instead of raising
# (bad ticker, network failure); stop here rather than in a later cell.
if data.empty:
    raise RuntimeError(
        f"No data returned for {TICKER} between {START_DATE} and {END_DATE}"
    )

# NOTE(review): newer yfinance releases return (field, ticker) MultiIndex
# columns even for one ticker; flatten so data['Close'] is a Series.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# Basic info
print(f"Downloaded {len(data)} days of data")
print("\nFirst 5 rows:")
display(data.head())

print("\nDescriptive Statistics:")
display(data.describe())

# Check for missing values (per-column NaN counts)
missing_values = data.isnull().sum()
print("\nMissing Values:")
print(missing_values)

In [None]:
# Closing-price history on its own figure
price_fig, price_ax = plt.subplots(figsize=(14, 7))
price_ax.plot(data['Close'])
price_ax.set_title(f'{TICKER} Stock Price History')
price_ax.set_xlabel('Date')
price_ax.set_ylabel('Price (USD)')
price_ax.grid(True)
plt.show()

# Traded volume as a bar chart on a second figure
volume_fig, volume_ax = plt.subplots(figsize=(14, 7))
volume_ax.bar(data.index, data['Volume'])
volume_ax.set_title(f'{TICKER} Trading Volume')
volume_ax.set_xlabel('Date')
volume_ax.set_ylabel('Volume')
volume_ax.grid(True)
plt.show()

In [None]:
# Add technical indicators
def add_technical_indicators(df):
    """Return a copy of `df` extended with indicators derived from 'Close'.

    The input frame is not modified, so re-running the calling cell or
    inspecting the raw download afterwards still shows the original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with a 'Close' column.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with these extra columns: MA20, MA50, 20d_std,
        upper_band, lower_band, RSI, EMA12, EMA26, MACD, Signal_Line.
        Rolling-window columns are NaN until their window is full.
    """
    print("Adding technical indicators...")
    df = df.copy()  # work on a copy: never mutate the caller's frame
    close = df['Close']

    # Moving averages
    df['MA20'] = close.rolling(window=20).mean()
    df['MA50'] = close.rolling(window=50).mean()

    # Bollinger Bands: MA20 +/- 2 rolling standard deviations
    df['20d_std'] = close.rolling(window=20).std()
    df['upper_band'] = df['MA20'] + (df['20d_std'] * 2)
    df['lower_band'] = df['MA20'] - (df['20d_std'] * 2)

    # RSI (14-day) from simple rolling means of gains and losses.
    # If both means are zero (flat window) rs is 0/0 and RSI is NaN.
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # MACD: EMA12 - EMA26, with a 9-span EMA of MACD as the signal line
    df['EMA12'] = close.ewm(span=12, adjust=False).mean()
    df['EMA26'] = close.ewm(span=26, adjust=False).mean()
    df['MACD'] = df['EMA12'] - df['EMA26']
    df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

    return df

# Apply technical indicators: rebind `data` to the frame returned by the helper (now carrying the indicator columns)
data = add_technical_indicators(data)

In [None]:
# Technical indicators: three stacked panels in one figure
indicator_fig, (price_ax, rsi_ax, macd_ax) = plt.subplots(3, 1, figsize=(16, 12))

# Panel 1: close price with its 20- and 50-day moving averages
for column, legend_label in (('Close', 'Close Price'),
                             ('MA20', '20-day MA'),
                             ('MA50', '50-day MA')):
    price_ax.plot(data.index, data[column], label=legend_label)
price_ax.set_title(f'{TICKER} Close Price and Moving Averages')

# Panel 2: RSI with guide lines at 70 (red) and 30 (green)
rsi_ax.plot(data.index, data['RSI'], label='RSI')
rsi_ax.axhline(y=70, color='r', linestyle='--')
rsi_ax.axhline(y=30, color='g', linestyle='--')
rsi_ax.set_title('Relative Strength Index (RSI)')

# Panel 3: MACD against its signal line
for column, legend_label in (('MACD', 'MACD'), ('Signal_Line', 'Signal Line')):
    macd_ax.plot(data.index, data[column], label=legend_label)
macd_ax.set_title('MACD and Signal Line')

# Shared cosmetics for every panel
for panel in (price_ax, rsi_ax, macd_ax):
    panel.legend()
    panel.grid(True)

indicator_fig.tight_layout()
plt.show()

In [None]:
# Remove NaN values (the rolling-window warm-up rows)
data = data.dropna()
print(f"Data after removing NaN values: {len(data)} rows")

if len(data) <= SEQUENCE_LENGTH:
    raise ValueError(
        f"Need more than {SEQUENCE_LENGTH} rows after cleaning, got {len(data)}"
    )

# Feature selection ('Close' must stay first: the sequence builder reads column 0 as target)
features = ['Close', 'Volume', 'MA20', 'MA50', 'RSI', 'MACD', 'upper_band', 'lower_band']
target = 'Close'

# Rows that belong to training. This matches the split made later in this
# cell: split_index = int(SPLIT_RATIO * len(X)) with
# len(X) == len(data) - SEQUENCE_LENGTH, and training targets reach row
# split_index + SEQUENCE_LENGTH - 1.
n_train_rows = int(SPLIT_RATIO * (len(data) - SEQUENCE_LENGTH)) + SEQUENCE_LENGTH

# Scale the features. Fit on training rows only: fitting on the full history
# would leak the test period's min/max into training. Test values may
# therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(data[features].iloc[:n_train_rows])
scaled_data = scaler.transform(data[features])
scaled_df = pd.DataFrame(scaled_data, columns=features, index=data.index)

# Create sequences for LSTM
def create_sequences(data, seq_length, target_col=0):
    """Build sliding windows and next-step targets from a 2-D feature table.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_features)
        Feature matrix in chronological order (ndarray, DataFrame, nested list).
    seq_length : int
        Number of consecutive rows in each input window; must be >= 1.
    target_col : int, default 0
        Column whose value in the row right after each window is the target.
        The default keeps the original behaviour ('Close' is the first column).

    Returns
    -------
    X : numpy.ndarray of shape (n_rows - seq_length, seq_length, n_features)
    y : numpy.ndarray of shape (n_rows - seq_length,)
        With too few rows both arrays are empty but keep their trailing
        dimensions, so downstream `.shape[1]` / `.shape[2]` lookups still work.

    Raises
    ------
    ValueError
        If `data` is not 2-D or `seq_length` is smaller than 1.
    """
    values = np.asarray(data)
    if values.ndim != 2:
        raise ValueError(f"data must be 2-D (rows x features), got ndim={values.ndim}")
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    n_windows = len(values) - seq_length
    if n_windows <= 0:
        return (np.empty((0, seq_length, values.shape[1]), dtype=values.dtype),
                np.empty((0,), dtype=values.dtype))

    # X[i] = rows i .. i+seq_length-1, y[i] = target column of row i+seq_length
    X = np.stack([values[start:start + seq_length] for start in range(n_windows)])
    y = values[seq_length:, target_col].copy()
    return X, y

# Create sequences
X, y = create_sequences(scaled_data, SEQUENCE_LENGTH)
print(f"Created {len(X)} sequences of length {SEQUENCE_LENGTH}")

# Split data into training and testing sets (chronological, no shuffling)
split_index = int(SPLIT_RATIO * len(X))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

# y[i] is the Close of row i + SEQUENCE_LENGTH in `data`, so the first test
# target sits at this row. Using one boundary for everything below keeps the
# LSTM targets, the ARIMA hold-out and the plotting dates on the same days.
test_start_row = split_index + SEQUENCE_LENGTH

# Also prepare data for ARIMA (non-scaled), split at the same row so both
# models are scored on the same test period
train_data = data['Close'].iloc[:test_start_row]
test_data = data['Close'].iloc[test_start_row:]

# Save test dates for later visualization (one date per element of y_test)
test_dates = data.index[test_start_row:]
assert len(test_dates) == len(y_test), "test_dates must line up with y_test"

In [None]:
if ARIMA_AVAILABLE:
    # Check stationarity
    def check_stationarity(timeseries):
        """Run the Augmented Dickey-Fuller test, print the result, return the p-value."""
        result = adfuller(timeseries)
        print('ADF Statistic: %f' % result[0])
        print('p-value: %f' % result[1])
        print('Critical Values:')
        for key, value in result[4].items():
            print('\t%s: %.3f' % (key, value))

        # If p-value is less than 0.05, data is stationary
        if result[1] <= 0.05:
            print("The time series is stationary")
        else:
            print("The time series is not stationary")
        return result[1]

    # Test stationarity (keep the p-value so the test is not run twice)
    print("Testing stationarity of time series...")
    train_p_value = check_stationarity(train_data)

    # Determine if differencing is needed
    if train_p_value > 0.05:
        print("\nApplying differencing to make the series stationary...")
        train_diff = train_data.diff().dropna()
        print("After differencing:")
        check_stationarity(train_diff)
        diff_order = 1
    else:
        train_diff = train_data
        diff_order = 0

    # Fit ARIMA model (the model differences internally via `diff_order`)
    print("\nTraining ARIMA model...")
    try:
        arima_order = (5, diff_order, 1)
        model = ARIMA(train_data, order=arima_order)
        model_fit = model.fit()
        print("ARIMA model summary:")
        print(model_fit.summary())

        # Make predictions
        print("\nMaking predictions with ARIMA model...")
        arima_predictions = model_fit.forecast(steps=len(test_data))

        # Compare positionally on plain arrays.
        # NOTE(review): forecast() may come back indexed differently from
        # test_data (e.g. positionally when the date index has no frequency);
        # Series arithmetic aligns on labels, which would make MAPE NaN.
        actual_values = np.asarray(test_data, dtype=float).ravel()
        predicted_values = np.asarray(arima_predictions, dtype=float).ravel()

        # Calculate error metrics
        arima_rmse = math.sqrt(mean_squared_error(actual_values, predicted_values))
        arima_mae = mean_absolute_error(actual_values, predicted_values)
        arima_mape = np.mean(np.abs((actual_values - predicted_values) / actual_values)) * 100

        print(f"ARIMA RMSE: {arima_rmse:.2f}")
        print(f"ARIMA MAE: {arima_mae:.2f}")
        print(f"ARIMA MAPE: {arima_mape:.2f}%")

        # Plot results
        plt.figure(figsize=(14, 7))
        plt.plot(train_data.index[-30:], train_data.iloc[-30:], label='Training Data')
        plt.plot(test_data.index, actual_values, label='Actual Price')
        plt.plot(test_data.index, predicted_values, label='ARIMA Predictions', color='red')
        plt.title(f'ARIMA: Stock Price Prediction for {TICKER}')
        plt.xlabel('Date')
        plt.ylabel('Price (USD)')
        plt.legend()
        plt.grid(True)
        plt.show()

        # Make future predictions with ARIMA.
        # `model_fit` ends at the last training row, so forecasting from it
        # would just repeat the start of the test period. Refit on the whole
        # series so the forecast begins after the last observed day.
        print("\nPredicting future prices with ARIMA...")
        full_fit = ARIMA(data['Close'], order=arima_order).fit()
        arima_future = full_fit.forecast(steps=FUTURE_DAYS)
        # Each forecast step is one trading day, so label with business days.
        # NOTE(review): exchange holidays are not excluded.
        future_dates = pd.date_range(start=data.index[-1], periods=FUTURE_DAYS+1, freq='B')[1:]

        arima_available_for_comparison = True

    except Exception as e:
        print(f"Error in ARIMA model: {e}")
        arima_available_for_comparison = False

else:
    print("Skipping ARIMA model due to missing dependencies")
    arima_available_for_comparison = False

In [None]:
if LSTM_AVAILABLE:
    n_timesteps, n_features = X_train.shape[1], X_train.shape[2]

    def _scaled_close_to_price(scaled_close):
        """Convert scaled Close values back to prices with the fitted `scaler`.

        The scaler was fitted on every feature column, so the values are
        placed in column 0 ('Close') of a zero-padded matrix and only that
        column is read back after the inverse transform.
        """
        padded = np.zeros((len(scaled_close), n_features))
        padded[:, 0] = np.asarray(scaled_close).ravel()
        return scaler.inverse_transform(padded)[:, 0]

    print("Building LSTM model...")
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(n_timesteps, n_features)))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(25))
    model.add(Dense(1))

    # Compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()

    # Train model with early stopping; restore_best_weights keeps the best
    # validation epoch rather than the last one reached before stopping
    print("\nTraining LSTM model...")
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=1
    )

    # Plot training history
    plt.figure(figsize=(12, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('LSTM Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Make predictions
    print("\nMaking predictions with LSTM model...")
    y_pred = model.predict(X_test)

    # Inverse transform predictions to original scale
    y_pred_original = _scaled_close_to_price(y_pred)
    y_test_original = _scaled_close_to_price(y_test)

    # Calculate metrics
    lstm_rmse = math.sqrt(mean_squared_error(y_test_original, y_pred_original))
    lstm_mae = mean_absolute_error(y_test_original, y_pred_original)
    lstm_mape = np.mean(np.abs((y_test_original - y_pred_original) / y_test_original)) * 100

    print(f"LSTM RMSE: {lstm_rmse:.2f}")
    print(f"LSTM MAE: {lstm_mae:.2f}")
    print(f"LSTM MAPE: {lstm_mape:.2f}%")

    # Plot results. y_test is the tail of the targets, whose last element is
    # the last row of `data`, so the matching dates are the last len(y_test)
    # index entries (guarantees x and y have equal length).
    lstm_test_dates = data.index[-len(y_test_original):]
    plt.figure(figsize=(14, 7))
    plt.plot(lstm_test_dates, y_test_original, label='Actual Price')
    plt.plot(lstm_test_dates, y_pred_original, label='LSTM Predictions', color='green')
    plt.title(f'LSTM: Stock Price Prediction for {TICKER}')
    plt.xlabel('Date')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Generate future predictions
    print("\nPredicting future prices with LSTM...")
    # Seed with the most recent n_timesteps rows. X_test[-1] ends one row
    # earlier (its target is the last known day), so starting from it would
    # re-predict a day that is already observed.
    last_sequence = scaled_data[-n_timesteps:].reshape(1, n_timesteps, n_features).copy()
    lstm_future = []

    for _ in range(FUTURE_DAYS):
        # Make prediction with current sequence
        next_pred = model.predict(last_sequence, verbose=0)
        next_close = next_pred[0, 0]
        lstm_future.append(next_close)

        # Slide the window: drop the oldest step and append a new row.
        # Only Close is predicted; the other features are carried forward
        # from the latest row. Zeros would mean "at the scaled minimum" for
        # every indicator and distort the next prediction.
        next_row = last_sequence[0, -1].copy()
        next_row[0] = next_close
        last_sequence = np.concatenate(
            [last_sequence[:, 1:, :], next_row.reshape(1, 1, n_features)], axis=1
        )

    # Scale back the predictions
    lstm_future_original = _scaled_close_to_price(lstm_future)

    # Create future dates: one trading day per step.
    # NOTE(review): exchange holidays are not excluded.
    future_dates = pd.date_range(start=data.index[-1], periods=FUTURE_DAYS+1, freq='B')[1:]

    lstm_available_for_comparison = True

else:
    print("Skipping LSTM model due to missing dependencies")
    lstm_available_for_comparison = False

In [None]:
# Which models produced results? The flags are missing entirely if a model
# cell was never run, hence the lookup with a default.
arima_ready = bool(globals().get('arima_available_for_comparison', False))
lstm_ready = bool(globals().get('lstm_available_for_comparison', False))

# Plain arrays: forecasts are paired with `future_dates` by position, never
# by whatever index the model attached to them.
arima_future_values = np.asarray(arima_future, dtype=float).ravel() if arima_ready else None
lstm_future_values = np.asarray(lstm_future_original, dtype=float).ravel() if lstm_ready else None

# Future predictions visualization
recent_close = data['Close'].iloc[-60:]
plt.figure(figsize=(14, 7))
plt.plot(recent_close.index, recent_close, label='Historical Price', color='blue')

if arima_ready:
    plt.plot(future_dates, arima_future_values, label='ARIMA Forecast', color='red', linestyle='--')

if lstm_ready:
    plt.plot(future_dates, lstm_future_values, label='LSTM Forecast', color='green', linestyle='-.')

plt.title(f'{TICKER} Stock Price Forecast (Next {FUTURE_DAYS} Days)')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.grid(True)
plt.show()

# Compare model performance if both are available
if arima_ready and lstm_ready:

    # Create comparison dataframe
    comparison_results = {
        'ARIMA_RMSE': arima_rmse,
        'ARIMA_MAE': arima_mae,
        'ARIMA_MAPE': arima_mape,
        'LSTM_RMSE': lstm_rmse,
        'LSTM_MAE': lstm_mae,
        'LSTM_MAPE': lstm_mape,
    }

    # Compare models (lower RMSE wins; ties go to ARIMA as before)
    if lstm_rmse < arima_rmse:
        comparison_results['Better_Model'] = 'LSTM'
        print("\nLSTM model performed better based on RMSE")
    else:
        comparison_results['Better_Model'] = 'ARIMA'
        print("\nARIMA model performed better based on RMSE")

    comparison_df = pd.DataFrame([comparison_results])
    display(comparison_df)

    # Create dataframe for future predictions
    future_df = pd.DataFrame({
        'Date': future_dates,
        'ARIMA_Forecast': arima_future_values,
        'LSTM_Forecast': lstm_future_values
    }).set_index('Date')

    print("\nFuture price predictions:")
    display(future_df.head())

    # Save the predictions; make sure the target directory exists first
    os.makedirs('results', exist_ok=True)
    future_df.to_csv('results/future_predictions_combined.csv')
    print("Future predictions saved to 'results/future_predictions_combined.csv'")

else:
    print("\nCould not compare models because one or both models were not available")

    # Check if at least one model is available for future predictions
    if lstm_ready:
        future_df = pd.DataFrame({
            'Date': future_dates,
            'LSTM_Forecast': lstm_future_values
        }).set_index('Date')

        print("\nFuture price predictions from LSTM:")
        display(future_df.head())

    elif arima_ready:
        future_df = pd.DataFrame({
            'Date': future_dates,
            'ARIMA_Forecast': arima_future_values
        }).set_index('Date')

        print("\nFuture price predictions from ARIMA:")
        display(future_df.head())

In [2]:
# Print summary
print("Stock Price Prediction Project - Summary")
print("----------------------------------------")
print(f"Stock Ticker: {TICKER}")
print(f"Date Range: {START_DATE} to {END_DATE}")
# Percent format avoids float artefacts such as "19.999999999999996%"
print(f"Training/Testing Split: {SPLIT_RATIO:.0%}/{1 - SPLIT_RATIO:.0%}")

# Metrics are only printed for models whose cell ran successfully; the flag
# does not exist at all if that cell was skipped, hence the default.
if globals().get('arima_available_for_comparison', False):
    print("\nARIMA Model Performance:")
    print(f"  RMSE: {arima_rmse:.2f}")
    print(f"  MAE: {arima_mae:.2f}")
    print(f"  MAPE: {arima_mape:.2f}%")

if globals().get('lstm_available_for_comparison', False):
    print("\nLSTM Model Performance:")
    print(f"  RMSE: {lstm_rmse:.2f}")
    print(f"  MAE: {lstm_mae:.2f}")
    print(f"  MAPE: {lstm_mape:.2f}%")

# Save the notebook for future reference
print("\nRemember to save your notebook!")

Stock Price Prediction Project - Summary
----------------------------------------


NameError: name 'TICKER' is not defined