In [None]:
# %% [markdown]
# # Data Preparation
# This notebook cleans the stock data, adds technical indicators, creates a target column,
# and saves the processed dataset for model training.

# %%
# Import necessary libraries
import sys
from pathlib import Path
import pandas as pd

# Go one folder up from 'notebooks' to reach the main project directory
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from src.dataload import load_stock_data  # src/dataload.py provides load_stock_data



Downloading fresh data...
Downloading AAPL data from 2025-01-01 to 2025-08-18...


  df = yf.download(ticker, start=start_date, end=end_date, interval=interval)
[*********************100%***********************]  1 of 1 completed


✅ Data saved to C:\Users\Steph\Desktop\Projects\stock-price-movement-predictor\data\raw\AAPL_2025-01-01_2025-08-18_daily.csv
Raw data shape: (155, 5)


KeyError: 'Date'

In [5]:
print(df.head())
print(df.columns)


Price            Close        High         Low        Open    Volume
Ticker            AAPL        AAPL        AAPL        AAPL      AAPL
Date                                                                
2025-01-02  242.987411  248.218840  240.964594  248.049428  55740700
2025-01-03  242.499161  243.316252  241.034359  242.499161  40244100
2025-01-06  244.133347  246.455106  242.339711  243.445785  45045600
2025-01-07  241.353226  244.681407  240.496267  242.120491  40856000
2025-01-08  241.841492  242.847929  239.200872  241.064252  37628900
MultiIndex([( 'Close', 'AAPL'),
            (  'High', 'AAPL'),
            (   'Low', 'AAPL'),
            (  'Open', 'AAPL'),
            ('Volume', 'AAPL')],
           names=['Price', 'Ticker'])


In [6]:
# Flatten the multi-level columns
df.columns = [col[0] for col in df.columns]

# Move the Date index into a column (optional but useful)
df = df.reset_index()

# Now confirm
print(df.head())


        Date       Close        High         Low        Open    Volume
0 2025-01-02  242.987411  248.218840  240.964594  248.049428  55740700
1 2025-01-03  242.499161  243.316252  241.034359  242.499161  40244100
2 2025-01-06  244.133347  246.455106  242.339711  243.445785  45045600
3 2025-01-07  241.353226  244.681407  240.496267  242.120491  40856000
4 2025-01-08  241.841492  242.847929  239.200872  241.064252  37628900


In [8]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)


In [9]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    155 non-null    datetime64[ns]
 1   Close   155 non-null    float64       
 2   High    155 non-null    float64       
 3   Low     155 non-null    float64       
 4   Open    155 non-null    float64       
 5   Volume  155 non-null    int64         
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 7.4 KB


In [3]:
import os
print("Current working directory:", os.getcwd())
print("Contents:", os.listdir())


Current working directory: c:\Users\Steph\Desktop\Projects\stock-price-movement-predictor\notebooks
Contents: ['datapreparation.ipynb', 'exploration.ipynb']


In [21]:
from src.dataload import load_stock_data

df = load_stock_data("AAPL", refresh=True)
print(df.shape)
df.head()


  df = yf.download(ticker, start=start_date, end=end_date, interval=interval)
[*********************100%***********************]  1 of 1 completed

Downloading fresh data...
Downloading AAPL data from 2025-01-01 to 2025-08-18...
✅ Data saved to C:\Users\Steph\Desktop\Projects\stock-price-movement-predictor\data\raw\AAPL_2025-01-01_2025-08-18_daily.csv
(155, 5)





Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-01-02,242.987411,248.21884,240.964594,248.049428,55740700
2025-01-03,242.499161,243.316252,241.034359,242.499161,40244100
2025-01-06,244.133347,246.455106,242.339711,243.445785,45045600
2025-01-07,241.353226,244.681407,240.496267,242.120491,40856000
2025-01-08,241.841492,242.847929,239.200872,241.064252,37628900


In [25]:
from src.dataload import load_stock_data

df = load_stock_data("AAPL", refresh=False)
print(df.shape)
print(df.dtypes)
print(df["Close"].head())
print(type(df["Close"].iloc[0]))


  df = yf.download(ticker, start=start_date, end=end_date, interval=interval)
[*********************100%***********************]  1 of 1 completed

Downloading fresh data...
Downloading AAPL data from 2025-01-01 to 2025-08-18...
✅ Data saved to C:\Users\Steph\Desktop\Projects\stock-price-movement-predictor\data\raw\AAPL_2025-01-01_2025-08-18_daily.csv
(155, 5)
Price   Ticker
Close   AAPL      float64
High    AAPL      float64
Low     AAPL      float64
Open    AAPL      float64
Volume  AAPL        int64
dtype: object
Ticker            AAPL
Date                  
2025-01-02  242.987411
2025-01-03  242.499161
2025-01-06  244.133347
2025-01-07  241.353226
2025-01-08  241.841492
<class 'pandas.core.series.Series'>





In [30]:
from src.dataload import load_stock_data
from src.featureengineering import add_technical_indicators, prepare_features
import pandas as pd
import numpy as np

# Load data
df = load_stock_data("AAPL", refresh=True)

# Debug the structure before processing
print("=== BEFORE PROCESSING ===")
print(f"df type: {type(df)}")
print(f"df columns: {df.columns}")
print(f"df['Close'] type: {type(df['Close'])}")
print(f"df['Close'] shape: {df['Close'].shape if hasattr(df['Close'], 'shape') else 'N/A'}")
print(f"df['Close'] head:\n{df['Close'].head()}")

# Test add_technical_indicators directly
print("\n=== TESTING add_technical_indicators ===")
try:
    df_with_indicators = add_technical_indicators(df)
    print("✅ add_technical_indicators succeeded!")
    
    # Now test prepare_features
    print("\n=== TESTING prepare_features ===")
    X, y = prepare_features(df_with_indicators)
    print(f"✅ prepare_features succeeded! X shape: {X.shape}, y shape: {y.shape}")
    
except Exception as e:
    print(f"❌ Error: {e}")
    print(f"Error type: {type(e)}")
    
    # Let's manually check what's happening
    print("\n=== MANUAL DEBUGGING ===")
    close_data = df["Close"]
    print(f"close_data type: {type(close_data)}")
    print(f"close_data values type: {type(close_data.values)}")
    print(f"close_data values shape: {close_data.values.shape}")
    
    # Create a proper 1D series
    close_1d = pd.Series(np.array(close_data).ravel(), index=df.index)
    print(f"close_1d type: {type(close_1d)}")
    print(f"close_1d shape: {close_1d.shape}")
    
    # Test ta library with the 1D series
    try:
        rsi = ta.momentum.RSIIndicator(close=close_1d, window=14).rsi()
        print("✅ ta library works with 1D series!")
    except Exception as e2:
        print(f"❌ ta library still fails: {e2}")

  df = yf.download(ticker, start=start_date, end=end_date, interval=interval)
[*********************100%***********************]  1 of 1 completed

Downloading fresh data...
Downloading AAPL data from 2025-01-01 to 2025-08-18...
✅ Data saved to C:\Users\Steph\Desktop\Projects\stock-price-movement-predictor\data\raw\AAPL_2025-01-01_2025-08-18_daily.csv
=== BEFORE PROCESSING ===
df type: <class 'pandas.core.frame.DataFrame'>
df columns: MultiIndex([( 'Close', 'AAPL'),
            (  'High', 'AAPL'),
            (   'Low', 'AAPL'),
            (  'Open', 'AAPL'),
            ('Volume', 'AAPL')],
           names=['Price', 'Ticker'])
df['Close'] type: <class 'pandas.core.frame.DataFrame'>
df['Close'] shape: (155, 1)
df['Close'] head:
Ticker            AAPL
Date                  
2025-01-02  242.987411
2025-01-03  242.499161
2025-01-06  244.133347
2025-01-07  241.353226
2025-01-08  241.841492

=== TESTING add_technical_indicators ===
❌ Error: Data must be 1-dimensional, got ndarray of shape (155, 1) instead
Error type: <class 'ValueError'>

=== MANUAL DEBUGGING ===
close_data type: <class 'pandas.core.frame.DataFrame'>
close_data value




In [33]:
# Quick check - print the current featureengineering.py file content
import inspect
from src.featureengineering import add_technical_indicators

# Print the function source to see what's actually running
print("=== CURRENT add_technical_indicators FUNCTION ===")
print(inspect.getsource(add_technical_indicators))

=== CURRENT add_technical_indicators FUNCTION ===
def add_technical_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds common technical indicators like SMA, EMA, RSI, MACD, and Bollinger Bands.
    """
    df = df.copy()
    
    # ✅ Ensure 'Close' is a 1D Series
    if isinstance(df["Close"], pd.DataFrame):
        df["Close"] = df["Close"].squeeze()
    
    # Create a guaranteed 1D close series
    close_values = np.array(df["Close"]).flatten()  # This will always be 1D
    close_series = pd.Series(close_values, index=df.index, name="Close")
    
    print(f"Close series shape: {close_series.shape}")
    print(f"Close series type: {type(close_series)}")

    # --- Moving Averages ---
    df["SMA_5"] = close_series.rolling(window=5).mean()
    df["SMA_10"] = close_series.rolling(window=10).mean()
    df["EMA_5"] = close_series.ewm(span=5, adjust=False).mean()
    df["EMA_10"] = close_series.ewm(span=10, adjust=False).mean()

    # --- Momentum Indicators ---
    # Use the

In [1]:
# Cell 1: Import and load data
from src.dataload import load_stock_data
import pandas as pd
import numpy as np
import ta

df = load_stock_data("AAPL", refresh=True)
print("Data loaded successfully")
print(f"df['Close'] type: {type(df['Close'])}")
print(f"df['Close'] shape: {df['Close'].shape}")

ModuleNotFoundError: No module named 'src'

In [1]:
from src.featureengineering import prepare_features

# Generate features and target
X, y, df_features = prepare_features(df)

# Quick checks
print("✅ Features and target created!")
print("X shape:", X.shape)
print("y shape:", y.shape)

# Preview first few rows
df_features.head()


ModuleNotFoundError: No module named 'src'

In [4]:
import sys
sys.path.append("../src")

from featureengineering import prepare_features

# Generate features and target
X, y, df_features = prepare_features(df)

print("✅ Features and target created!")
print("X shape:", X.shape)
print("y shape:", y.shape)

df_features.head()


NameError: name 'df' is not defined

In [3]:
import sys
sys.path.append("../src")

from featureengineering import prepare_features


In [6]:
import sys
sys.path.append("../src")

from dataload import load_stock_data
from featureengineering import prepare_features

# Load data
df = load_stock_data("AAPL", refresh=False)

# Generate features and target
X, y, df_features = prepare_features(df)

print("✅ Features and target created!")
print("X shape:", X.shape)
print("y shape:", y.shape)
df_features.head()


Downloading fresh data...
Downloading AAPL data from 2025-01-01 to 2025-08-18...


  df = yf.download(ticker, start=start_date, end=end_date, interval=interval)
[*********************100%***********************]  1 of 1 completed

✅ Data saved to C:\Users\Steph\Desktop\Projects\stock-price-movement-predictor\data\raw\AAPL_2025-01-01_2025-08-18_daily.csv
Close series shape: (155,)
Close series type: <class 'pandas.core.series.Series'>





ValueError: not enough values to unpack (expected 3, got 2)

In [7]:
import sys
sys.path.append("../src")

from dataload import load_stock_data
from featureengineering import prepare_features

# Load the stock data
df = load_stock_data("AAPL", refresh=False)

# Generate features and target
X, y = prepare_features(df)

print("✅ Features and target created!")
print("X shape:", X.shape)
print("y shape:", y.shape)

# Display first few rows of the features
X.head()


  df = yf.download(ticker, start=start_date, end=end_date, interval=interval)
[*********************100%***********************]  1 of 1 completed

Downloading fresh data...
Downloading AAPL data from 2025-01-01 to 2025-08-18...
✅ Data saved to C:\Users\Steph\Desktop\Projects\stock-price-movement-predictor\data\raw\AAPL_2025-01-01_2025-08-18_daily.csv
Close series shape: (155,)
Close series type: <class 'pandas.core.series.Series'>
✅ Features and target created!
X shape: (121, 15)
y shape: (121,)





Price,Close,High,Low,Open,Volume,SMA_5,SMA_10,EMA_5,EMA_10,RSI_14,MACD,MACD_signal,Bollinger_high,Bollinger_low,Return
0,244.950424,248.082756,244.621228,245.349441,53197400,244.46561,238.553081,243.449033,240.469899,62.711281,2.479947,0.615302,249.059241,221.70451,-0.001139
1,246.496643,248.25234,243.823179,244.331928,51326400,244.964389,240.520267,244.464903,241.565671,64.483829,2.846882,1.061618,249.642259,223.571962,0.006312
2,246.436783,249.389562,244.311994,247.394445,48013300,245.477133,242.454533,245.122196,242.451328,64.356284,3.097148,1.468724,250.6972,224.256008,-0.000243
3,239.773087,244.381801,238.546094,243.733394,44433600,244.577335,243.226642,243.33916,241.964375,52.020917,2.726354,1.72025,250.852707,224.336092,-0.02704
4,236.720566,241.86797,236.481146,238.825414,41153600,242.8755,243.269537,241.132962,241.010955,47.526956,2.16127,1.808454,250.761249,224.248277,-0.012731


In [8]:
import os

# Ensure the processed folder exists
os.makedirs("../data/processed", exist_ok=True)

# Save the processed dataset
df_features.to_csv("../data/processed/AAPL_processed.csv", index=False)

print("✅ Processed data saved to data/processed/AAPL_processed.csv")


NameError: name 'df_features' is not defined

In [9]:
import os
import pandas as pd

# Combine X and y into one DataFrame
df_features = X.copy()
df_features["Target"] = y

# Ensure processed directory exists
os.makedirs("../data/processed", exist_ok=True)

# Save to CSV
df_features.to_csv("../data/processed/AAPL_processed.csv", index=False)

print("✅ Processed data saved to data/processed/AAPL_processed.csv")


✅ Processed data saved to data/processed/AAPL_processed.csv
