In [2]:
import pandas as pd
import os

INPUT_CSV = "../Data/EDA_transformed.csv"
OUTPUT_DIR = "Stock_Preprocessed"


def preprocess_stock(stock_df):
    """Return one stock's rows as a gap-free business-day time series.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Rows belonging to a single stock. Must contain a datetime ``Date``
        column. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``Date`` at business-day ('B') frequency.
        Numeric columns are linearly interpolated across inserted days;
        every column is then forward-filled so that non-numeric columns
        (e.g. the stock name) are populated on inserted days as well.
    """
    # A stable sort keeps same-date rows in file order, so the duplicate
    # that survives drop_duplicates (keep='first') is deterministic.
    stock_df = stock_df.sort_values('Date', kind='mergesort')

    # asfreq requires a unique index.
    stock_df = stock_df.drop_duplicates(subset='Date').set_index('Date')

    # Reindex onto the business-day grid between the first and last date.
    # Missing business days become all-NaN rows; rows whose date is not a
    # business day (e.g. weekends) are not part of the grid and are dropped.
    stock_df = stock_df.asfreq('B')

    # Interpolate all numeric columns.
    numeric_cols = stock_df.select_dtypes(include='number').columns
    stock_df[numeric_cols] = stock_df[numeric_cols].interpolate(method='linear')

    # Forward-fill whatever is still missing in *every* column. Filling only
    # the numeric columns would leave text columns such as 'Stocks' empty on
    # each inserted business day.
    # NOTE: NaNs that precede a column's first valid value are left as-is;
    # back-filling them would leak future values into the past.
    return stock_df.ffill()


# Run the pipeline only when executed as a script / notebook cell, so that
# importing this module (e.g. to reuse preprocess_stock) performs no I/O.
if __name__ == "__main__":
    df = pd.read_csv(INPUT_CSV)

    # Convert 'Date' to datetime
    df['Date'] = pd.to_datetime(df['Date'])

    # Remove fully identical rows
    df = df.drop_duplicates()

    # Create output folder
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Preprocess each stock separately. groupby splits the frame in a single
    # pass, keeps first-seen order with sort=False, and skips rows whose
    # 'Stocks' value is missing (which would otherwise yield an empty
    # "nan_cleaned.csv").
    for stock, stock_df in df.groupby('Stocks', sort=False):
        stock_df = preprocess_stock(stock_df)

        # Save cleaned stock data
        stock_df.to_csv(os.path.join(OUTPUT_DIR, f"{stock}_cleaned.csv"))

    print(f"✅ Preprocessing done. SARIMA-ready files saved in {OUTPUT_DIR}/")

✅ Preprocessing done. SARIMA-ready files saved in Stock_Preprocessed/
