# Financial Time Series Forecasting: ARIMA, SARIMA, Holt-Winters, Prophet, LSTM

## Fixed Version - Compatible with All Scikit-learn Versions

This notebook uses your local CSV file directly:
- File: `/Users/kushal/Downloads/Major_project-main/Accurcy/expenses_income_summary.csv`
- Compares 5 forecasting algorithms
- Shows accuracy metrics and winner


In [None]:
# Optional one-time dependency install. Uncomment the line below only if the
# packages are missing from the environment the kernel runs in.
# NOTE(review): `%pip install ...` targets the running kernel's environment
# more reliably than `!pip`, and the versions here are unpinned -- consider
# pinning them so the comparison is reproducible.
# !pip install -q numpy pandas matplotlib seaborn scikit-learn statsmodels prophet tensorflow

# Progress banner printed when the notebook starts executing.
print("üì¶ Starting analysis...")

In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

# Classical models
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Prophet
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except Exception as e:
    Prophet = None
    PROPHET_AVAILABLE = False
    print('Prophet not available')

# TensorFlow for LSTM
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    TENSORFLOW_AVAILABLE = True
except Exception as e:
    tf = None
    TENSORFLOW_AVAILABLE = False
    print('TensorFlow not available')

print(f'‚úÖ Prophet available: {PROPHET_AVAILABLE}')
print(f'‚úÖ TensorFlow available: {TENSORFLOW_AVAILABLE}')

## Load CSV File Directly

In [None]:
# Notebook configuration: the values a reader is expected to tune.
#
# CSV_PATH    - location of the transactions CSV. It can be overridden without
#               editing the notebook by setting the EXPENSES_CSV_PATH
#               environment variable; when that variable is absent the original
#               absolute path is used, so existing runs behave as before.
# TARGET_TYPE - which value of the CSV's `type` column to forecast.
# TEST_RATIO  - fraction of the daily series held out as the test period.
CSV_PATH = os.environ.get(
    'EXPENSES_CSV_PATH',
    '/Users/kushal/Downloads/Major_project-main/Accurcy/expenses_income_summary.csv',
)
TARGET_TYPE = 'EXPENSE'  # Change to 'INCOME' if needed
TEST_RATIO = 0.2  # 20% for testing

def load_data(csv_path):
    """Read the transactions CSV and return it with typed Date/amount columns.

    Steps, in order:
      1. fail fast if the file does not exist;
      2. read the CSV and print its raw shape and column names;
      3. require the columns 'Date', 'amount' and 'type';
      4. parse 'Date' (unparseable values become NaT) and drop those rows;
      5. strip thousands-separator commas from 'amount' and cast to float.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        All original columns; rows with an unparseable date removed; the
        original row index is kept (it is not reset).

    Raises
    ------
    FileNotFoundError
        If ``csv_path`` does not exist.
    ValueError
        If a required column is missing, or an amount cannot be converted
        to float.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f'‚ùå CSV file not found: {csv_path}')

    frame = pd.read_csv(csv_path)
    print(f'üìä Raw data shape: {frame.shape}')
    print(f'üìä Columns: {list(frame.columns)}')

    # Ensure required columns exist before touching any of them.
    required = {'Date', 'amount', 'type'}
    missing = required.difference(frame.columns)
    if missing:
        raise ValueError(f'‚ùå Missing columns in CSV: {missing}')

    # Parse dates leniently, then discard the rows that could not be parsed.
    with_dates = frame.assign(Date=pd.to_datetime(frame['Date'], errors='coerce'))
    dated_rows = with_dates.dropna(subset=['Date'])

    # Amounts may arrive as text such as "1,200.50": remove commas, then cast.
    amount_text = dated_rows['amount'].astype(str).str.replace(',', '', regex=False)
    return dated_rows.assign(amount=amount_text.astype(float))

# Load the data
# `df_raw` holds every transaction (all types) with parsed dates and float
# amounts; later cells filter it by TARGET_TYPE.
print(f'üìÅ Loading CSV from: {CSV_PATH}')
df_raw = load_data(CSV_PATH)

# Quick sanity summary: covered period, distinct `type` values, row count.
print(f'\nüìà Date range: {df_raw["Date"].min()} to {df_raw["Date"].max()}')
print(f'üìà Types available: {df_raw["type"].unique()}')
print(f'üìà Total transactions: {len(df_raw)}')

# Last expression of the cell: rich preview of the first rows.
df_raw.head()

## Prepare Daily Series

In [None]:
# Filter by target type and aggregate daily
# Only the two type labels below are supported by this notebook.
assert TARGET_TYPE in {'EXPENSE','INCOME'}
# .copy() detaches the filtered rows from df_raw so df_raw stays untouched.
df = df_raw[df_raw['type'] == TARGET_TYPE].copy()
df = df[['Date','amount']].sort_values('Date')

# Aggregate to daily totals
# Resampling to calendar days ('D') produces one row per day between the first
# and last transaction; fillna(0.0) guards against any missing daily total.
daily = (df.set_index('Date')['amount']
         .resample('D').sum().fillna(0.0))
# The series is named after the target type; the Prophet cell relies on this
# name when renaming columns.
daily.name = TARGET_TYPE

# Summary statistics of the aggregated series, including the number of
# zero-amount days.
print(f'üìä Daily series created:')
print(f'üìä Date range: {daily.index.min()} ‚Üí {daily.index.max()}')
print(f'üìä Total days: {len(daily)}')
print(f'üìä Total {TARGET_TYPE}: ‚Çπ{daily.sum():.2f}')
print(f'üìä Average daily {TARGET_TYPE}: ‚Çπ{daily.mean():.2f}')
print(f'üìä Days with zero {TARGET_TYPE}: {(daily == 0).sum()}')

# Plot the series
plt.figure(figsize=(15, 5))
plt.plot(daily.index, daily.values, alpha=0.8)
plt.title(f'Daily {TARGET_TYPE} Time Series')
plt.ylabel('Amount (‚Çπ)')
plt.xlabel('Date')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Last expression of the cell: show the most recent ten daily totals.
print('\nüìä Last 10 days:')
daily.tail(10)

## Train/Test Split

In [None]:
def chrono_train_test_split(series, test_ratio=0.2):
    """Split a time-ordered series into a leading train part and trailing test part.

    No shuffling is performed: the last ``int(len(series) * test_ratio)``
    observations (at least one) form the test set and everything before them
    forms the training set.

    Parameters
    ----------
    series : pandas.Series
        Observations in chronological order.
    test_ratio : float, default 0.2
        Fraction of observations to hold out at the end.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(train, test)``.
    """
    total = len(series)
    holdout = max(1, int(total * test_ratio))
    # Position of the first test observation; clamped at 0 so a holdout that
    # is at least as long as the series leaves an empty training set.
    split_at = max(total - holdout, 0)
    return series.iloc[:split_at], series.iloc[split_at:]

# Hold out the final TEST_RATIO share of the daily series as the test period.
# Every model cell below fits on y_train and is scored against y_test.
y_train, y_test = chrono_train_test_split(daily, TEST_RATIO)

# Report the size and date span of each part.
print(f'üîÑ Train/Test Split:')
print(f'üîÑ Train size: {len(y_train)} days')
print(f'üîÑ Test size: {len(y_test)} days')
print(f'üîÑ Train period: {y_train.index.min()} to {y_train.index.max()}')
print(f'üîÑ Test period: {y_test.index.min()} to {y_test.index.max()}')

## Evaluation Metrics (FIXED VERSION)

In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent, over non-zero actuals.

    A percentage error is undefined where the actual value is 0. Substituting
    a tiny denominator for those points (the previous behaviour) multiplies
    their error by 1e8, so a single zero-amount day dominates the whole
    metric. Zero actuals are therefore excluded from the average.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcastable to the shape of ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the points where
        ``y_true != 0``, or ``nan`` when every actual value is zero (or the
        input is empty).
    """
    actual, predicted = np.broadcast_arrays(
        np.asarray(y_true, dtype=float),
        np.asarray(y_pred, dtype=float),
    )
    usable = actual != 0
    if not usable.any():
        # No point has a defined percentage error.
        return np.nan
    relative_error = np.abs((actual[usable] - predicted[usable]) / actual[usable])
    return np.mean(relative_error) * 100

def evaluate(y_true, y_pred, model_name='Model'):
    """Score one forecast, print a one-line summary and return the metrics.

    Parameters
    ----------
    y_true : array-like
        Observed values for the test period.
    y_pred : array-like
        Forecast values for the same period.
    model_name : str, default 'Model'
        Label used in the printed summary line.

    Returns
    -------
    dict
        Keys 'MAE', 'RMSE' and 'MAPE' (MAPE comes from this notebook's
        ``mape`` helper).
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as the square root of the plain MSE instead of passing a
    # `squared` argument to mean_squared_error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mp = mape(y_true, y_pred)

    summary_line = f'üìà {model_name} - MAE: ‚Çπ{mae:.2f}, RMSE: ‚Çπ{rmse:.2f}, MAPE: {mp:.2f}%'
    print(summary_line)
    return dict(MAE=mae, RMSE=rmse, MAPE=mp)

print('‚úÖ Evaluation functions ready!')

## 1️⃣ ARIMA Model

In [None]:
print('üîÑ Training ARIMA model...')
arima_forecast = None
try:
    arima_model = ARIMA(y_train, order=(1,1,1))
    arima_fit = arima_model.fit()
    arima_forecast = arima_fit.forecast(steps=len(y_test))
    arima_metrics = evaluate(y_test.values, arima_forecast, 'ARIMA(1,1,1)')
    print('‚úÖ ARIMA completed!')
except Exception as e:
    print(f'‚ùå ARIMA failed: {e}')
    arima_metrics = {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan}

## 2️⃣ SARIMA Model

In [None]:
print('üîÑ Training SARIMA model...')
sarima_forecast = None
try:
    sarima_model = SARIMAX(y_train, order=(1,1,1), seasonal_order=(1,1,1,7))
    sarima_fit = sarima_model.fit(disp=False)
    sarima_forecast = sarima_fit.forecast(steps=len(y_test))
    sarima_metrics = evaluate(y_test.values, sarima_forecast, 'SARIMA(1,1,1)(1,1,1,7)')
    print('‚úÖ SARIMA completed!')
except Exception as e:
    print(f'‚ùå SARIMA failed: {e}')
    sarima_metrics = {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan}

## 3️⃣ Holt-Winters Model

In [None]:
print('üîÑ Training Holt-Winters model...')
hw_forecast = None
try:
    hw_model = ExponentialSmoothing(y_train, trend='add', seasonal='add', seasonal_periods=7)
    hw_fit = hw_model.fit(optimized=True)
    hw_forecast = hw_fit.forecast(len(y_test))
    hw_metrics = evaluate(y_test.values, hw_forecast.values, 'Holt-Winters')
    print('‚úÖ Holt-Winters completed!')
except Exception as e:
    try:
        # Fallback without seasonality
        hw_model = ExponentialSmoothing(y_train, trend='add')
        hw_fit = hw_model.fit(optimized=True)
        hw_forecast = hw_fit.forecast(len(y_test))
        hw_metrics = evaluate(y_test.values, hw_forecast.values, 'Holt-Winters (no seasonal)')
        print('‚úÖ Holt-Winters (simplified) completed!')
    except Exception as e2:
        print(f'‚ùå Holt-Winters failed: {e2}')
        hw_metrics = {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan}

## 4️⃣ Prophet Model

In [None]:
print('üîÑ Training Prophet model...')
prophet_forecast = None
if not PROPHET_AVAILABLE:
    print('‚ùå Prophet not available')
    prophet_metrics = {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan}
else:
    try:
        df_p = y_train.reset_index().rename(columns={'Date':'ds', TARGET_TYPE:'y'})
        m = Prophet(daily_seasonality=True, weekly_seasonality=True, yearly_seasonality=False)
        m.fit(df_p)
        future = pd.DataFrame({'ds': y_test.index})
        fc = m.predict(future)
        prophet_forecast = fc['yhat'].values
        prophet_metrics = evaluate(y_test.values, prophet_forecast, 'Prophet')
        print('‚úÖ Prophet completed!')
    except Exception as e:
        print(f'‚ùå Prophet failed: {e}')
        prophet_metrics = {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan}

## 5️⃣ LSTM Model

In [None]:
print('üîÑ Training LSTM model...')
lstm_forecast = None
if not TENSORFLOW_AVAILABLE:
    print('‚ùå TensorFlow not available')
    lstm_metrics = {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan}
else:
    try:
        # Scale data
        scaler = MinMaxScaler()
        y_train_vals = y_train.values.reshape(-1,1)
        scaler.fit(y_train_vals)
        y_train_scaled = scaler.transform(y_train_vals)
        
        # Create sequences
        WINDOW = min(14, len(y_train_scaled) // 2)
        def make_sequences(arr, window):
            X, y = [], []
            for i in range(window, len(arr)):
                X.append(arr[i-window:i, 0])
                y.append(arr[i, 0])
            return np.array(X), np.array(y)
        
        X_tr, y_tr = make_sequences(y_train_scaled, WINDOW)
        if len(X_tr) < 5:
            raise RuntimeError('Not enough data for LSTM')
        
        X_tr = X_tr.reshape((X_tr.shape[0], X_tr.shape[1], 1))
        
        # Build model
        model = keras.Sequential([
            layers.Input(shape=(WINDOW,1)),
            layers.LSTM(32),
            layers.Dense(16, activation='relu'),
            layers.Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse')
        model.fit(X_tr, y_tr, epochs=50, batch_size=min(16, len(X_tr)), verbose=0)
        
        # Generate forecasts
        hist = list(y_train_scaled[-WINDOW:, 0])
        preds = []
        for _ in range(len(y_test)):
            x = np.array(hist[-WINDOW:]).reshape((1, WINDOW, 1))
            p = model.predict(x, verbose=0)[0,0]
            preds.append(p)
            hist.append(p)
        
        lstm_forecast = scaler.inverse_transform(np.array(preds).reshape(-1,1)).ravel()
        lstm_metrics = evaluate(y_test.values, lstm_forecast, 'LSTM')
        print('‚úÖ LSTM completed!')
    except Exception as e:
        print(f'‚ùå LSTM failed: {e}')
        lstm_metrics = {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan}

## 🏆 FINAL RESULTS

In [None]:
# Collect each model's metrics into one table, ranked by RMSE (lower is
# better). Models that failed carry NaN metrics and sort to the bottom.
results = [
    {'Model': label, **metrics}
    for label, metrics in (
        ('ARIMA', arima_metrics),
        ('SARIMA', sarima_metrics),
        ('Holt-Winters', hw_metrics),
        ('Prophet', prophet_metrics),
        ('LSTM', lstm_metrics),
    )
]

res_df = pd.DataFrame(results)
res_df_sorted = res_df.sort_values('RMSE')

banner = "=" * 80
print("\n" + banner)
print("üèÜ ALGORITHM ACCURACY COMPARISON RESULTS")
print(banner)
print(res_df_sorted.to_string(index=False, float_format='%.2f'))
print(banner)

# The winner is the first row that has a real (non-NaN) RMSE.
valid_results = res_df_sorted.dropna(subset=['RMSE'])
if valid_results.empty:
    print("‚ùå No valid results found")
else:
    top_row = valid_results.iloc[0]
    best_model = top_row['Model']
    best_rmse = top_row['RMSE']
    best_mae = top_row['MAE']
    best_mape = top_row['MAPE']

    print(f"\nü•á WINNER: {best_model}")
    print(f"üìä RMSE: ‚Çπ{best_rmse:.2f} (Root Mean Square Error)")
    print(f"üìä MAE: ‚Çπ{best_mae:.2f} (Mean Absolute Error)")
    print(f"üìä MAPE: {best_mape:.2f}% (Mean Absolute Percentage Error)")
    print(f"\nüí° {best_model} is the most accurate algorithm for your {TARGET_TYPE} forecasting!")
    print(f"\nüöÄ Use {best_model} in your Flask application for best results!")

# Last expression of the cell: rich display of the ranked table.
res_df_sorted

## 📊 Forecast Visualization

In [None]:
# One entry per model: (legend label, forecast or None, marker used in the
# zoomed plot, extra keyword arguments for the zoomed plot). A forecast of
# None means the model was skipped or failed and is left out of both figures.
forecast_specs = [
    ('ARIMA', arima_forecast, 's', {}),
    ('SARIMA', sarima_forecast, '^', {}),
    ('Holt-Winters', hw_forecast, 'd', {}),
    ('Prophet', prophet_forecast, 'v', {}),
    ('LSTM', lstm_forecast, '*', {'markersize': 10}),
]

# Main forecast plot: training history, actual test values, dashed forecasts.
plt.figure(figsize=(16,8))
plt.plot(y_train.index, y_train.values, label='Train Data', color='blue', alpha=0.6)
plt.plot(y_test.index, y_test.values, label='Actual Test', color='black', linewidth=3, marker='o')

for spec_label, spec_forecast, _spec_marker, _spec_extra in forecast_specs:
    if spec_forecast is None:
        continue
    plt.plot(y_test.index, spec_forecast, label=spec_label, linestyle='--', alpha=0.8, linewidth=2)

plt.title(f'{TARGET_TYPE} Forecasting: All Models Comparison', fontsize=16)
plt.ylabel('Amount (‚Çπ)', fontsize=12)
plt.xlabel('Date', fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Zoomed plot - test period only, one marker shape per model.
plt.figure(figsize=(14,6))
plt.plot(y_test.index, y_test.values, label='Actual', color='black', linewidth=4, marker='o', markersize=8)

for spec_label, spec_forecast, spec_marker, spec_extra in forecast_specs:
    if spec_forecast is None:
        continue
    plt.plot(y_test.index, spec_forecast, label=spec_label, marker=spec_marker, alpha=0.8, linewidth=2, **spec_extra)

plt.title(f'{TARGET_TYPE} Forecasts - Test Period Detail', fontsize=16)
plt.ylabel('Amount (‚Çπ)', fontsize=12)
plt.xlabel('Date', fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# best_model only exists when the results cell found at least one valid model.
if 'best_model' in locals():
    print(f"\n‚úÖ Analysis complete! Best algorithm: {best_model} with RMSE: ‚Çπ{best_rmse:.2f}")
    print(f"üí° Implement {best_model} in your Flask forecast.py route!")