In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

In [32]:
# 1. LOAD DATA
print("Loading data...")
df = pd.read_csv("stock_data_final.csv")

if 'Date' not in df.columns:
    if 'Date' in df.index.names:
        df = df.reset_index()
    else:
        # If created by yfinance, the Date is likely the first column but unnamed
        df.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)

# Force Sort - Non-negotiable for Time Series
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by=['Ticker', 'Date'])
else:
    print("‚ö†Ô∏è WARNING: Could not find Date column. Assuming data is already sorted (Risky).")

    # Check if any feature is suspiciously perfect at predicting the target
print("üîç LEAKAGE AUDIT: Checking correlations with Target...")
# We use numeric_only to avoid errors if Date/Ticker are present
correlations = df.corr(numeric_only=True)["Target"].sort_values(ascending=False)
print(correlations.head(5))

if correlations[1] > 0.8: # Index 0 is Target itself
    print("‚ö†Ô∏è WARNING: High correlation detected! Possible Data Leakage.")
else:
    print("‚úÖ Audit Passed: No suspiciously high correlations found.")

# --- 2. REFINED PREDICTORS (Fixing Critique #2) ---
# We removed raw 'SMA_50' and 'SMA_200'. 
# We kept 'Dist_SMA...' because Ratios generalize better than Prices.
predictors = [
    "RSI", "MACD", "MACD_Signal", 
    "Dist_SMA_50", "Dist_SMA_200", 
    "Ret_1d", "Ret_5d", "Day_Range"
]


Loading data...
üîç LEAKAGE AUDIT: Checking correlations with Target...
Target       1.000000
Day_Range    0.044910
Volume       0.013627
RSI          0.003401
Ret_1d       0.001768
Name: Target, dtype: float64
‚úÖ Audit Passed: No suspiciously high correlations found.


  if correlations[1] > 0.8: # Index 0 is Target itself


In [33]:
# 3. CONFIGURE THE MODEL
# n_estimators=200: 200 Decision Trees (Robustness)
# min_samples_split=50:  High number prevents overfitting (Won't learn noise)
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, max_depth=10, 
                               min_samples_leaf=4, class_weight="balanced_subsample", 
                               random_state=1)

In [37]:
# --- 4. BACKTEST WITH BENCHMARK & DRAWDOWN  ---
def backtest_weekly(data, model, predictors, start=150, step=12):
    all_predictions = []
    
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+step)].copy()
        
        model.fit(train[predictors], train["Target"])
        
        # Predictions
        preds_proba = model.predict_proba(test[predictors])[:,1]
        preds_custom = (preds_proba >= 0.55).astype(int)
        
        combined = pd.DataFrame({
            "Date": test["Date"],
            "Target": test["Target"],
            "Prediction": preds_custom,
            "Actual_Return": test["Future_Close"] / test["Close"], # For Strategy
            "Benchmark_Return": test["Future_Close"] / test["Close"] # For Buy & Hold
        }, index=test.index)
        
        all_predictions.append(combined)
    
    return pd.concat(all_predictions)

In [38]:
def calculate_max_drawdown(returns_series):
    # Calculates the biggest drop from a peak
    cumulative = returns_series.cumprod()
    peak = cumulative.cummax()
    drawdown = (cumulative - peak) / peak
    return drawdown.min()

In [39]:
# --- 5. EXECUTION ---
print("\nüöÄ Running Advanced Backtest...")

tickers = df["Ticker"].unique()
metrics = []

for ticker in tickers:
    ticker_data = df[df["Ticker"] == ticker].copy().reset_index(drop=True)
    
    # We only want to trade/predict once every 5 days to avoid overlap confusion
    ticker_data = ticker_data.iloc[::5, :]
    
    if len(ticker_data) > 200:
        predictions = backtest_weekly(ticker_data, model, predictors)
        
        # --- STRATEGY PERFORMANCE ---
        # Logic: If pred=1, we get Actual Return. If pred=0, we stay in Cash (Return=1.0)
        predictions["Strategy_Return"] = np.where(predictions["Prediction"] == 1, predictions["Actual_Return"], 1.0)
        
        # Calculate Cumulative Returns
        strategy_cum = predictions["Strategy_Return"].cumprod()
        benchmark_cum = predictions["Benchmark_Return"].cumprod()
        
        total_return = strategy_cum.iloc[-1]
        benchmark_return = benchmark_cum.iloc[-1]
        
        # --- DRAWDOWN & METRICS ---
        max_dd = calculate_max_drawdown(predictions["Strategy_Return"])
        precision = precision_score(predictions["Target"], predictions["Prediction"], zero_division=0)
        trades = predictions["Prediction"].sum()
        
        print(f"\nüîπ {ticker}")
        print(f"   Win Rate: {precision:.2%}")
        print(f"   Trades: {trades}")
        print(f"   Strategy Return: {total_return:.2f}x")
        print(f"   Benchmark (Buy&Hold): {benchmark_return:.2f}x")
        print(f"   Max Drawdown: {max_dd:.2%}")
        
        metrics.append({
            "Ticker": ticker,
            "Win Rate": precision,
            "Strategy Return": total_return,
            "Benchmark Return": benchmark_return,
            "Max Drawdown": max_dd
        })


üöÄ Running Advanced Backtest...

üîπ HCLTECH.NS
   Win Rate: 50.00%
   Trades: 24
   Strategy Return: 1.22x
   Benchmark (Buy&Hold): 1.44x
   Max Drawdown: -8.41%

üîπ HDFCBANK.NS
   Win Rate: 25.00%
   Trades: 12
   Strategy Return: 1.08x
   Benchmark (Buy&Hold): 1.30x
   Max Drawdown: -3.01%

üîπ INFY.NS
   Win Rate: 47.37%
   Trades: 19
   Strategy Return: 1.14x
   Benchmark (Buy&Hold): 1.30x
   Max Drawdown: -14.66%

üîπ RELIANCE.NS
   Win Rate: 54.55%
   Trades: 11
   Strategy Return: 1.16x
   Benchmark (Buy&Hold): 1.30x
   Max Drawdown: -5.54%

üîπ SBIN.NS
   Win Rate: 18.75%
   Trades: 16
   Strategy Return: 0.95x
   Benchmark (Buy&Hold): 1.93x
   Max Drawdown: -12.44%

üîπ TCS.NS
   Win Rate: 30.00%
   Trades: 30
   Strategy Return: 0.87x
   Benchmark (Buy&Hold): 1.01x
   Max Drawdown: -23.97%


In [40]:
# --- 6. FINAL REPORT ---
if metrics:
    metrics_df = pd.DataFrame(metrics)
    print("\nüìä FINAL AUDIT REPORT")
    print(metrics_df[["Ticker", "Win Rate", "Strategy Return", "Benchmark Return", "Max Drawdown"]])
    
    avg_strat = metrics_df["Strategy Return"].mean()
    avg_bench = metrics_df["Benchmark Return"].mean()
    
    print(f"\nüèÜ Average Strategy Return: {avg_strat:.2f}x")
    print(f"üìâ Average Benchmark Return: {avg_bench:.2f}x")
    
    if avg_strat > avg_bench:
        print("‚úÖ SUCCESS: Model beats the market on average!")
    else:
        print("‚ö†Ô∏è REALITY CHECK: Model underperforms Buy & Hold.")


üìä FINAL AUDIT REPORT
        Ticker  Win Rate  Strategy Return  Benchmark Return  Max Drawdown
0   HCLTECH.NS  0.500000         1.222977          1.439027     -0.084147
1  HDFCBANK.NS  0.250000         1.082486          1.302212     -0.030124
2      INFY.NS  0.473684         1.143042          1.300268     -0.146602
3  RELIANCE.NS  0.545455         1.157393          1.296399     -0.055426
4      SBIN.NS  0.187500         0.952757          1.926506     -0.124393
5       TCS.NS  0.300000         0.873329          1.009174     -0.239735

üèÜ Average Strategy Return: 1.07x
üìâ Average Benchmark Return: 1.38x
‚ö†Ô∏è REALITY CHECK: Model underperforms Buy & Hold.


In [41]:
import joblib

# 1. RETRAIN ON ALL DATA (To make it as smart as possible for the demo)
# We use the Aggressive Settings that worked best (55% on Reliance)
full_model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    max_depth=None,
    min_samples_leaf=2,
    class_weight="balanced_subsample", 
    random_state=1
)

# We use all available data to train
full_model.fit(df[predictors], df["Target"])

# 2. SAVE THE MODEL
joblib.dump(full_model, "random_forest_model.pkl")
print("‚úÖ Model saved as 'random_forest_model.pkl'")

‚úÖ Model saved as 'random_forest_model.pkl'
