<a href="https://colab.research.google.com/github/MaheshUmale/COLAB_FILES/blob/main/Colab_Trading_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import re
import time
import zipfile
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- COLAB SETUP & DATA DOWNLOAD ---
# Data URL provided by the user
DATA_URL = "https://storage.googleapis.com/kaggle-data-sets/2575525/12691112/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20251001%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20251001T235419Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=0f9c7fd81c9801a691e9bed7879114c4c122987a0ed10445403f0dae54d50d333adaa1d0bf0af42b95facb7860a6d3d61a866409b2574ba9223c3c5ae97aea974508e75e9414c6ab9f021e4e2d7e521481da9595f4a2b30c0d6b6dd48a8e07d357386ae3594335aa2255f2c7862398e7d11086c46cca0e9698bdcbca9ae6b0a583f076699989cfbaef2e5486873e97a01f029f8545f502354ce75139f01362260f54831e820c7e67846d4800d69092b482c9a8992f1b509c99d4774c8e419c8c97b61b25f0ae591881ee45cd4e2b55779ba8ad98d7fd2a9cf3619926937851939e0dad53"

def setup_colab_environment(url, zip_name="archive.zip", data_dir="/content/NSE_STOCK_DATA"):
    """Downloads data, unzips it, and creates the target directory structure."""

    # Use shell command to download the file directly in Colab
    print(f"Downloading data from URL...")
    os.system(f"wget -O {zip_name} \"{url}\"")

    # Create the directory to extract data into
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # Unzip the file
    print(f"Unzipping data into {data_dir}...")
    try:
        # Note: The ZIP structure might contain a subfolder,
        # so we extract all to the current directory and move them.
        with zipfile.ZipFile(zip_name, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print("Download and extraction complete.")
    except Exception as e:
        print(f"Error during unzipping: {e}")
        return None

    # Search for files in the extracted directory to confirm path
    # Colab path: /content/NSE_STOCK_DATA
    # If the zip extracted to a subdirectory (e.g., /content/NSE_STOCK_DATA/bundle/archive)
    # we need to adjust the path to the actual location of the CSV files.

    # Simple check for the existence of files like '*_minute.csv'
    if not any(f.endswith('_minute.csv') for f in os.listdir(data_dir)):
        # If no files found, try traversing one level deeper (common Kaggle/Google Storage behavior)
        print("CSV files not found directly in the target folder. Searching deeper...")
        for root, dirs, files in os.walk(data_dir):
            if any(f.endswith('_minute.csv') for f in files):
                data_dir = root
                print(f"Found CSV files in new directory: {data_dir}")
                break

    return data_dir

# --- START: DATA GENERATION LOGIC (Combined from generate_dataset_optimized.py) ---

def calculate_rsi(df, window=14):
    """Calculates the Relative Strength Index (RSI)."""
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = np.where(loss == 0, np.inf, gain / loss)
    df['rsi'] = 100 - (100 / (1 + RS))
    return df

def calculate_adx(df, window=14):
    """Calculates the Average Directional Index (ADX)."""
    df['high_low'] = df['high'] - df['low']
    df['high_prev_close'] = abs(df['high'] - df['close'].shift(1))
    df['low_prev_close'] = abs(df['low'] - df['close'].shift(1))
    df['tr'] = df[['high_low', 'high_prev_close', 'low_prev_close']].max(axis=1)

    df['plus_dm'] = np.where((df['high'] > df['high'].shift(1)) & (df['high'] - df['high'].shift(1) > df['low'].shift(1) - df['low']), df['high'] - df['high'].shift(1), 0)
    df['minus_dm'] = np.where((df['low'].shift(1) > df['low']) & (df['low'].shift(1) - df['low'] > df['high'] - df['high'].shift(1)), df['low'].shift(1) - df['low'], 0)

    temp_atr = df['tr'].ewm(alpha=1/window, adjust=False).mean()
    df['plus_di'] = (df['plus_dm'].ewm(alpha=1/window, adjust=False).mean() / temp_atr) * 100
    df['minus_di'] = (df['minus_dm'].ewm(alpha=1/window, adjust=False).mean() / temp_atr) * 100

    df['dx'] = abs(df['plus_di'] - df['minus_di']) / (df['plus_di'] + df['minus_di']) * 100
    df['adx'] = df['dx'].ewm(alpha=1/window, adjust=False).mean()

    df.drop(columns=['high_low', 'high_prev_close', 'low_prev_close', 'tr', 'plus_dm', 'minus_dm', 'plus_di', 'minus_di', 'dx'], inplace=True, errors='ignore')
    return df

def calculate_vwap(df):
    """Calculates the Volume Weighted Average Price (VWAP) for the entire dataset."""
    df['typical_price'] = (df['high'] + df['low'] + df['close']) / 3
    df['tpv'] = df['typical_price'] * df['volume']

    # Calculate VWAP cumulatively (VWAP usually resets daily, but simplified here)
    df['cum_tpv'] = df['tpv'].cumsum()
    df['cum_volume'] = df['volume'].cumsum()
    df['vwap'] = df['cum_tpv'] / df['cum_volume']

    df.drop(columns=['typical_price', 'tpv', 'cum_tpv', 'cum_volume'], inplace=True, errors='ignore')
    return df

def calculate_indicators(df, atr_period=14, bb_period=20, kc_period=20, kc_multiplier=2.0):
    """Calculates TTM Squeeze, RVOL, RSI, ADX, SMA distances, and VWAP."""

    # ATR Calculation (Must be first for KC and Normalization)
    df['high_low'] = df['high'] - df['low']
    df['high_prev_close'] = abs(df['high'] - df['close'].shift(1))
    df['low_prev_close'] = abs(df['low'] - df['close'].shift(1))
    df['tr'] = df[['high_low', 'high_prev_close', 'low_prev_close']].max(axis=1)
    df['atr'] = df['tr'].ewm(alpha=1/atr_period, adjust=False).mean()

    # Bollinger Bands
    df['bb_sma'] = df['close'].rolling(window=bb_period).mean()
    df['bb_std'] = df['close'].rolling(window=bb_period).std()
    df['bb_upper'] = df['bb_sma'] + (df['bb_std'] * 2)
    df['bb_lower'] = df['bb_sma'] - (df['bb_std'] * 2)

    # Keltner Channels
    df['kc_sma'] = df['close'].rolling(window=kc_period).mean()
    df['kc_upper'] = df['kc_sma'] + (df['atr'] * kc_multiplier)
    df['kc_lower'] = df['kc_sma'] - (df['atr'] * kc_multiplier)

    # RVOL
    df['avg_volume'] = df['volume'].rolling(window=20).mean()
    df['rvol'] = df['volume'] / df['avg_volume']

    # Squeeze Status
    df['squeeze_on'] = (df['bb_lower'] > df['kc_lower']) & (df['bb_upper'] < df['kc_upper'])

    # NEW: Other Indicators
    df = calculate_rsi(df)
    df = calculate_adx(df)
    df = calculate_vwap(df)

    # SMA Distance Calculation
    df['sma_50'] = df['close'].rolling(window=50).mean()
    df['sma_200'] = df['close'].rolling(window=200).mean()

    df['dist_from_sma_50'] = (df['close'] - df['sma_50']) / df['atr']
    df['dist_from_sma_200'] = (df['close'] - df['sma_200']) / df['atr']

    # Drop intermediate columns
    df.drop(['high_low', 'high_prev_close', 'low_prev_close', 'tr',
             'avg_volume', 'bb_sma', 'bb_std', 'kc_sma', 'atr',
             'sma_50', 'sma_200'],
            axis=1, inplace=True, errors='ignore')

    return df

def load_and_resample_data(directory, symbol, timeframe_minutes):
    """Loads and resamples 1-minute data, returning None on error."""
    filename = f"{symbol}_minute.csv"
    filepath = os.path.join(directory, filename)
    if not os.path.exists(filepath):
        # Colab's default unzip behavior might put files directly in /content
        alt_filepath = os.path.join('/content', filename)
        if os.path.exists(alt_filepath):
             filepath = alt_filepath
        else:
            # print(f"Data file not found: {filepath}")
            return None

    # Use low_memory=False for better performance on larger CSVs
    df = pd.read_csv(filepath, index_col='date', parse_dates=True, low_memory=False)

    if timeframe_minutes == 1440:
        resample_period = '1D'
    else:
        resample_period = f'{timeframe_minutes}min'

    resampled_df = df.resample(resample_period).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    }).dropna()

    return resampled_df

def generate_training_data(primary_df, mtf_dfs, symbol, primary_timeframe_min):
    """
    Generates training data for the primary timeframe (3min), looking up MTF context.
    """
    training_data = []

    # Identify MTF contexts (Timeframes other than the primary one)
    mtf_timeframes = sorted([tf for tf in mtf_dfs.keys() if tf != primary_timeframe_min])

    for i in range(200, len(primary_df) - 1): # Start after 200 bars to ensure all indicators are populated
        signal = False
        direction = None

        was_in_squeeze = primary_df['squeeze_on'].iloc[i-1]
        is_in_squeeze = primary_df['squeeze_on'].iloc[i]

        current_close = primary_df['close'].iloc[i]
        current_vwap = primary_df['vwap'].iloc[i]

        # 1. TTM Squeeze Breakout Check
        is_bullish_breakout = was_in_squeeze and not is_in_squeeze and current_close > primary_df['bb_upper'].iloc[i]
        is_bearish_breakout = was_in_squeeze and not is_in_squeeze and current_close < primary_df['bb_lower'].iloc[i]

        # 2. RVOL Filter
        rvol_ok = primary_df['rvol'].iloc[i] > 2

        # 3. VWAP Filter
        vwap_ok = False
        if is_bullish_breakout:
            vwap_ok = current_close > current_vwap
            if vwap_ok:
                signal = True
                direction = 'long'
        elif is_bearish_breakout:
            vwap_ok = current_close < current_vwap
            if vwap_ok:
                signal = True
                direction = 'short'

        # Final Signal Check
        if signal and rvol_ok:

            # Capture base features
            features = {
                'symbol': symbol,
                'timeframe': primary_timeframe_min,
                'date': primary_df.index[i],
                'direction': direction,
                'rvol': primary_df['rvol'].iloc[i],
                'bb_width': primary_df['bb_upper'].iloc[i] - primary_df['bb_lower'].iloc[i],
                'kc_width': primary_df['kc_upper'].iloc[i] - primary_df['kc_lower'].iloc[i],
                'close_bb_dist': primary_df['close'].iloc[i] - primary_df['bb_upper'].iloc[i] if direction == 'long' else primary_df['close'].iloc[i] - primary_df['bb_lower'].iloc[i],
                'rsi': primary_df['rsi'].iloc[i],
                'adx': primary_df['adx'].iloc[i],
                'dist_from_sma_50': primary_df['dist_from_sma_50'].iloc[i],
                'dist_from_sma_200': primary_df['dist_from_sma_200'].iloc[i],
                'vwap_diff': primary_df['vwap'].iloc[i] - primary_df['close'].iloc[i]
            }

            # Capture MTF Confluence features
            current_time = primary_df.index[i]
            for tf_min in mtf_timeframes:
                tf_label = f'is_{tf_min}min_sqz' if tf_min != 1440 else 'is_1D_sqz'

                if tf_min == 1440:
                    lookup_time = current_time.normalize() # Start of the day
                else:
                    lookup_time = current_time.floor(f'{tf_min}min')

                # Look up the squeeze status in the higher timeframe DF
                try:
                    # Use index slicing for more robust lookup near floor time
                    mtf_bar = mtf_dfs[tf_min].loc[mtf_dfs[tf_min].index <= current_time].iloc[-1]
                    features[tf_label] = int(mtf_bar['squeeze_on'])
                except (KeyError, IndexError):
                    # If lookup fails or no earlier bar exists, treat as not in squeeze (0)
                    features[tf_label] = 0

            # Simulate the trade to get the outcome (label)
            # Check for sufficient future data (i+1 for entry, minimum i+2 for exit bar)
            if i + 1 >= len(primary_df):
                 continue # Skip if no entry bar is available

            entry_price = primary_df['open'].iloc[i+1]
            label = 0  # Default to loss

            # Risk/Reward (using the same 1:2 logic)
            if direction == 'long':
                risk = entry_price - primary_df['bb_lower'].iloc[i]
                stop_loss = entry_price - risk
                take_profit = entry_price + (2 * risk)
            else: # short
                risk = primary_df['bb_upper'].iloc[i] - entry_price
                stop_loss = entry_price + risk
                take_profit = entry_price - (2 * risk)

            # Look ahead to find the exit
            for j in range(i + 1, len(primary_df)):
                current_low = primary_df['low'].iloc[j]
                current_high = primary_df['high'].iloc[j]
                exit_found = False

                if direction == 'long':
                    if current_high >= take_profit:
                        label = 1
                        exit_found = True
                    elif current_low <= stop_loss:
                        label = 0
                        exit_found = True
                else: # short
                    if current_low <= take_profit:
                        label = 1
                        exit_found = True
                    elif current_high >= stop_loss:
                        label = 0
                        exit_found = True

                if exit_found:
                    features['outcome'] = label
                    training_data.append(features)
                    break

    return pd.DataFrame(training_data)

def process_single_symbol(symbol, directory, timeframes_to_test):
    """
    Loads ALL timeframes for a symbol, processes them, and generates training data
    from the PRIMARY timeframe (lowest TF) using the others for context.
    """
    primary_timeframe_min = min(timeframes_to_test)
    mtf_dfs = {}

    # print(f"--- START: Loading and calculating data for {symbol} ---")

    # 1. Load and Calculate Indicators for ALL Timeframes
    for timeframe in timeframes_to_test:
        df = load_and_resample_data(directory, symbol, timeframe)
        if df is not None:
            try:
                df = calculate_indicators(df)
                df.dropna(inplace=True)
                mtf_dfs[timeframe] = df
                # print(f"[{symbol}] Calculated features for {timeframe}min.")
            except Exception as e:
                print(f"Error calculating indicators for {symbol} ({timeframe}min): {e}")

    # 2. Generate Training Data (Only from the Primary TF)
    # Require at least one bar of each timeframe plus 200 bars for indicator lookback
    if primary_timeframe_min in mtf_dfs and len(mtf_dfs[primary_timeframe_min]) > 200:
        primary_df = mtf_dfs[primary_timeframe_min]
        try:
            symbol_dataset = generate_training_data(primary_df, mtf_dfs, symbol, primary_timeframe_min)

            if not symbol_dataset.empty:
                return symbol_dataset
            else:
                 pass # print(f"[{symbol}] Primary TF ({primary_timeframe_min}min) generated no data after filters.")
        except Exception as e:
             print(f"Error generating training data for {symbol}: {e}")

    return None

def load_symbols_from_dir(directory):
    """Extracts stock symbols from filenames in a directory."""
    csv_filenames = os.listdir(directory)
    pattern = r"(.*)_minute.csv"
    symbols = []
    for filename in csv_filenames:
        match = re.search(pattern, filename)
        if match:
            symbols.append(match.group(1))
    return symbols

# --- END: DATA GENERATION LOGIC ---

# --- START: MODEL TRAINING LOGIC (Combined from model_training.py) ---

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

def preprocess_and_split(data):
    """Handles time-series splitting, feature encoding, and scaling."""
    # 1. Define Features and Label
    X_features = [
        'rvol', 'bb_width', 'kc_width', 'close_bb_dist', 'rsi', 'adx',
        'dist_from_sma_50', 'dist_from_sma_200', 'vwap_diff',
        'is_5min_sqz', 'is_15min_sqz', 'is_30min_sqz', 'is_60min_sqz', 'is_1D_sqz'
    ]

    categorical_features = ['direction']
    all_features = X_features + categorical_features

    X = data[all_features]
    y = data['outcome']

    # 2. Time-Series Split (80% Train, 20% Test)
    test_size = 0.2
    split_index = int(len(X) * (1 - test_size))

    X_train_raw = X.iloc[:split_index]
    X_test_raw = X.iloc[split_index:]
    y_train = y.iloc[:split_index]
    y_test = y.iloc[split_index:]

    # 3. Preprocessing Pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='drop'
    )

    return X_train_raw, X_test_raw, y_train, y_test, preprocessor

def train_and_evaluate(X_train_raw, X_test_raw, y_train, y_test, preprocessor):
    """Trains a Random Forest Classifier and evaluates its performance."""

    # Scikit-learn Random Forest Classifier optimized for CPU cores (n_jobs=-1)
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=500,
            max_depth=15,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1               # Use all CPU cores!
        ))
    ])

    print("\n--- Starting Model Training (Leveraging all Colab CPU cores) ---")
    start_time = time.time()
    model.fit(X_train_raw, y_train)
    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f} seconds.")

    # Evaluation
    y_pred = model.predict(X_test_raw)
    y_proba = model.predict_proba(X_test_raw)[:, 1]

    print("\n--- Model Evaluation on Test Data ---")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Loss (0)', 'Win (1)']))

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix (Rows=Actual, Cols=Predicted):")
    print(conf_matrix)

    try:
        auc_score = roc_auc_score(y_test, y_proba)
        print(f"AUC Score: {auc_score:.4f}")
    except ValueError:
        print("AUC Score requires at least one sample of each class in the test set.")

    # Calculate Precision for the Win class (index 1)
    tp = conf_matrix[1, 1]
    fp = conf_matrix[0, 1]
    win_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    print(f"\nPrecision (Win Class): {win_precision:.4f}")

    # Save the model
    model_filepath = 'squeeze_classifier_model.pkl'
    joblib.dump(model, model_filepath)
    print(f"\nModel saved successfully as '{model_filepath}' in your Colab environment.")

    return model, win_precision

# --- END: MODEL TRAINING LOGIC ---

if __name__ == "__main__":

    # ----------------------------------------------------------------
    # 1. SETUP & DOWNLOAD DATA
    # ----------------------------------------------------------------

    # This function handles the shell command for download and unzip
    DATA_DIRECTORY = setup_colab_environment(DATA_URL)

    if DATA_DIRECTORY is None:
        print("\nFATAL ERROR: Could not set up data directory. Cannot proceed.")
    else:
        # ----------------------------------------------------------------
        # 2. DATA GENERATION PIPELINE
        # ----------------------------------------------------------------

        timeframes_to_test = [3, 5, 15, 30, 60, 1440] # 1440 = 1 Day
        output_path = "training_dataset.csv"
        primary_tf = min(timeframes_to_test)

        # Setup Output File and Header
        if os.path.exists(output_path):
             os.remove(output_path) # Delete old file to ensure clean run
             print(f"Existing file '{output_path}' deleted for fresh generation.")

        header_df = pd.DataFrame(columns=[
            'symbol', 'timeframe', 'date', 'direction', 'rvol',
            'bb_width', 'kc_width', 'close_bb_dist', 'rsi', 'adx',
            'dist_from_sma_50', 'dist_from_sma_200', 'vwap_diff',
            'is_5min_sqz', 'is_15min_sqz', 'is_30min_sqz', 'is_60min_sqz', 'is_1D_sqz',
            'outcome'
        ])
        header_df.to_csv(output_path, index=False)
        print(f"\nStarting data generation. Signals generated only on {primary_tf}min with MTF context.")

        all_symbols = load_symbols_from_dir(DATA_DIRECTORY)
        if not all_symbols:
            print("No stock data files found in the specified directory.")

        total_data_points = 0

        # Parallel Processing
        start_gen_time = time.time()
        with ProcessPoolExecutor(max_workers=None) as executor:

            future_to_symbol = {
                executor.submit(process_single_symbol, symbol, DATA_DIRECTORY, timeframes_to_test): symbol
                for symbol in all_symbols
            }

            for future in as_completed(future_to_symbol):
                symbol = future_to_symbol[future]
                try:
                    symbol_result_df = future.result()

                    if symbol_result_df is not None and not symbol_result_df.empty:
                        symbol_result_df.to_csv(output_path, mode='a', header=False, index=False)
                        total_data_points += len(symbol_result_df)
                        print(f"--- COMPLETE: {symbol} data ({len(symbol_result_df)} points) appended. Total: {total_data_points} ---")
                    # else: print(f"--- COMPLETE: {symbol} generated no training data. ---")

                except Exception as exc:
                    print(f"!!! Major error processing symbol {symbol}: {exc}")

        end_gen_time = time.time()
        print(f"\n--- Data generation complete in {end_gen_time - start_gen_time:.2f} seconds. ---")
        print(f"Total data points saved to {output_path}: {total_data_points}")

        # ----------------------------------------------------------------
        # 3. MODEL TRAINING PIPELINE
        # ----------------------------------------------------------------

        if total_data_points > 100:
            data = pd.read_csv(output_path)
            data['direction'] = data['direction'].astype(str)

            X_train_raw, X_test_raw, y_train, y_test, preprocessor = preprocess_and_split(data)

            # Train and Evaluate
            train_and_evaluate(X_train_raw, X_test_raw, y_train, y_test, preprocessor)
        else:
            print("\nSkipping training: Insufficient data points generated.")

Downloading data from URL...
Unzipping data into /content/NSE_STOCK_DATA...
Error during unzipping: File is not a zip file

FATAL ERROR: Could not set up data directory. Cannot proceed.


In [None]:
print("Training Completed")

In [None]:
import pandas as pd
import numpy as np
import os
import re
import time
import zipfile
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- COLAB SETUP & DATA DOWNLOAD ---
# Data URL provided by the user
# NOTE: YOU MUST REPLACE THIS WITH A FRESHLY GENERATED DOWNLOAD LINK FROM KAGGLE
DATA_URL = "https://storage.googleapis.com/kaggle-data-sets/2575525/12691112/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20251002%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20251002T080620Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=0c80f67373b4d541a02b9e9b2585185813fcbdffd2583841fc53206bdcea6f9b013f96313831630a1b93732f8664ec7ae92bb665ab25d00c707b4740a95d262e983ab4fc168fa23a5d62c27146b710c65c44385c486d194fb8385869ddc2a040d0a1269840c3996448f1be1ed8a735838fb94037b6862eb71438acb5e7a0a5844a57996c2c79bec966e025ab3093f2ce91f6a652b593c98f1dbb2ce86e46b9511cf5a7282f833fc8f3aed4c3b1224eb68599dbb7a8eee4319e2b488894eb1e7370338888d37987656571b9d62cab65e0118689d3e021ee7f74032b7466b7cdebba8a79e57d07f3cd341bb84367baf5a5680dc63c514627989c38140f1977a5eb"

def setup_colab_environment(url, zip_name="data_archive.zip", data_dir="/content/NSE_STOCK_DATA"):
    """Downloads data, unzips it, and creates the target directory structure."""

    # Use curl with -L (follow redirects) for better handling of signed URLs
    print(f"Downloading data from URL using curl...")
    # Execute the download command
    download_command = f"curl -L -o {zip_name} \"{url}\""
    return_code = os.system(download_command)

    if return_code != 0 or not os.path.exists(zip_name):
        print(f"\nERROR: Download command failed (Curl exit code {return_code}). The signed URL may have expired.")

    # Create the directory to extract data into
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # Unzip the file
    print(f"\nUnzipping data into {data_dir}...")
    try:
        if not os.path.exists(zip_name) or not zipfile.is_zipfile(zip_name):
             # This means the URL was likely expired and downloaded an HTML error page.
             print("CRITICAL ERROR: File downloaded but is not a valid ZIP file. The download URL likely expired.")
             return None

        with zipfile.ZipFile(zip_name, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print("Download and extraction complete.")
        # Clean up the downloaded zip file after successful extraction
        os.remove(zip_name)
    except Exception as e:
        print(f"Error during unzipping: {e}")
        return None

    # Search for files in the extracted directory to confirm path
    # Colab path: /content/NSE_STOCK_DATA
    # If the zip extracted to a subdirectory (e.g., /content/NSE_STOCK_DATA/bundle/archive)
    # we need to adjust the path to the actual location of the CSV files.

    # Simple check for the existence of files like '*_minute.csv'
    if not any(f.endswith('_minute.csv') for f in os.listdir(data_dir)):
        # If no files found, try traversing one level deeper (common Kaggle/Google Storage behavior)
        print("CSV files not found directly in the target folder. Searching deeper...")
        for root, dirs, files in os.walk(data_dir):
            if any(f.endswith('_minute.csv') for f in files):
                data_dir = root
                print(f"Found CSV files in new directory: {data_dir}")
                break

    return data_dir

# --- START: DATA GENERATION LOGIC (Combined from generate_dataset_optimized.py) ---
# ... (All other functions are unchanged)

def calculate_rsi(df, window=14):
    """Calculates the Relative Strength Index (RSI)."""
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    RS = np.where(loss == 0, np.inf, gain / loss)
    df['rsi'] = 100 - (100 / (1 + RS))
    return df

def calculate_adx(df, window=14):
    """Calculates the Average Directional Index (ADX)."""
    df['high_low'] = df['high'] - df['low']
    df['high_prev_close'] = abs(df['high'] - df['close'].shift(1))
    df['low_prev_close'] = abs(df['low'] - df['close'].shift(1))
    df['tr'] = df[['high_low', 'high_prev_close', 'low_prev_close']].max(axis=1)

    df['plus_dm'] = np.where((df['high'] > df['high'].shift(1)) & (df['high'] - df['high'].shift(1) > df['low'].shift(1) - df['low']), df['high'] - df['high'].shift(1), 0)
    df['minus_dm'] = np.where((df['low'].shift(1) > df['low']) & (df['low'].shift(1) - df['low'] > df['high'] - df['high'].shift(1)), df['low'].shift(1) - df['low'], 0)

    temp_atr = df['tr'].ewm(alpha=1/window, adjust=False).mean()
    df['plus_di'] = (df['plus_dm'].ewm(alpha=1/window, adjust=False).mean() / temp_atr) * 100
    df['minus_di'] = (df['minus_dm'].ewm(alpha=1/window, adjust=False).mean() / temp_atr) * 100

    df['dx'] = abs(df['plus_di'] - df['minus_di']) / (df['plus_di'] + df['minus_di']) * 100
    df['adx'] = df['dx'].ewm(alpha=1/window, adjust=False).mean()

    df.drop(columns=['high_low', 'high_prev_close', 'low_prev_close', 'tr', 'plus_dm', 'minus_dm', 'plus_di', 'minus_di', 'dx'], inplace=True, errors='ignore')
    return df

def calculate_vwap(df):
    """Calculates the Volume Weighted Average Price (VWAP) for the entire dataset."""
    df['typical_price'] = (df['high'] + df['low'] + df['close']) / 3
    df['tpv'] = df['typical_price'] * df['volume']

    # Calculate VWAP cumulatively (VWAP usually resets daily, but simplified here)
    df['cum_tpv'] = df['tpv'].cumsum()
    df['cum_volume'] = df['volume'].cumsum()
    df['vwap'] = df['cum_tpv'] / df['cum_volume']

    df.drop(columns=['typical_price', 'tpv', 'cum_tpv', 'cum_volume'], inplace=True, errors='ignore')
    return df

def calculate_indicators(df, atr_period=14, bb_period=20, kc_period=20, kc_multiplier=2.0):
    """Calculates TTM Squeeze, RVOL, RSI, ADX, SMA distances, and VWAP."""

    # ATR Calculation (Must be first for KC and Normalization)
    df['high_low'] = df['high'] - df['low']
    df['high_prev_close'] = abs(df['high'] - df['close'].shift(1))
    df['low_prev_close'] = abs(df['low'] - df['close'].shift(1))
    df['tr'] = df[['high_low', 'high_prev_close', 'low_prev_close']].max(axis=1)
    df['atr'] = df['tr'].ewm(alpha=1/atr_period, adjust=False).mean()

    # Bollinger Bands
    df['bb_sma'] = df['close'].rolling(window=bb_period).mean()
    df['bb_std'] = df['close'].rolling(window=bb_period).std()
    df['bb_upper'] = df['bb_sma'] + (df['bb_std'] * 2)
    df['bb_lower'] = df['bb_sma'] - (df['bb_std'] * 2)

    # Keltner Channels
    df['kc_sma'] = df['close'].rolling(window=kc_period).mean()
    df['kc_upper'] = df['kc_sma'] + (df['atr'] * kc_multiplier)
    df['kc_lower'] = df['kc_sma'] - (df['atr'] * kc_multiplier)

    # RVOL
    df['avg_volume'] = df['volume'].rolling(window=20).mean()
    df['rvol'] = df['volume'] / df['avg_volume']

    # Squeeze Status
    df['squeeze_on'] = (df['bb_lower'] > df['kc_lower']) & (df['bb_upper'] < df['kc_upper'])

    # NEW: Other Indicators
    df = calculate_rsi(df)
    df = calculate_adx(df)
    df = calculate_vwap(df)

    # SMA Distance Calculation
    df['sma_50'] = df['close'].rolling(window=50).mean()
    df['sma_200'] = df['close'].rolling(window=200).mean()

    df['dist_from_sma_50'] = (df['close'] - df['sma_50']) / df['atr']
    df['dist_from_sma_200'] = (df['close'] - df['sma_200']) / df['atr']

    # Drop intermediate columns
    df.drop(['high_low', 'high_prev_close', 'low_prev_close', 'tr',
             'avg_volume', 'bb_sma', 'bb_std', 'kc_sma', 'atr',
             'sma_50', 'sma_200'],
            axis=1, inplace=True, errors='ignore')

    return df

def load_and_resample_data(directory, symbol, timeframe_minutes):
    """Loads and resamples 1-minute data, returning None on error."""
    filename = f"{symbol}_minute.csv"
    filepath = os.path.join(directory, filename)
    if not os.path.exists(filepath):
        # Colab's default unzip behavior might put files directly in /content
        alt_filepath = os.path.join('/content', filename)
        if os.path.exists(alt_filepath):
             filepath = alt_filepath
        else:
            # print(f"Data file not found: {filepath}")
            return None

    # Use low_memory=False for better performance on larger CSVs
    df = pd.read_csv(filepath, index_col='date', parse_dates=True, low_memory=False)

    if timeframe_minutes == 1440:
        resample_period = '1D'
    else:
        resample_period = f'{timeframe_minutes}min'

    resampled_df = df.resample(resample_period).agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    }).dropna()

    return resampled_df

def generate_training_data(primary_df, mtf_dfs, symbol, primary_timeframe_min):
    """
    Generates training data for the primary timeframe (3min), looking up MTF context.
    """
    training_data = []

    # Identify MTF contexts (Timeframes other than the primary one)
    mtf_timeframes = sorted([tf for tf in mtf_dfs.keys() if tf != primary_timeframe_min])

    for i in range(200, len(primary_df) - 1): # Start after 200 bars to ensure all indicators are populated
        signal = False
        direction = None

        was_in_squeeze = primary_df['squeeze_on'].iloc[i-1]
        is_in_squeeze = primary_df['squeeze_on'].iloc[i]

        current_close = primary_df['close'].iloc[i]
        current_vwap = primary_df['vwap'].iloc[i]

        # 1. TTM Squeeze Breakout Check
        is_bullish_breakout = was_in_squeeze and not is_in_squeeze and current_close > primary_df['bb_upper'].iloc[i]
        is_bearish_breakout = was_in_squeeze and not is_in_squeeze and current_close < primary_df['bb_lower'].iloc[i]

        # 2. RVOL Filter
        rvol_ok = primary_df['rvol'].iloc[i] > 2

        # 3. VWAP Filter
        vwap_ok = False
        if is_bullish_breakout:
            vwap_ok = current_close > current_vwap
            if vwap_ok:
                signal = True
                direction = 'long'
        elif is_bearish_breakout:
            vwap_ok = current_close < current_vwap
            if vwap_ok:
                signal = True
                direction = 'short'

        # Final Signal Check
        if signal and rvol_ok:

            # Capture base features
            features = {
                'symbol': symbol,
                'timeframe': primary_timeframe_min,
                'date': primary_df.index[i],
                'direction': direction,
                'rvol': primary_df['rvol'].iloc[i],
                'bb_width': primary_df['bb_upper'].iloc[i] - primary_df['bb_lower'].iloc[i],
                'kc_width': primary_df['kc_upper'].iloc[i] - primary_df['kc_lower'].iloc[i],
                'close_bb_dist': primary_df['close'].iloc[i] - primary_df['bb_upper'].iloc[i] if direction == 'long' else primary_df['close'].iloc[i] - primary_df['bb_lower'].iloc[i],
                'rsi': primary_df['rsi'].iloc[i],
                'adx': primary_df['adx'].iloc[i],
                'dist_from_sma_50': primary_df['dist_from_sma_50'].iloc[i],
                'dist_from_sma_200': primary_df['dist_from_sma_200'].iloc[i],
                'vwap_diff': primary_df['vwap'].iloc[i] - primary_df['close'].iloc[i]
            }

            # Capture MTF Confluence features
            current_time = primary_df.index[i]
            for tf_min in mtf_timeframes:
                tf_label = f'is_{tf_min}min_sqz' if tf_min != 1440 else 'is_1D_sqz'

                if tf_min == 1440:
                    lookup_time = current_time.normalize() # Start of the day
                else:
                    lookup_time = current_time.floor(f'{tf_min}min')

                # Look up the squeeze status in the higher timeframe DF
                try:
                    # Use index slicing for more robust lookup near floor time
                    mtf_bar = mtf_dfs[tf_min].loc[mtf_dfs[tf_min].index <= current_time].iloc[-1]
                    features[tf_label] = int(mtf_bar['squeeze_on'])
                except (KeyError, IndexError):
                    # If lookup fails or no earlier bar exists, treat as not in squeeze (0)
                    features[tf_label] = 0

            # Simulate the trade to get the outcome (label)
            # Check for sufficient future data (i+1 for entry, minimum i+2 for exit bar)
            if i + 1 >= len(primary_df):
                 continue # Skip if no entry bar is available

            entry_price = primary_df['open'].iloc[i+1]
            label = 0  # Default to loss

            # Risk/Reward (using the same 1:2 logic)
            if direction == 'long':
                risk = entry_price - primary_df['bb_lower'].iloc[i]
                stop_loss = entry_price - risk
                take_profit = entry_price + (2 * risk)
            else: # short
                risk = primary_df['bb_upper'].iloc[i] - entry_price
                stop_loss = entry_price + risk
                take_profit = entry_price - (2 * risk)

            # Look ahead to find the exit
            for j in range(i + 1, len(primary_df)):
                current_low = primary_df['low'].iloc[j]
                current_high = primary_df['high'].iloc[j]
                exit_found = False

                if direction == 'long':
                    if current_high >= take_profit:
                        label = 1
                        exit_found = True
                    elif current_low <= stop_loss:
                        label = 0
                        exit_found = True
                else: # short
                    if current_low <= take_profit:
                        label = 1
                        exit_found = True
                    elif current_high >= stop_loss:
                        label = 0
                        exit_found = True

                if exit_found:
                    features['outcome'] = label
                    training_data.append(features)
                    break

    return pd.DataFrame(training_data)

def process_single_symbol(symbol, directory, timeframes_to_test):
    """
    Loads ALL timeframes for a symbol, processes them, and generates training data
    from the PRIMARY timeframe (lowest TF) using the others for context.
    """
    primary_timeframe_min = min(timeframes_to_test)
    mtf_dfs = {}

    # print(f"--- START: Loading and calculating data for {symbol} ---")

    # 1. Load and Calculate Indicators for ALL Timeframes
    for timeframe in timeframes_to_test:
        df = load_and_resample_data(directory, symbol, timeframe)
        if df is not None:
            try:
                df = calculate_indicators(df)
                df.dropna(inplace=True)
                mtf_dfs[timeframe] = df
                # print(f"[{symbol}] Calculated features for {timeframe}min.")
            except Exception as e:
                print(f"Error calculating indicators for {symbol} ({timeframe}min): {e}")

    # 2. Generate Training Data (Only from the Primary TF)
    # Require at least one bar of each timeframe plus 200 bars for indicator lookback
    if primary_timeframe_min in mtf_dfs and len(mtf_dfs[primary_timeframe_min]) > 200:
        primary_df = mtf_dfs[primary_timeframe_min]
        try:
            symbol_dataset = generate_training_data(primary_df, mtf_dfs, symbol, primary_timeframe_min)

            if not symbol_dataset.empty:
                return symbol_dataset
            else:
                 pass # print(f"[{symbol}] Primary TF ({primary_timeframe_min}min) generated no data after filters.")
        except Exception as e:
             print(f"Error generating training data for {symbol}: {e}")

    return None

def load_symbols_from_dir(directory):
    """Extracts stock symbols from filenames in a directory."""
    csv_filenames = os.listdir(directory)
    pattern = r"(.*)_minute.csv"
    symbols = []
    for filename in csv_filenames:
        match = re.search(pattern, filename)
        if match:
            symbols.append(match.group(1))
    return symbols

# --- END: DATA GENERATION LOGIC ---

# --- START: MODEL TRAINING LOGIC (Combined from model_training.py) ---

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

def preprocess_and_split(data):
    """Handles time-series splitting, feature encoding, and scaling."""
    # 1. Define Features and Label
    X_features = [
        'rvol', 'bb_width', 'kc_width', 'close_bb_dist', 'rsi', 'adx',
        'dist_from_sma_50', 'dist_from_sma_200', 'vwap_diff',
        'is_5min_sqz', 'is_15min_sqz', 'is_30min_sqz', 'is_60min_sqz', 'is_1D_sqz'
    ]

    categorical_features = ['direction']
    all_features = X_features + categorical_features

    X = data[all_features]
    y = data['outcome']

    # 2. Time-Series Split (80% Train, 20% Test)
    test_size = 0.2
    split_index = int(len(X) * (1 - test_size))

    X_train_raw = X.iloc[:split_index]
    X_test_raw = X.iloc[split_index:]
    y_train = y.iloc[:split_index]
    y_test = y.iloc[split_index:]

    # 3. Preprocessing Pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='drop'
    )

    return X_train_raw, X_test_raw, y_train, y_test, preprocessor

def train_and_evaluate(X_train_raw, X_test_raw, y_train, y_test, preprocessor):
    """Trains a Random Forest Classifier and evaluates its performance."""

    # Scikit-learn Random Forest Classifier optimized for CPU cores (n_jobs=-1)
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=500,
            max_depth=15,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1               # Use all CPU cores!
        ))
    ])

    print("\n--- Starting Model Training (Leveraging all Colab CPU cores) ---")
    start_time = time.time()
    model.fit(X_train_raw, y_train)
    end_time = time.time()
    print(f"Training complete in {end_time - start_time:.2f} seconds.")

    # Evaluation
    y_pred = model.predict(X_test_raw)
    y_proba = model.predict_proba(X_test_raw)[:, 1]

    print("\n--- Model Evaluation on Test Data ---")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Loss (0)', 'Win (1)']))

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix (Rows=Actual, Cols=Predicted):")
    print(conf_matrix)

    try:
        auc_score = roc_auc_score(y_test, y_proba)
        print(f"AUC Score: {auc_score:.4f}")
    except ValueError:
        print("AUC Score requires at least one sample of each class in the test set.")

    # Calculate Precision for the Win class (index 1)
    tp = conf_matrix[1, 1]
    fp = conf_matrix[0, 1]
    win_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    print(f"\nPrecision (Win Class): {win_precision:.4f}")

    # Save the model
    model_filepath = 'squeeze_classifier_model.pkl'
    joblib.dump(model, model_filepath)
    print(f"\nModel saved successfully as '{model_filepath}' in your Colab environment.")

    return model, win_precision

# --- END: MODEL TRAINING LOGIC ---

if __name__ == "__main__":

    # ----------------------------------------------------------------
    # 1. SETUP & DOWNLOAD DATA
    # ----------------------------------------------------------------

    # This function handles the shell command for download and unzip
    DATA_DIRECTORY = setup_colab_environment(DATA_URL)

    if DATA_DIRECTORY is None:
        print("\nFATAL ERROR: Could not set up data directory. Cannot proceed.")
    else:
        # ----------------------------------------------------------------
        # 2. DATA GENERATION PIPELINE
        # ----------------------------------------------------------------

        timeframes_to_test = [3, 5, 15, 30, 60, 1440] # 1440 = 1 Day
        output_path = "training_dataset.csv"
        primary_tf = min(timeframes_to_test)

        # Setup Output File and Header
        if os.path.exists(output_path):
             os.remove(output_path) # Delete old file to ensure clean run
             print(f"Existing file '{output_path}' deleted for fresh generation.")

        header_df = pd.DataFrame(columns=[
            'symbol', 'timeframe', 'date', 'direction', 'rvol',
            'bb_width', 'kc_width', 'close_bb_dist', 'rsi', 'adx',
            'dist_from_sma_50', 'dist_from_sma_200', 'vwap_diff',
            'is_5min_sqz', 'is_15min_sqz', 'is_30min_sqz', 'is_60min_sqz', 'is_1D_sqz',
            'outcome'
        ])
        header_df.to_csv(output_path, index=False)
        print(f"\nStarting data generation. Signals generated only on {primary_tf}min with MTF context.")

        all_symbols = load_symbols_from_dir(DATA_DIRECTORY)
        if not all_symbols:
            print("No stock data files found in the specified directory.")

        total_data_points = 0

        # Parallel Processing
        start_gen_time = time.time()
        with ProcessPoolExecutor(max_workers=None) as executor:

            future_to_symbol = {
                executor.submit(process_single_symbol, symbol, DATA_DIRECTORY, timeframes_to_test): symbol
                for symbol in all_symbols
            }

            for future in as_completed(future_to_symbol):
                symbol = future_to_symbol[future]
                try:
                    symbol_result_df = future.result()

                    if symbol_result_df is not None and not symbol_result_df.empty:
                        symbol_result_df.to_csv(output_path, mode='a', header=False, index=False)
                        total_data_points += len(symbol_result_df)
                        print(f"--- COMPLETE: {symbol} data ({len(symbol_result_df)} points) appended. Total: {total_data_points} ---")
                    # else: print(f"--- COMPLETE: {symbol} generated no training data. ---")

                except Exception as exc:
                    print(f"!!! Major error processing symbol {symbol}: {exc}")

        end_gen_time = time.time()
        print(f"\n--- Data generation complete in {end_gen_time - start_gen_time:.2f} seconds. ---")
        print(f"Total data points saved to {output_path}: {total_data_points}")

        # ----------------------------------------------------------------
        # 3. MODEL TRAINING PIPELINE
        # ----------------------------------------------------------------

        if total_data_points > 100:
            data = pd.read_csv(output_path)
            data['direction'] = data['direction'].astype(str)

            X_train_raw, X_test_raw, y_train, y_test, preprocessor = preprocess_and_split(data)

            # Train and Evaluate
            train_and_evaluate(X_train_raw, X_test_raw, y_train, y_test, preprocessor)
        else:
            print("\nSkipping training: Insufficient data points generated.")


Downloading data from URL using curl...

Unzipping data into /content/NSE_STOCK_DATA...
Download and extraction complete.

Starting data generation. Signals generated only on 3min with MTF context.
--- COMPLETE: PAYTM data (492 points) appended. Total: 492 ---
--- COMPLETE: STARHEALTH data (408 points) appended. Total: 900 ---
--- COMPLETE: NEWGEN data (713 points) appended. Total: 1613 ---
--- COMPLETE: GRAPHITE data (1208 points) appended. Total: 2821 ---
--- COMPLETE: GRANULES data (1267 points) appended. Total: 4088 ---
--- COMPLETE: PFIZER data (958 points) appended. Total: 5046 ---
--- COMPLETE: DOMS data (175 points) appended. Total: 5221 ---
--- COMPLETE: ALKEM data (869 points) appended. Total: 6090 ---
--- COMPLETE: BHARTIARTL data (873 points) appended. Total: 6963 ---
--- COMPLETE: CIEINDIA data (1024 points) appended. Total: 7987 ---
--- COMPLETE: SUPREMEIND data (865 points) appended. Total: 8852 ---
--- COMPLETE: BALRAMCHIN data (1333 points) appended. Total: 10185 ---
-

⏰# New Section AFTER COMPLETION

In [2]:
import os
import zipfile
import re
import numpy as np
import pandas as pd
import joblib
import xgboost as xgb
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# --- Colab Setup and Data Configuration ---
# 1. Google Drive Path for permanent storage

# 2. **REQUIRED: Update this with a fresh, unexpired link from the Kaggle dataset page**
# NOTE: This link is only used if the data is NOT found in Google Drive.
DATA_URL = "https://storage.googleapis.com/kaggle-data-sets/2575525/12691112/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20251001%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20251001T235419Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=0f9c7fd81c9801a691e9bed7879114c4c122987a0ed10445403f0dae54d50d333adaa1d0bf0af42b95facb7860a6d3d61a866409b2574ba9223c3c5ae97aea974508e75e9414c6ab9f021e4e30b58b642902734bd4641f9592967f5caa4e92d7e521481da9595f4a2b30c0d6b6dd48a8e07d357386ae3594335aa2255f2c7862398e7d11086c46cca0e9698bdcbca9ae6b0a583f076699989cfbaef2e5486873e97a01f029f8545f502354ce75139f01362260f543101ed6f32632452054831e820c7e67846d4800d69092b482c9a8992f1b509c99d4774c8e419c8c97b61b25f0ae591881ee45cd4e2b55779ba8ad98d7fd2a9cf3619926937851939e0dad53"
DATA_DIR="/content/NSE_STOCK_DATA" #/content/NSE_STOCK_DATA
# --- Strategy Parameters ---
# Primary timeframe for signal generation. Features are looked up relative to this TF.
PRIMARY_TIMEFRAME = 3
# All timeframes to load, calculate indicators for, and use for MTF confluence lookup
ALL_TIMEFRAMES = [3, 5, 15, 30, 60, 1440] # 1440min = 1 Day
ATR_PERIOD = 14
BB_PERIOD = 20
KC_PERIOD = 20
KC_MULTIPLIER = 2.0
RVOL_THRESHOLD = 2.0 # Strict RVOL filter

# --- Helper Functions (Indicators) ---

def calculate_vwap(df):
    """Calculates Volume Weighted Average Price (VWAP) for the trading day."""
    df['date_only'] = df.index.date
    # Calculate typical price (High + Low + Close) / 3
    df['tp'] = (df['high'] + df['low'] + df['close']) / 3
    # Group by trading day
    daily_groups = df.groupby('date_only')

    # Calculate VWAP cumulatively for each day
    df['tp_vol'] = df['tp'] * df['volume']

    # Cumulative Sums for each day
    df['cumulative_tp_vol'] = daily_groups['tp_vol'].cumsum()
    df['cumulative_volume'] = daily_groups['volume'].cumsum()

    # VWAP = Cumulative(TP * Volume) / Cumulative(Volume)
    df['vwap'] = df['cumulative_tp_vol'] / df['cumulative_volume']

    df.drop(columns=['date_only', 'tp', 'tp_vol', 'cumulative_tp_vol', 'cumulative_volume'], inplace=True)
    return df

def calculate_rsi(df, period=14):
    """Calculates Relative Strength Index (RSI)."""
    delta = df['close'].diff()
    gain = (delta.where(delta > 0, 0)).ewm(span=period, min_periods=period).mean()
    loss = (-delta.where(delta < 0, 0)).ewm(span=period, min_periods=period).mean()
    rs = gain / loss
    df['rsi'] = 100 - (100 / (1 + rs))
    return df

def calculate_adx(df, period=14):
    """Calculates Average Directional Index (ADX)."""
    # Calculate True Range (TR)
    high_minus_low = df['high'] - df['low']
    high_minus_prev_close = abs(df['high'] - df['close'].shift(1))
    low_minus_prev_close = abs(df['low'] - df['close'].shift(1))
    df['tr'] = pd.DataFrame({'a': high_minus_low, 'b': high_minus_prev_close, 'c': low_minus_prev_close}).max(axis=1)

    # Calculate ATR (Average True Range)
    df['atr'] = df['tr'].ewm(span=period, adjust=False).mean()

    # Calculate Directional Movement (+DM and -DM)
    high_diff = df['high'].diff()
    low_diff = df['low'].diff()

    df['dm_plus'] = np.where(high_diff > low_diff, np.maximum(high_diff, 0), 0)
    df['dm_minus'] = np.where(low_diff > high_diff, np.maximum(low_diff, 0), 0)

    # Calculate Smoothed Directional Movement
    df['di_plus'] = (df['dm_plus'].ewm(span=period, adjust=False).mean() / df['atr']) * 100
    df['di_minus'] = (df['dm_minus'].ewm(span=period, adjust=False).mean() / df['atr']) * 100

    # Calculate Directional Index (DX)
    df['dx'] = abs(df['di_plus'] - df['di_minus']) / (df['di_plus'] + df['di_minus']) * 100

    # Calculate ADX (Average Directional Index)
    df['adx'] = df['dx'].ewm(span=period, adjust=False).mean()

    df.drop(columns=['tr', 'dm_plus', 'dm_minus', 'di_plus', 'di_minus', 'dx'], inplace=True, errors='ignore')
    return df

def calculate_indicators(df, atr_period=ATR_PERIOD, bb_period=BB_PERIOD, kc_period=KC_PERIOD, kc_multiplier=KC_MULTIPLIER):
    """
    Calculates primary indicators including ATR, BB, KC, RVOL, RSI, ADX, and SMAs.
    """
    # 1. True Range and ATR (Needed for KC and Normalization)
    df['high_low'] = df['high'] - df['low']
    df['high_prev_close'] = abs(df['high'] - df['close'].shift(1))
    df['low_prev_close'] = abs(df['low'] - df['close'].shift(1))
    df['tr'] = df[['high_low', 'high_prev_close', 'low_prev_close']].max(axis=1)
    df['atr'] = df['tr'].ewm(alpha=1/atr_period, adjust=False).mean()

    # 2. Bollinger Bands (BB)
    df['bb_sma'] = df['close'].rolling(window=bb_period).mean()
    df['bb_std'] = df['close'].rolling(window=bb_period).std()
    df['bb_upper'] = df['bb_sma'] + (df['bb_std'] * 2)
    df['bb_lower'] = df['bb_sma'] - (df['bb_std'] * 2)

    # 3. Keltner Channels (KC)
    df['kc_sma'] = df['close'].rolling(window=kc_period).mean()
    df['kc_upper'] = df['kc_sma'] + (df['atr'] * kc_multiplier)
    df['kc_lower'] = df['kc_sma'] - (df['atr'] * kc_multiplier)

    # 4. Squeeze Status
    df['squeeze_on'] = (df['bb_lower'] > df['kc_lower']) & (df['bb_upper'] < df['kc_upper'])

    # 5. Relative Volume (RVOL)
    df['avg_volume'] = df['volume'].rolling(window=20).mean()
    df['rvol'] = df['volume'] / df['avg_volume']

    # 6. RSI and ADX
    df = calculate_rsi(df, period=14)
    df = calculate_adx(df, period=14)

    # 7. SMAs and Normalized Distance (Requires 'atr' to be calculated first)
    df['sma_50'] = df['close'].rolling(window=50).mean()
    df['sma_200'] = df['close'].rolling(window=200).mean()
    # Normalized distance from MA to close, normalized by ATR
    df['dist_from_sma_50'] = (df['close'] - df['sma_50']) / df['atr']
    df['dist_from_sma_200'] = (df['close'] - df['sma_200']) / df['atr']

    # 8. VWAP
    df = calculate_vwap(df)

    # Drop intermediate columns
    cols_to_drop = ['high_low', 'high_prev_close', 'low_prev_close', 'tr', 'avg_volume', 'bb_sma', 'bb_std', 'kc_sma', 'sma_50', 'sma_200']
    df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True, errors='ignore')

    return df

# --- Data Loading and Resampling ---

def load_and_resample_data(filepath, timeframe_minutes):
    """
    Loads 1-minute data and resamples it to the specified timeframe.
    """
    try:
        # Load 1-minute data
        df = pd.read_csv(filepath, index_col='date', parse_dates=True)

        # Resample to the target timeframe
        resample_period = f'{timeframe_minutes}min'
        resampled_df = df.resample(resample_period).agg({
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        }).dropna()

        return resampled_df
    except Exception as e:
        print(f"Error loading/resampling {filepath} to {timeframe_minutes}min: {e}")
        return None

# --- Training Data Generation (Core Logic) ---

def generate_training_data(df, symbol, timeframe, mtf_data):
    """
    Generates a DataFrame of training data points incorporating MTF and VWAP filters.
    """
    training_data = []

    # Get the higher timeframe DataFrames for MTF lookup
    df_5m = mtf_data.get(5)
    df_15m = mtf_data.get(15)
    df_30m = mtf_data.get(30)
    df_60m = mtf_data.get(60)
    df_1D = mtf_data.get(1440)

    # Note: df is the PRIMARY_TIMEFRAME (e.g., 3m)

    for i in range(1, len(df)):

        # --- Squeeze and Breakout Detection ---
        signal = False
        direction = None

        was_in_squeeze = df['squeeze_on'].iloc[i-1]
        is_in_squeeze = df['squeeze_on'].iloc[i]

        current_date = df.index[i]

        # Bullish Breakout
        if was_in_squeeze and not is_in_squeeze and df['close'].iloc[i] > df['bb_upper'].iloc[i]:
            direction = 'long'
        # Bearish Breakout
        elif was_in_squeeze and not is_in_squeeze and df['close'].iloc[i] < df['bb_lower'].iloc[i]:
            direction = 'short'

        # --- Combined Entry Filters (RVOL & VWAP) ---
        if direction:
            # 1. RVOL > 2.0 Filter
            rvol_ok = df['rvol'].iloc[i] >= RVOL_THRESHOLD

            # 2. VWAP Filter
            if direction == 'long':
                vwap_ok = df['close'].iloc[i] > df['vwap'].iloc[i]
            else: # short
                vwap_ok = df['close'].iloc[i] < df['vwap'].iloc[i]

            signal = rvol_ok and vwap_ok

        # --- Final Signal and Trade Simulation ---
        if signal and i + 1 < len(df):

            # --- MTF Feature Lookup ---

            def get_mtf_squeeze(high_df, current_time):
                if high_df is not None and not high_df.empty:
                    # Use searchsorted for efficient time-based lookup
                    idx = high_df.index.searchsorted(current_time, side='right') - 1
                    if idx >= 0 and idx < len(high_df):
                        return high_df['squeeze_on'].iloc[idx].astype(int)
                return 0 # Default to 0 (False) if data is missing or lookup fails

            mtf_sqz_5m = get_mtf_squeeze(df_5m, current_date)
            mtf_sqz_15m = get_mtf_squeeze(df_15m, current_date)
            mtf_sqz_30m = get_mtf_squeeze(df_30m, current_date)
            mtf_sqz_60m = get_mtf_squeeze(df_60m, current_date)
            mtf_sqz_1D = get_mtf_squeeze(df_1D, current_date)

            # Capture features at the time of the signal
            features = {
                'symbol': symbol,
                'timeframe': timeframe,
                'date': current_date,
                'direction': direction,
                'rvol': df['rvol'].iloc[i],
                'bb_width': df['bb_upper'].iloc[i] - df['bb_lower'].iloc[i],
                'kc_width': df['kc_upper'].iloc[i] - df['kc_lower'].iloc[i],
                # New Technical Features
                'rsi': df['rsi'].iloc[i],
                'adx': df['adx'].iloc[i],
                'dist_from_sma_50': df['dist_from_sma_50'].iloc[i],
                'dist_from_sma_200': df['dist_from_sma_200'].iloc[i],
                # New MTF Confluence Features
                'is_5min_sqz': mtf_sqz_5m,
                'is_15min_sqz': mtf_sqz_15m,
                'is_30min_sqz': mtf_sqz_30m,
                'is_60min_sqz': mtf_sqz_60m,
                'is_1D_sqz': mtf_sqz_1D,
                # VWAP Distance (Normalized)
                'dist_from_vwap': (df['close'].iloc[i] - df['vwap'].iloc[i]) / df['atr'].iloc[i]
            }

            # Simulate the trade (1:2 Risk/Reward) to get the outcome (label)
            entry_price = df['open'].iloc[i+1]
            label = 0  # Default to loss

            # Calculate risk, SL, TP based on breakout candle's ATR
            risk = df['atr'].iloc[i] # Use ATR of the signal bar as risk unit

            if direction == 'long':
                stop_loss = entry_price - risk
                take_profit = entry_price + (2 * risk)
            else: # short
                stop_loss = entry_price + risk
                take_profit = entry_price - (2 * risk)

            # Look ahead to find the exit (using next bar's high/low)
            for j in range(i + 1, len(df)):
                current_low = df['low'].iloc[j]
                current_high = df['high'].iloc[j]
                exit_found = False

                if direction == 'long':
                    if current_high >= take_profit:
                        label = 1 # Success (Target hit)
                        exit_found = True
                    elif current_low <= stop_loss:
                        label = 0 # Failure (Stop Loss hit)
                        exit_found = True
                else: # short
                    if current_low <= take_profit:
                        label = 1 # Success (Target hit)
                        exit_found = True
                    elif current_high >= stop_loss:
                        label = 0 # Failure (Stop Loss hit)
                        exit_found = True

                if exit_found:
                    features['outcome'] = label
                    training_data.append(features)
                    break

    return pd.DataFrame(training_data)

# --- Parallel Processing Worker ---

def process_single_symbol(symbol, data_directory):
    """
    Worker function to process a single symbol across all timeframes and generate data.
    Returns a list of DataFrames (one per timeframe).
    """
    # print(f"Starting processing for {symbol}...") # Suppressing print for cleaner output

    filename = f"{symbol}_minute.csv"
    filepath = os.path.join(data_directory, filename)

    if not os.path.exists(filepath):
        # print(f"Skipping {symbol}: Data file not found.")
        return []

    # 1. Load and Prepare ALL Timeframes for MTF Lookups
    mtf_data = {}
    for tf in ALL_TIMEFRAMES:
        df = load_and_resample_data(filepath, tf)
        if df is not None and not df.empty:
            df = calculate_indicators(df)
            df.dropna(inplace=True)
            mtf_data[tf] = df
        else:
             pass # Suppressing print: Warning: Missing or empty data for {symbol} on {tf}min.

    # 2. Generate Training Data (Only use the primary timeframe for signaling)
    if PRIMARY_TIMEFRAME not in mtf_data:
        # print(f"Skipping {symbol}: Primary TF ({PRIMARY_TIMEFRAME}min) data is missing.")
        return []

    df_primary = mtf_data[PRIMARY_TIMEFRAME]

    # This function uses df_primary for signals, and mtf_data for lookups
    symbol_dataset = generate_training_data(df_primary, symbol, PRIMARY_TIMEFRAME, mtf_data)

    if not symbol_dataset.empty:
        # print(f"Completed {symbol}: Generated {len(symbol_dataset)} data points.")
        return [symbol_dataset]

    # print(f"Completed {symbol}: Generated 0 data points.")
    return []

# --- Colab and Setup Functions ---

def load_symbols_from_dir(directory):
    """
    Extracts stock symbols from filenames in a directory.
    """
    if not os.path.exists(directory):
        return []

    csv_filenames = [f for f in os.listdir(directory) if f.endswith(".csv")]
    pattern = r"(.*)_minute.csv"
    symbols = []
    for filename in csv_filenames:
        match = re.search(pattern, filename)
        if match:
            symbols.append(match.group(1))
    return symbols

def setup_colab_environment(data_url, data_dir):
    """Downloads and unzips data, checking Google Drive first."""

    # 1. Mount Google Drive
    #print("Attempting to mount Google Drive...")
    #from google.colab import drive
    #try:
    #    drive.mount('/content/drive', force_remount=True)
    #    print("Google Drive mounted successfully.")
    #except Exception as e:
    #    print(f"Error mounting Google Drive: {e}")

    # 2. Check if data exists in Drive (Persistent Storage)
    #if os.path.exists(data_dir) and len(load_symbols_from_dir(data_dir)) > 50:
    #    print(f"\n✅ Data already found in Google Drive: {data_dir}. Skipping download.")
    #    return True # Data is ready

    # 3. Data Not Found - Proceed to Download
    os.makedirs(data_dir, exist_ok=True)
    zip_path = os.path.join(data_dir, 'data_archive.zip')

    # Use 'curl -L' for robust downloading of signed URLs
    print(f"\n⏳ Data not found. Downloading ZIP from URL...")
    os.system(f"curl -L '{data_url}' -o '{zip_path}'")

    if not os.path.exists(zip_path) or os.path.getsize(zip_path) < 1024 * 1024:
        print("\nFATAL ERROR: Download failed or file is too small (likely an expired URL).")
        print("Please obtain a new, fresh download link from the Kaggle dataset page.")
        return False

    if not zipfile.is_zipfile(zip_path):
        print("\nFATAL ERROR: Downloaded file is not a zip file (likely an HTML error page from an expired URL).")
        print("Please obtain a new, fresh download link from the Kaggle dataset page.")
        os.remove(zip_path)
        return False

    print(f"Unzipping data into {data_dir}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)

    os.remove(zip_path)
    print("Download and extraction complete.")
    return True

# --- Main Execution Block ---

def generate_and_train_pipeline(data_url, data_dir, output_dir):
    """
    Full pipeline: Setup, Generation, Training, and Evaluation.
    """
    output_path = os.path.join(output_dir, "training_dataset.csv")
    model_output_path = os.path.join(output_dir, "squeeze_classifier_model.pkl")


    # 1. --- SETUP ENVIRONMENT & DATA DOWNLOAD/GENERATION (COMMENTED OUT FOR FASTER RETRAINING) ---

    if not setup_colab_environment(data_url, data_dir):
        return

    all_symbols = load_symbols_from_dir(data_dir)
    if not all_symbols:
        print(f"FATAL ERROR: No stock data files found in {data_dir}.")
        return
        print(f"\n--- Starting Parallel Data Generation for {len(all_symbols)} symbols ---")
        # Write header once before parallel processing starts
    header_cols = [
        'symbol', 'timeframe', 'date', 'direction', 'rvol', 'bb_width', 'kc_width',
        'rsi', 'adx', 'dist_from_sma_50', 'dist_from_sma_200', 'is_5min_sqz',
        'is_15min_sqz', 'is_30min_sqz', 'is_60min_sqz', 'is_1D_sqz',
        'dist_from_vwap', 'outcome'
    ]
    pd.DataFrame(columns=header_cols).to_csv(output_path, index=False)
    print(f"Created new training dataset file: {output_path}")

    total_data_points = 0
    start_time = datetime.now()
        # Use ProcessPoolExecutor to leverage all available CPU cores
    with ProcessPoolExecutor(max_workers=None) as executor:
        future_to_symbol = {executor.submit(process_single_symbol, symbol, data_dir): symbol for symbol in all_symbols}

        for future in as_completed(future_to_symbol):
            symbol = future_to_symbol[future]
            try:
                results = future.result()
                for df in results:
                    # Append results sequentially to prevent file corruption
                    df.to_csv(output_path, mode='a', header=False, index=False)
                    total_data_points += len(df)
            except Exception as e:
                print(f"!!! Error processing symbol {symbol}: {e}")

    end_time = datetime.now()
    print(f"\n--- Data generation complete in {end_time - start_time} ---")
    print(f"Total data points saved to {output_path}: {total_data_points}")

    # 2. --- MODEL TRAINING and EVALUATION ---

    print("\n--- Starting Model Training and Evaluation ---")

    # # We must remount the drive to ensure the training_dataset.csv is accessible
    # print("Attempting to remount Google Drive to access training data...")
    # try:
    #     from google.colab import drive
    #     drive.mount('/content/drive', force_remount=True)
    #     print("Drive remounted successfully.")
    # except Exception as e:
    #     print(f"Error remounting Google Drive: {e}. Cannot load data.")
    #     return

    # Load the existing generated data
    try:
        data = pd.read_csv(output_path)
    except Exception as e:
        #print(f"FATAL ERROR: Could not load training dataset from Drive at {output_path}. Did you run the generation step previously?")
        print(f"Error: {e}")
        return

    # Drop rows with NaN (which come from MTF lookups where higher TF data is missing)
    data.dropna(inplace=True)

    # Define features (X) and label (y)
    categorical_features = ['direction']
    numeric_features = [
        'rvol', 'bb_width', 'kc_width', 'rsi', 'adx',
        'dist_from_sma_50', 'dist_from_sma_200', 'dist_from_vwap',
        'is_5min_sqz', 'is_15min_sqz', 'is_30min_sqz', 'is_60min_sqz', 'is_1D_sqz'
    ]

    X_features = categorical_features + numeric_features
    X = data[X_features]
    y = data['outcome']

    # Ensure all MTF squeeze columns are treated as integer (0 or 1)
    for col in [c for c in X.columns if c.startswith('is_')]:
        X[col] = X[col].astype(int)

    # Preprocessing Pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Split data using time-based split (Train on earlier data, Test on later data)
    data.sort_values(by='date', inplace=True)
    train_size = int(0.8 * len(data))
    X_train = X.iloc[:train_size]
    X_test = X.iloc[train_size:]
    y_train = y.iloc[:train_size]
    y_test = y.iloc[train_size:]

    print(f"Loaded {len(data)} total data points.")
    print(f"Training set size: {len(X_train)} (80%)")
    print(f"Testing set size: {len(X_test)} (20%)")

    # Calculate the scale_pos_weight for XGBoost (Ratio of negative to positive examples)
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()

    # Aggressively weight the positive class (Win) to boost Recall (CRITICAL for imbalanced data)
    # We are using a multiplier of 2.0 to aggressively force the model to predict 'Win' more often.
    scale_pos_weight = (n_neg / n_pos) * 2.0

    print(f"Positive samples (Win): {n_pos}, Negative samples (Loss): {n_neg}")
    print(f"Calculated scale_pos_weight (Aggressive): {scale_pos_weight:.2f}")

    # Define the Model (XGBoost Classifier) - Parameters tuned for high Recall
    xgb_model = xgb.XGBClassifier(
        objective='binary:logistic',
        n_estimators=300,           # Increased estimators for better fit
        max_depth=4,                # Slightly reduced depth for better generalization
        learning_rate=0.05,         # Slightly reduced learning rate
        use_label_encoder=False,
        eval_metric='logloss',
        # --- Imbalance/Regularization Tuning for high Recall ---
        scale_pos_weight=scale_pos_weight, # AGGRESSIVE WEIGHTING
        min_child_weight=5,                # Regularization to prevent overfitting to noise
        gamma=0.2,                         # Added gamma for more controlled regularization
        tree_method='hist',         # Faster training method
        random_state=42,
        n_jobs=-1                   # Use all CPU cores
    )

    # Full Pipeline: Preprocessor -> Model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', xgb_model)])

    print("\n--- Training XGBoost Classifier (Aggressively Tuned for Recall) ---")
    pipeline.fit(X_train, y_train)
    print("Training complete.")

    # 3. Evaluation
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    print("\n--- Model Evaluation on Test Data ---")

    report = classification_report(y_test, y_pred, target_names=['Loss (0)', 'Win (1)'], digits=4)
    print("Classification Report:")
    print(report)

    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix (Rows=Actual, Cols=Predicted):")
    print("[[True Negatives, False Positives]")
    print(" [False Negatives, True Positives]]")
    print(cm)

    accuracy = (cm[0, 0] + cm[1, 1]) / np.sum(cm)
    win_precision = cm[1, 1] / (cm[1, 1] + cm[0, 1]) if (cm[1, 1] + cm[0, 1]) > 0 else 0
    auc_score = roc_auc_score(y_test, y_proba)

    print(f"\nAccuracy: {accuracy:.4f} (Overall correct predictions)")
    print(f"AUC Score: {auc_score:.4f} (Measure of separability between classes)")
    print(f"\nPrecision (Win Class): {win_precision:.4f}")
    print("--> This is your model's estimated **Win Rate** among the trades it chooses to take.")

    # 4. Save Model
    joblib.dump(pipeline, model_output_path)
    print(f"\nModel saved successfully as '{model_output_path}'")

# Execute the pipeline
if __name__ == "__main__":
    generate_and_train_pipeline(DATA_URL, DATA_DIR, DATA_DIR)



--- Data generation complete in 1:16:56.086540 ---
Total data points saved to /content/NSE_STOCK_DATA/training_dataset.csv: 728268

--- Starting Model Training and Evaluation ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

Loaded 728268 total data points.
Training set size: 582614 (80%)
Testing set size: 145654 (20%)
Positive samples (Win): 182073, Negative samples (Loss): 400541
Calculated scale_pos_weight (Aggressive): 4.40

--- Training XGBoost Classifier (Aggressively Tuned for Recall) ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

--- Model Evaluation on Test Data ---
Classification Report:
              precision    recall  f1-score   support

    Loss (0)     0.8679    0.0090    0.0178    100600
     Win (1)     0.3106    0.9969    0.4736     45054

    accuracy                         0.3146    145654
   macro avg     0.5893    0.5030    0.2457    145654
weighted avg     0.6955    0.3146    0.1588    145654


Confusion Matrix (Rows=Actual, Cols=Predicted):
[[True Negatives, False Positives]
 [False Negatives, True Positives]]
[[  907 99693]
 [  138 44916]]

Accuracy: 0.3146 (Overall correct predictions)
AUC Score: 0.5726 (Measure of separability between classes)

Precision (Win Class): 0.3106
--> This is your model's estimated **Win Rate** among the trades it chooses to take.

Model saved successfully as '/content/NSE_STOCK_DATA/squeeze_classifier_model.pkl'


In [4]:
print("DONEEEEEEEEEEE")

DONEEEEEEEEEEE
