# Notebook 2: Stock Data Collection & Model Training with Upstox API

## 1. Imports & Setup

In [1]:
# Standard library imports
import os
import json
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Data manipulation and analysis
import numpy as np
import pandas as pd

# API and HTTP requests
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning - Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit

# Machine Learning - Models
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Deep Learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional, GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

# Time series specific
from prophet import Prophet  # Facebook Prophet for comparison

# Configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("All imports successful!")
print(f"TensorFlow version: {tf.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

2025-11-02 13:43:41.978276: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-02 13:43:43.049739: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-02 13:43:50.573432: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


All imports successful!
TensorFlow version: 2.20.0
Pandas version: 2.3.3
NumPy version: 2.2.6


In [2]:
# API Configuration
UPSTOX_ACCESS_TOKEN = os.getenv("UPSTOX_ACCESS_TOKEN")
BASE_URL = "https://api.upstox.com/v3/historical-candle"

In [3]:
# Simple tests to verify Upstox API access

# Fetch historical stock data

headers = {
    'Accept': 'application/json',
    'Authorization': f'Bearer {UPSTOX_ACCESS_TOKEN}'
    }

print("Fetching historical data for Reliance Industries...\n")

# Reliance Industries ISIN: INE002A01018
stock_url = 'https://api.upstox.com/v3/historical-candle/NSE_EQ%7CINE002A01018/days/1/2025-11-01/2025-10-01'

response = requests.get(stock_url, headers=headers)

if response.status_code == 200:
    print("SUCCESS! Got stock data!")
    data = response.json()
    
    if data['status'] == 'success':
        candles = data['data']['candles']
        print(f"\nReceived {len(candles)} days of data")
        print(f"\nSample data (last 3 days):")
        print(f"{'Date':<12} {'Open':<10} {'High':<10} {'Low':<10} {'Close':<10} {'Volume':<15}")
        print("-" * 70)
        
        for candle in candles[-3:]:
            date, open_p, high, low, close, volume, _ = candle
            print(f"{date[:10]:<12} {open_p:<10.2f} {high:<10.2f} {low:<10.2f} {close:<10.2f} {volume:<15,}")
else:
    print(f"ERROR: {response.status_code}")
    print(response.text)

Fetching historical data for Reliance Industries...

SUCCESS! Got stock data!

Received 21 days of data

Sample data (last 3 days):
Date         Open       High       Low        Close      Volume         
----------------------------------------------------------------------
2025-10-06   1360.00    1377.40    1359.00    1375.00    12,396,580     
2025-10-03   1363.20    1371.60    1356.90    1363.40    12,842,347     
2025-10-01   1367.00    1378.60    1362.70    1368.70    12,045,916     


In [4]:
# NIFTY 100 Stock List (as of November 2025)
# This represents the top 100 companies by market capitalization on NSE

nifty_100_stocks = [
    # NIFTY 50 components (Top 50)
    "RELIANCE",        # Reliance Industries Ltd
    "HDFCBANK",        # HDFC Bank Ltd
    "BHARTIARTL",      # Bharti Airtel Ltd
    "TCS",             # Tata Consultancy Services Ltd
    "ICICIBANK",       # ICICI Bank Ltd
    "SBIN",            # State Bank of India
    "BAJFINANCE",      # Bajaj Finance Ltd
    "INFY",            # Infosys Ltd
    "HINDUNILVR",      # Hindustan Unilever Ltd
    "LT",              # Larsen & Toubro Ltd
    "ITC",             # ITC Ltd
    "MARUTI",          # Maruti Suzuki India Ltd
    "M&M",             # Mahindra & Mahindra Ltd
    "KOTAKBANK",       # Kotak Mahindra Bank Ltd
    "HCLTECH",         # HCL Technologies Ltd
    "SUNPHARMA",       # Sun Pharmaceutical Industries Ltd
    "AXISBANK",        # Axis Bank Ltd
    "ULTRACEMCO",      # UltraTech Cement Ltd
    "ZOMATO",          # Zomato Ltd
    "BAJAJFINSV",      # Bajaj Finserv Ltd
    "NTPC",            # NTPC Ltd
    "LICI",            # Life Insurance Corporation of India
    "TITAN",           # Titan Company Ltd
    "ASIANPAINT",      # Asian Paints Ltd
    "TATAMOTORS",      # Tata Motors Ltd
    "ADANIENT",        # Adani Enterprises Ltd
    "WIPRO",           # Wipro Ltd
    "ONGC",            # Oil and Natural Gas Corporation Ltd
    "HDFCLIFE",        # HDFC Life Insurance Company Ltd
    "JSWSTEEL",        # JSW Steel Ltd
    "POWERGRID",       # Power Grid Corporation of India Ltd
    "ADANIPORTS",      # Adani Ports and Special Economic Zone Ltd
    "COALINDIA",       # Coal India Ltd
    "SBILIFE",         # SBI Life Insurance Company Ltd
    "TECHM",           # Tech Mahindra Ltd
    "TATASTEEL",       # Tata Steel Ltd
    "INDUSINDBK",      # IndusInd Bank Ltd
    "HINDALCO",        # Hindalco Industries Ltd
    "DIVISLAB",        # Divi's Laboratories Ltd
    "NESTLEIND",       # Nestle India Ltd
    "BRITANNIA",       # Britannia Industries Ltd
    "GRASIM",          # Grasim Industries Ltd
    "EICHERMOT",       # Eicher Motors Ltd
    "TRENT",           # Trent Ltd
    "BAJAJ-AUTO",      # Bajaj Auto Ltd
    "TATACONSUM",      # Tata Consumer Products Ltd
    "CIPLA",           # Cipla Ltd
    "BPCL",            # Bharat Petroleum Corporation Ltd
    "APOLLOHOSP",      # Apollo Hospitals Enterprise Ltd
    "DRREDDY",         # Dr. Reddy's Laboratories Ltd
    
    # NIFTY Next 50 components (Next 50 largest)
    "ADANIGREEN",      # Adani Green Energy Ltd
    "SIEMENS",         # Siemens Ltd
    "DLF",             # DLF Ltd
    "PIDILITIND",      # Pidilite Industries Ltd
    "JIOFIN",          # Jio Financial Services Ltd
    "GAIL",            # GAIL (India) Ltd
    "HAL",             # Hindustan Aeronautics Ltd
    "GODREJCP",        # Godrej Consumer Products Ltd
    "ADANIPOWER",      # Adani Power Ltd
    "IOC",             # Indian Oil Corporation Ltd
    "ABB",             # ABB India Ltd
    "VEDL",            # Vedanta Ltd
    "ICICIPRULI",      # ICICI Prudential Life Insurance Company Ltd
    "SHREECEM",        # Shree Cement Ltd
    "IRFC",            # Indian Railway Finance Corporation Ltd
    "CHOLAFIN",        # Cholamandalam Investment and Finance Company Ltd
    "AMBUJACEM",       # Ambuja Cements Ltd
    "SRF",             # SRF Ltd
    "BOSCHLTD",        # Bosch Ltd
    "MCDOWELL-N",      # United Spirits Ltd
    "DABUR",           # Dabur India Ltd
    "HAVELLS",         # Havells India Ltd
    "INDIGO",          # InterGlobe Aviation Ltd
    "BERGEPAINT",      # Berger Paints India Ltd
    "BEL",             # Bharat Electronics Ltd
    "TORNTPHARM",      # Torrent Pharmaceuticals Ltd
    "MOTHERSON",       # Samvardhana Motherson International Ltd
    "LUPIN",           # Lupin Ltd
    "TATAPOWER",       # Tata Power Company Ltd
    "NAUKRI",          # Info Edge (India) Ltd
    "MARICO",          # Marico Ltd
    "CANBK",           # Canara Bank
    "BAJAJHLDNG",      # Bajaj Holdings & Investment Ltd
    "BANKBARODA",      # Bank of Baroda
    "UNIONBANK",       # Union Bank of India
    "PNB",             # Punjab National Bank
    "PGHH",            # Procter & Gamble Hygiene and Health Care Ltd
    "COLPAL",          # Colgate-Palmolive (India) Ltd
    "SBICARD",         # SBI Cards and Payment Services Ltd
    "YESBANK",         # Yes Bank Ltd
    "IDEA",            # Vodafone Idea Ltd
    "HINDZINC",        # Hindustan Zinc Ltd
    "PAGEIND",         # Page Industries Ltd
    "MAXHEALTH",       # Max Healthcare Institute Ltd
    "ICICIGI",         # ICICI Lombard General Insurance Company Ltd
    "IRCTC",           # Indian Railway Catering and Tourism Corporation Ltd
    "ABBOTINDIA",      # Abbott India Ltd
    "GLAXO",           # GlaxoSmithKline Pharmaceuticals Ltd
    "DMART",           # Avenue Supermarts Ltd
    "ALKEM",           # Alkem Laboratories Ltd
]

In [5]:
# Date Range Configuration
END_DATE = datetime.now().strftime('%Y-%m-%d')
START_DATE = (datetime.now() - timedelta(days=730)).strftime('%Y-%m-%d')  # 2 years

# Model Configuration
SEQUENCE_LENGTH = 60  # Number of days to look back
PREDICTION_HORIZON = 7  # Number of days to predict ahead
TRAIN_SPLIT = 0.7
VAL_SPLIT = 0.15
TEST_SPLIT = 0.15

# Directory Structure
DATA_DIR = "data"
RAW_DATA_DIR = f"{DATA_DIR}/raw"
PROCESSED_DATA_DIR = f"{DATA_DIR}/processed"
MODELS_DIR = "models"
STOCK_DATA_DIR = f"{PROCESSED_DATA_DIR}/stock_data"

# Create directories if they don't exist
for directory in [RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, STOCK_DATA_DIR]:
    os.makedirs(directory, exist_ok=True)

print(" Configuration complete!")
print(f"Data collection period: {START_DATE} to {END_DATE}")

 Configuration complete!
Data collection period: 2023-11-03 to 2025-11-02


## 2. Load Stock Lookup Data

In [6]:
# Load stock lookup data
STOCK_LOOKUP_PATH = os.path.join(PROCESSED_DATA_DIR, 'stock_lookup.json')

with open(STOCK_LOOKUP_PATH, 'r') as f:
    stock_lookup = json.load(f)

print(f"Loaded stock lookup data")
print(f"Total stocks in lookup: {len(stock_lookup['by_symbol'])}")
print(f"\nSample stock data (RELIANCE):")
if 'RELIANCE' in stock_lookup['by_symbol']:
    reliance_data = stock_lookup['by_symbol']['RELIANCE']
    print(json.dumps(reliance_data, indent=2))

Loaded stock lookup data
Total stocks in lookup: 2252

Sample stock data (RELIANCE):
{
  "segment": "NSE_EQ",
  "name": "RELIANCE INDUSTRIES LTD",
  "exchange": "NSE",
  "isin": "INE002A01018",
  "instrument_type": "EQ",
  "instrument_key": "NSE_EQ|INE002A01018",
  "lot_size": 1,
  "freeze_quantity": 100000.0,
  "exchange_token": "2885",
  "tick_size": 10.0,
  "trading_symbol": "RELIANCE",
  "short_name": "Reliance",
  "qty_multiplier": 1.0,
  "mtf_enabled": true,
  "mtf_bracket": 26.5,
  "security_type": "NORMAL"
}


In [7]:
# Map NIFTY 100 symbols to ISIN codes
nifty_100_mapping = {}
missing_stocks = []

for symbol in nifty_100_stocks:
    if symbol in stock_lookup['by_symbol']:
        stock_info = stock_lookup['by_symbol'][symbol]
        nifty_100_mapping[symbol] = {
            'isin': stock_info['isin'],
            'name': stock_info['name'],
            'trading_symbol': stock_info['trading_symbol'],
            'instrument_key': stock_info['instrument_key']
        }
    else:
        missing_stocks.append(symbol)

print(f"Successfully mapped {len(nifty_100_mapping)}/{len(nifty_100_stocks)} NIFTY 100 stocks")
print(f"\nSuccessfully mapped stocks: {len(nifty_100_mapping)}")

if missing_stocks:
    print(f"\n Missing stocks ({len(missing_stocks)}):")
    for stock in missing_stocks:
        print(f"  - {stock}")
else:
    print("\n All NIFTY 100 stocks found in lookup!")

# Display sample mappings
print("\n Sample mappings:")
for i, (symbol, info) in enumerate(list(nifty_100_mapping.items())[:5]):
    print(f"{symbol:15} -> {info['isin']:15} ({info['name']})")

Successfully mapped 98/100 NIFTY 100 stocks

Successfully mapped stocks: 98

 Missing stocks (2):
  - ZOMATO
  - MCDOWELL-N

 Sample mappings:
RELIANCE        -> INE002A01018    (RELIANCE INDUSTRIES LTD)
HDFCBANK        -> INE040A01034    (HDFC BANK LTD)
BHARTIARTL      -> INE397D01024    (BHARTI AIRTEL LIMITED)
TCS             -> INE467B01029    (TATA CONSULTANCY SERV LT)
ICICIBANK       -> INE090A01021    (ICICI BANK LTD.)


## 3. API Helper Functions

In [8]:
def create_session_with_retries(retries=3, backoff_factor=0.3):
    """
    Create a requests session with retry logic
    
    Args:
        retries: Number of retry attempts
        backoff_factor: Backoff factor for retries
    
    Returns:
        requests.Session object with retry configuration
    """
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=(500, 502, 504),
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

# Create global session
session = create_session_with_retries()
print(" HTTP session created with retry logic")

 HTTP session created with retry logic


In [9]:
def fetch_historical_data(isin: str, 
                          interval: str = 'days',
                          candle_interval: str = '1',
                          start_date: str = None,
                          end_date: str = None,
                          access_token: str = None) -> Optional[pd.DataFrame]:
    """
    Fetch historical stock data from Upstox API
    
    Args:
        isin: ISIN code of the stock (e.g., 'INE002A01018')
        interval: Time interval ('days', 'weeks', 'months')
        candle_interval: Candle interval ('1', '30', '60', etc.)
        start_date: Start date in YYYY-MM-DD format
        end_date: End date in YYYY-MM-DD format
        access_token: Upstox API access token
    
    Returns:
        DataFrame with columns: Date, Open, High, Low, Close, Volume, OI
        None if request fails
    """
    if access_token is None:
        access_token = UPSTOX_ACCESS_TOKEN
    
    if not access_token:
        print(" Error: UPSTOX_ACCESS_TOKEN not set")
        return None
    
    if start_date is None:
        start_date = START_DATE
    if end_date is None:
        end_date = END_DATE
    
    # Build URL: /v3/historical-candle/{instrument_key}/{interval}/{candle_interval}/{end_date}/{start_date}
    instrument_key = f"NSE_EQ%7C{isin}"  # URL encode the pipe character
    url = f"https://api.upstox.com/v3/historical-candle/{instrument_key}/{interval}/{candle_interval}/{end_date}/{start_date}"
    
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {access_token}'
    }
    
    try:
        response = session.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            data = response.json()
            
            if data.get('status') == 'success' and 'data' in data:
                candles = data['data'].get('candles', [])
                
                if not candles:
                    print(f"  No data returned for ISIN: {isin}")
                    return None
                
                # Convert to DataFrame
                df = pd.DataFrame(candles, columns=['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'OI'])
                
                # Convert date to datetime
                df['Date'] = pd.to_datetime(df['Date'])
                
                # Sort by date ascending
                df = df.sort_values('Date').reset_index(drop=True)
                
                return df
            else:
                print(f" API returned error for {isin}: {data.get('message', 'Unknown error')}")
                return None
        else:
            print(f" HTTP {response.status_code} for {isin}: {response.text[:200]}")
            return None
            
    except Exception as e:
        print(f" Exception while fetching {isin}: {str(e)}")
        return None

print(" fetch_historical_data() function defined")

 fetch_historical_data() function defined


## 4. Data Processing & Storage Functions

In [10]:
def add_technical_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add technical indicators to the dataframe
    
    Args:
        df: DataFrame with OHLCV data
    
    Returns:
        DataFrame with additional technical indicators
    """
    df = df.copy()
    
    # Moving Averages
    df['SMA_5'] = df['Close'].rolling(window=5).mean()
    df['SMA_10'] = df['Close'].rolling(window=10).mean()
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    df['SMA_50'] = df['Close'].rolling(window=50).mean()
    
    # Exponential Moving Averages
    df['EMA_12'] = df['Close'].ewm(span=12, adjust=False).mean()
    df['EMA_26'] = df['Close'].ewm(span=26, adjust=False).mean()
    
    # MACD
    df['MACD'] = df['EMA_12'] - df['EMA_26']
    df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['MACD_Hist'] = df['MACD'] - df['MACD_Signal']
    
    # RSI (Relative Strength Index)
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    
    # Bollinger Bands
    df['BB_Middle'] = df['Close'].rolling(window=20).mean()
    bb_std = df['Close'].rolling(window=20).std()
    df['BB_Upper'] = df['BB_Middle'] + (bb_std * 2)
    df['BB_Lower'] = df['BB_Middle'] - (bb_std * 2)
    
    # Volume indicators
    df['Volume_SMA_20'] = df['Volume'].rolling(window=20).mean()
    df['Volume_Ratio'] = df['Volume'] / df['Volume_SMA_20']
    
    # Price change indicators
    df['Daily_Return'] = df['Close'].pct_change()
    df['Price_Range'] = df['High'] - df['Low']
    df['Price_Change'] = df['Close'] - df['Open']
    
    return df

print(" add_technical_indicators() function defined")

 add_technical_indicators() function defined


In [11]:
def save_stock_data(df: pd.DataFrame, symbol: str, stock_name: str) -> bool:
    """
    Save stock data to CSV file
    
    Args:
        df: DataFrame with stock data
        symbol: Stock symbol (e.g., 'RELIANCE')
        stock_name: Full stock name
    
    Returns:
        True if saved successfully, False otherwise
    """
    try:
        filename = f"{symbol}_historical.csv"
        filepath = os.path.join(STOCK_DATA_DIR, filename)
        
        # Save to CSV
        df.to_csv(filepath, index=False)
        
        return True
    except Exception as e:
        print(f" Error saving {symbol}: {str(e)}")
        return False

def load_stock_data(symbol: str) -> Optional[pd.DataFrame]:
    """
    Load stock data from CSV file
    
    Args:
        symbol: Stock symbol (e.g., 'RELIANCE')
    
    Returns:
        DataFrame with stock data, or None if file doesn't exist
    """
    try:
        filename = f"{symbol}_historical.csv"
        filepath = os.path.join(STOCK_DATA_DIR, filename)
        
        if os.path.exists(filepath):
            df = pd.read_csv(filepath)
            df['Date'] = pd.to_datetime(df['Date'])
            return df
        else:
            return None
    except Exception as e:
        print(f" Error loading {symbol}: {str(e)}")
        return None

print(" save_stock_data() and load_stock_data() functions defined")

 save_stock_data() and load_stock_data() functions defined


## 5. Collect Stock Data for NIFTY 100

In [12]:
from tqdm.notebook import tqdm
import time

# Track collection statistics
collection_stats = {
    'successful': [],
    'failed': [],
    'skipped': [],
    'total_records': 0
}

# Main data collection loop
print(f"Starting data collection for {len(nifty_100_mapping)} NIFTY 100 stocks")
print(f"Date range: {START_DATE} to {END_DATE}")
print("=" * 80)

for symbol, info in tqdm(nifty_100_mapping.items(), desc="Collecting stock data"):
    isin = info['isin']
    name = info['name']
    
    print(f"\n Processing: {symbol} ({name})")
    
    # Fetch historical data
    df = fetch_historical_data(
        isin=isin,
        interval='days',
        candle_interval='1',
        start_date=START_DATE,
        end_date=END_DATE,
        access_token=UPSTOX_ACCESS_TOKEN
    )
    
    if df is not None and len(df) > 0:
        print(f"   Fetched {len(df)} records")
        
        # Add technical indicators
        df = add_technical_indicators(df)
        print(f"   Added technical indicators")
        
        # Add metadata
        df['Symbol'] = symbol
        df['Name'] = name
        df['ISIN'] = isin
        
        # Save to CSV
        if save_stock_data(df, symbol, name):
            print(f"   Saved to {symbol}_historical.csv")
            collection_stats['successful'].append(symbol)
            collection_stats['total_records'] += len(df)
        else:
            collection_stats['failed'].append(symbol)
    else:
        print(f"   Failed to fetch data")
        collection_stats['failed'].append(symbol)
    
    # Rate limiting - be nice to the API
    time.sleep(0.5)

print("\n" + "=" * 80)
print("DATA COLLECTION COMPLETE!")
print("=" * 80)
print(f" Successful: {len(collection_stats['successful'])} stocks")
print(f" Failed: {len(collection_stats['failed'])} stocks")
print(f" Total records collected: {collection_stats['total_records']:,}")

if collection_stats['failed']:
    print(f"\n  Failed stocks:")
    for stock in collection_stats['failed']:
        print(f"   - {stock}")

Starting data collection for 98 NIFTY 100 stocks
Date range: 2023-11-03 to 2025-11-02


Collecting stock data:   0%|          | 0/98 [00:00<?, ?it/s]


 Processing: RELIANCE (RELIANCE INDUSTRIES LTD)
   Fetched 496 records
   Added technical indicators
   Saved to RELIANCE_historical.csv

 Processing: HDFCBANK (HDFC BANK LTD)
   Fetched 496 records
   Added technical indicators
   Saved to HDFCBANK_historical.csv

 Processing: BHARTIARTL (BHARTI AIRTEL LIMITED)
   Fetched 496 records
   Added technical indicators
   Saved to BHARTIARTL_historical.csv

 Processing: TCS (TATA CONSULTANCY SERV LT)
   Fetched 496 records
   Added technical indicators
   Saved to TCS_historical.csv

 Processing: ICICIBANK (ICICI BANK LTD.)
   Fetched 496 records
   Added technical indicators
   Saved to ICICIBANK_historical.csv

 Processing: SBIN (STATE BANK OF INDIA)
   Fetched 496 records
   Added technical indicators
   Saved to SBIN_historical.csv

 Processing: BAJFINANCE (BAJAJ FINANCE LIMITED)
   Fetched 496 records
   Added technical indicators
   Saved to BAJFINANCE_historical.csv

 Processing: INFY (INFOSYS LIMITED)
   Fetched 496 records
   Adde

## 6. Data Inspection & Summary

In [None]:
# List all collected stock data files
import glob

csv_files = glob.glob(os.path.join(STOCK_DATA_DIR, "*_historical.csv"))
print(f"Total CSV files: {len(csv_files)}")
print(f"\nSample files:")
for file in csv_files[:5]:
    filename = os.path.basename(file)
    filesize = os.path.getsize(file) / 1024  # KB
    print(f"  - {filename:<30} ({filesize:.2f} KB)")

In [None]:
# Load and inspect a sample stock (e.g., RELIANCE)
sample_symbol = 'RELIANCE'
sample_df = load_stock_data(sample_symbol)

if sample_df is not None:
    print(f"Stock: {sample_symbol}")
    print(f"Shape: {sample_df.shape}")
    print(f"Date range: {sample_df['Date'].min()} to {sample_df['Date'].max()}")
    print(f"\nColumns ({len(sample_df.columns)}):")
    print(sample_df.columns.tolist())
    print(f"\nFirst 5 rows:")
    display(sample_df.head())
    print(f"\nBasic statistics:")
    display(sample_df[['Open', 'High', 'Low', 'Close', 'Volume']].describe())
else:
    print(f" Could not load {sample_symbol}")

In [None]:
# Visualize sample stock data
if sample_df is not None:
    fig = make_subplots(
        rows=3, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.05,
        subplot_titles=(f'{sample_symbol} Price', 'Volume', 'RSI'),
        row_heights=[0.5, 0.25, 0.25]
    )
    
    # Candlestick chart
    fig.add_trace(
        go.Candlestick(
            x=sample_df['Date'],
            open=sample_df['Open'],
            high=sample_df['High'],
            low=sample_df['Low'],
            close=sample_df['Close'],
            name='Price'
        ),
        row=1, col=1
    )
    
    # Add moving averages
    fig.add_trace(
        go.Scatter(x=sample_df['Date'], y=sample_df['SMA_20'], 
                   name='SMA 20', line=dict(color='orange', width=1)),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(x=sample_df['Date'], y=sample_df['SMA_50'], 
                   name='SMA 50', line=dict(color='blue', width=1)),
        row=1, col=1
    )
    
    # Volume
    fig.add_trace(
        go.Bar(x=sample_df['Date'], y=sample_df['Volume'], name='Volume', 
               marker_color='lightblue'),
        row=2, col=1
    )
    
    # RSI
    fig.add_trace(
        go.Scatter(x=sample_df['Date'], y=sample_df['RSI'], 
                   name='RSI', line=dict(color='purple', width=1)),
        row=3, col=1
    )
    
    # Add RSI levels
    fig.add_hline(y=70, line_dash="dash", line_color="red", row=3, col=1)
    fig.add_hline(y=30, line_dash="dash", line_color="green", row=3, col=1)
    
    fig.update_layout(
        height=800,
        showlegend=True,
        xaxis_rangeslider_visible=False,
        title_text=f"{sample_symbol} - Technical Analysis"
    )
    
    fig.show()
else:
    print("No data available for visualization")

## 7. Create Combined Dataset for ML Training

In [None]:
# Combine all stock data into a single DataFrame (optional, for multi-stock analysis)
def create_combined_dataset(limit: int = None) -> pd.DataFrame:
    """
    Combine all stock data files into a single DataFrame
    
    Args:
        limit: Limit number of stocks to load (None for all)
    
    Returns:
        Combined DataFrame with all stocks
    """
    all_data = []
    
    csv_files = glob.glob(os.path.join(STOCK_DATA_DIR, "*_historical.csv"))
    
    if limit:
        csv_files = csv_files[:limit]
    
    print(f"Loading {len(csv_files)} stock files...")
    
    for file in tqdm(csv_files, desc="Loading files"):
        try:
            df = pd.read_csv(file)
            df['Date'] = pd.to_datetime(df['Date'])
            all_data.append(df)
        except Exception as e:
            print(f"Error loading {file}: {e}")
    
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        print(f"\n Combined dataset shape: {combined_df.shape}")
        print(f"   Date range: {combined_df['Date'].min()} to {combined_df['Date'].max()}")
        print(f"   Unique stocks: {combined_df['Symbol'].nunique()}")
        return combined_df
    else:
        print(" No data loaded")
        return None

# Create combined dataset (load first 10 stocks as example)
# combined_df = create_combined_dataset(limit=10)

print(" create_combined_dataset() function defined")
print("\nTo create combined dataset, run:")
print("  combined_df = create_combined_dataset()  # All stocks")
print("  combined_df = create_combined_dataset(limit=10)  # First 10 stocks")

In [None]:
# Save collection summary
summary_data = {
    'collection_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'date_range': {'start': START_DATE, 'end': END_DATE},
    'total_stocks_attempted': len(nifty_100_mapping),
    'successful_collections': len(collection_stats.get('successful', [])),
    'failed_collections': len(collection_stats.get('failed', [])),
    'total_records': collection_stats.get('total_records', 0),
    'successful_stocks': collection_stats.get('successful', []),
    'failed_stocks': collection_stats.get('failed', [])
}

summary_path = os.path.join(PROCESSED_DATA_DIR, 'collection_summary.json')
with open(summary_path, 'w') as f:
    json.dump(summary_data, f, indent=2)

print(f" Collection summary saved to: {summary_path}")
print("\nSummary:")
print(json.dumps({k: v for k, v in summary_data.items() if k not in ['successful_stocks', 'failed_stocks']}, indent=2))