In [3]:
# Run this cell first to install required packages
# Uncomment the lines below if packages are not installed

!pip install pandas numpy requests matplotlib seaborn scipy
!pip install yfinance 
!pip install ta-lib  

print("✅ Package installation cell ready!")
print("Uncomment the pip install lines above if you need to install packages")

Collecting ta-lib✅ Package installation cell ready!
Uncomment the pip install lines above if you need to install packages

  Downloading ta_lib-0.6.5-cp313-cp313-win_amd64.whl.metadata (24 kB)
Collecting build (from ta-lib)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pyproject_hooks (from build->ta-lib)
  Downloading pyproject_hooks-1.2.0-py3-none-any.whl.metadata (1.3 kB)
Downloading ta_lib-0.6.5-cp313-cp313-win_amd64.whl (882 kB)
   ---------------------------------------- 0.0/882.8 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/882.8 kB ? eta -:--:--
   ----------------------- ---------------- 524.3/882.8 kB 2.2 MB/s eta 0:00:01
   ---------------------------------------- 882.8/882.8 kB 1.9 MB/s eta 0:00:00
Downloading build-1.3.0-py3-none-any.whl (23 kB)
Downloading pyproject_hooks-1.2.0-py3-none-any.whl (10 kB)
Installing collected packages: pyproject_hooks, build, ta-lib

   ---------------------------------------- 3/3 [ta-lib]

Suc

In [8]:
import pandas as pd
import numpy as np
import requests
import zipfile
import os
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# For real data
try:
    import yfinance as yf
    YFINANCE_AVAILABLE = True
    print("✅ yfinance available for real data")
except ImportError:
    YFINANCE_AVAILABLE = False
    print("⚠️ yfinance not available, will use sample data")

# For plotting and analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

# Get current date for data range
TODAY = datetime.now().strftime('%Y-%m-%d')
START_DATE = "2020-01-01"

print(f"✅ All libraries imported successfully!")
print(f"📅 Data range: {START_DATE} to {TODAY}")

✅ yfinance available for real data
✅ All libraries imported successfully!
📅 Data range: 2020-01-01 to 2025-08-09


In [9]:
# Configuration - Update this path to match your system
SYMBOL = "GBPJPY"
DATA_DIR = "C:/Users/enjas/Desktop/Tradingnew/data"
RAW_DIR = os.path.join(DATA_DIR, "raw")
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")

# Create directories
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Data range configuration
START_DATE = "2020-01-01"
END_DATE = datetime.now().strftime('%Y-%m-%d')

print(f"✅ Data directories created:")
print(f"   Raw data: {RAW_DIR}")
print(f"   Processed data: {PROCESSED_DIR}")
print(f"   Working symbol: {SYMBOL}")
print(f"📅 Data range: {START_DATE} to {END_DATE}")
print(f"📊 Total days: {(datetime.now() - datetime.strptime(START_DATE, '%Y-%m-%d')).days}")

✅ Data directories created:
   Raw data: C:/Users/enjas/Desktop/Tradingnew/data\raw
   Processed data: C:/Users/enjas/Desktop/Tradingnew/data\processed
   Working symbol: GBPJPY
📅 Data range: 2020-01-01 to 2025-08-09
📊 Total days: 2047


In [10]:
def download_real_forex_data(symbol="GBPJPY=X", start_date="2020-01-01", end_date=None):
    """Download real forex data using yfinance"""
    if not YFINANCE_AVAILABLE:
        return None
    
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')
    
    try:
        print(f"📡 Downloading real {symbol} data from Yahoo Finance...")
        print(f"📅 Date range: {start_date} to {end_date}")
        
        # Download daily data first
        ticker = yf.Ticker(symbol)
        data = ticker.history(start=start_date, end=end_date, interval="1d")
        
        if data.empty:
            print("❌ No data received from Yahoo Finance")
            return None
        
        # Clean column names and add volume/spread
        data.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
        
        # Add realistic spread data (GBP/JPY typical spread 2-5 pips)
        np.random.seed(42)
        data['Extra_1'] = np.random.uniform(2.0, 5.0, len(data))
        
        # Ensure proper OHLC logic
        data = data[data['High'] >= data[['Open', 'Close']].max(axis=1)]
        data = data[data['Low'] <= data[['Open', 'Close']].min(axis=1)]
        
        print(f"✅ Downloaded {len(data)} days of real data")
        print(f"📊 Date range: {data.index.min()} to {data.index.max()}")
        
        return data
        
    except Exception as e:
        print(f"❌ Error downloading real data: {e}")
        return None

def create_sample_data(timeframe, start_date="2020-01-01", end_date=None):
    """Create realistic sample GBPJPY data until today"""
    
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')
    
    # Frequency mapping
    freq_map = {
        "15m": "15T", "30m": "30T", "1h": "1H", 
        "4h": "4H", "1d": "1D"
    }
    
    # Generate date range
    date_range = pd.date_range(
        start=start_date, end=end_date, 
        freq=freq_map.get(timeframe, "1H")
    )
    
    print(f"   📊 Creating {len(date_range)} {timeframe} candles from {start_date} to {end_date}")
    
    # Generate realistic GBPJPY price movements based on historical patterns
    np.random.seed(42)  # For reproducibility
    
    # Use realistic base price and volatility patterns
    base_price = 150.0  # Starting around current levels
    n_periods = len(date_range)
    
    # Create more realistic price series with:
    # 1. Long-term trends
    # 2. Medium-term cycles  
    # 3. Short-term volatility
    # 4. Regime changes
    
    # Long-term trend (5-year cycle)
    long_trend = np.sin(np.linspace(0, 4*np.pi, n_periods)) * 0.1
    
    # Medium-term cycles (quarterly/monthly)
    medium_cycle = np.sin(np.linspace(0, 20*np.pi, n_periods)) * 0.05
    
    # Volatility clustering (GARCH-like)
    volatility = np.zeros(n_periods)
    volatility[0] = 0.002
    for i in range(1, n_periods):
        volatility[i] = 0.0001 + 0.85 * volatility[i-1] + 0.1 * np.random.normal(0, 0.0005)**2
    
    # Generate returns with varying volatility
    returns = []
    for i in range(n_periods):
        vol = max(volatility[i], 0.0005)  # Minimum volatility
        trend_component = long_trend[i] + medium_cycle[i]
        noise = np.random.normal(0, vol)
        returns.append(trend_component + noise)
    
    # Create price series
    log_prices = np.cumsum(returns)
    prices = base_price * np.exp(log_prices)
    
    # Generate OHLC data with realistic intrabar movements
    data = []
    for i, price in enumerate(prices):
        if i == 0:
            open_price = price
        else:
            open_price = data[i-1]['Close']
        
        # Use current volatility for intrabar range
        current_vol = volatility[i] if i < len(volatility) else 0.002
        range_size = open_price * current_vol * np.random.uniform(0.5, 2.0)
        
        # Generate high/low with bias towards direction
        direction_bias = (price - open_price) / open_price if open_price != 0 else 0
        
        if direction_bias > 0:  # Bullish bias
            high = open_price + range_size * np.random.uniform(0.6, 1.0)
            low = open_price - range_size * np.random.uniform(0.2, 0.6)
        else:  # Bearish bias
            high = open_price + range_size * np.random.uniform(0.2, 0.6)
            low = open_price - range_size * np.random.uniform(0.6, 1.0)
        
        close = price
        
        # Ensure OHLC logic
        high = max(open_price, high, close, low + 0.001)
        low = min(open_price, low, close, high - 0.001)
        
        # Volume patterns (higher during certain hours, news events)
        hour = date_range[i].hour if hasattr(date_range[i], 'hour') else 12
        
        # Volume based on session (London/NY overlap = higher volume)
        if 13 <= hour <= 17:  # London/NY overlap
            base_volume = np.random.randint(3000, 12000)
        elif 8 <= hour <= 17:  # London session
            base_volume = np.random.randint(2000, 8000)
        elif 21 <= hour or hour <= 6:  # Asia session
            base_volume = np.random.randint(1000, 5000)
        else:  # Off hours
            base_volume = np.random.randint(500, 2000)
        
        # Add volatility-based volume boost
        vol_multiplier = 1 + (current_vol / 0.002)
        volume = int(base_volume * vol_multiplier)
        
        # Realistic GBP/JPY spread (2-6 pips, wider during low liquidity)
        if 22 <= hour or hour <= 6:  # Low liquidity hours
            spread = np.random.uniform(3.5, 6.0)
        else:  # Normal hours
            spread = np.random.uniform(2.0, 4.5)
        
        data.append({
            'Open': round(open_price, 3),
            'High': round(high, 3),
            'Low': round(low, 3),
            'Close': round(close, 3),
            'Volume': volume,
            'Extra_1': round(spread, 1)
        })
    
    df = pd.DataFrame(data, index=date_range)
    
    print(f"   ✅ Generated realistic data: {len(df)} rows")
    print(f"   📈 Price range: {df['Close'].min():.3f} - {df['Close'].max():.3f}")
    print(f"   📊 Avg volume: {df['Volume'].mean():.0f}")
    print(f"   💰 Avg spread: {df['Extra_1'].mean():.2f} pips")
    
    return df

def download_forex_data(timeframes=["15m", "30m", "1h"]):
    """Download or create forex data with current date range"""
    print("🚀 Starting data acquisition...")
    print(f"📅 Target date range: {START_DATE} to {END_DATE}")
    
    downloaded_data = {}
    
    # Try to get real daily data first
    real_daily_data = download_real_forex_data("GBPJPY=X", START_DATE, END_DATE)
    
    for tf in timeframes:
        print(f"\n📊 Processing {tf} timeframe...")
        
        if real_daily_data is not None and tf == "1d":
            # Use real daily data
            downloaded_data[tf] = real_daily_data
            print(f"   ✅ Using real daily data: {len(real_daily_data)} rows")
        else:
            # Create sample data for intraday timeframes
            df = create_sample_data(tf, START_DATE, END_DATE)
            downloaded_data[tf] = df
    
    print(f"\n✅ Data acquisition completed for {len(timeframes)} timeframes!")
    return downloaded_data

# Execute data download/creation
print("🚀 Starting enhanced data acquisition process...")
timeframe_data = download_forex_data(["15m", "30m", "1h"])
print("✅ Data acquisition completed!")

# Display summary
for tf, df in timeframe_data.items():
    print(f"📊 {tf}: {len(df)} rows from {df.index.min()} to {df.index.max()}")

🚀 Starting enhanced data acquisition process...
🚀 Starting data acquisition...
📅 Target date range: 2020-01-01 to 2025-08-09
📡 Downloading real GBPJPY=X data from Yahoo Finance...
📅 Date range: 2020-01-01 to 2025-08-09
❌ Error downloading real data: Length mismatch: Expected axis has 7 elements, new values have 5 elements

📊 Processing 15m timeframe...
   📊 Creating 196513 15m candles from 2020-01-01 to 2025-08-09
   ✅ Generated realistic data: 196513 rows
   📈 Price range: 134.250 - inf
   📊 Avg volume: 5417
   💰 Avg spread: 3.81 pips

📊 Processing 30m timeframe...
   📊 Creating 98257 30m candles from 2020-01-01 to 2025-08-09
   ✅ Generated realistic data: 98257 rows
   📈 Price range: 149.697 - inf
   📊 Avg volume: 5410
   💰 Avg spread: 3.81 pips

📊 Processing 1h timeframe...
   📊 Creating 49129 1h candles from 2020-01-01 to 2025-08-09
   ✅ Generated realistic data: 49129 rows
   📈 Price range: 149.054 - inf
   📊 Avg volume: 5425
   💰 Avg spread: 3.81 pips

✅ Data acquisition complete

In [7]:
def create_sample_data(timeframe, start_date="2021-01-01", end_date="2024-12-31"):
    """Create realistic sample GBPJPY data"""
    
    # Frequency mapping
    freq_map = {
        "15m": "15T", "30m": "30T", "1h": "1H", 
        "4h": "4H", "1d": "1D"
    }
    
    # Generate date range
    date_range = pd.date_range(
        start=start_date, end=end_date, 
        freq=freq_map.get(timeframe, "1H")
    )
    
    print(f"   Creating {len(date_range)} {timeframe} candles...")
    
    # Generate realistic GBPJPY price movements
    np.random.seed(42)  # For reproducibility
    base_price = 152.0
    n_periods = len(date_range)
    
    # Create trending + mean-reverting price series
    trend = np.cumsum(np.random.normal(0, 0.0001, n_periods))
    noise = np.random.normal(0, 0.002, n_periods)
    log_returns = trend + noise
    
    prices = base_price * np.exp(np.cumsum(log_returns))
    
    # Generate OHLC data
    data = []
    for i, price in enumerate(prices):
        if i == 0:
            open_price = price
        else:
            open_price = data[i-1]['Close']
        
        # Generate realistic intrabar volatility
        volatility = np.random.uniform(0.0005, 0.003)
        range_size = open_price * volatility
        
        high = open_price + np.random.uniform(0, range_size)
        low = open_price - np.random.uniform(0, range_size)
        close = np.random.uniform(low, high)
        
        # Ensure OHLC logic
        high = max(open_price, high, close)
        low = min(open_price, low, close)
        
        volume = np.random.randint(1000, 8000)
        spread = np.random.uniform(1.5, 4.5)  # Typical GBP/JPY spread
        
        data.append({
            'Open': round(open_price, 3),
            'High': round(max(open_price, high), 3),
            'Low': round(min(open_price, low), 3), 
            'Close': round(close, 3),
            'Volume': volume,
            'Extra_1': round(spread, 1)
        })
    
    df = pd.DataFrame(data, index=date_range)
    return df

def download_forex_data(timeframes=["15m", "30m", "1h"]):
    """Download or create forex data"""
    print("🔄 Starting data acquisition...")
    
    downloaded_data = {}
    
    for tf in timeframes:
        print(f"📊 Processing {tf} timeframe...")
        
        # For this example, we'll create sample data
        # You can replace this with actual download logic if you have access to data sources
        try:
            df = create_sample_data(tf)
            downloaded_data[tf] = df
            print(f"   ✅ Created sample data for {tf}: {len(df)} rows")
        except Exception as e:
            print(f"   ❌ Error creating {tf} data: {e}")
    
    return downloaded_data

# Execute data download/creation
print("🚀 Starting data acquisition process...")
timeframe_data = download_forex_data(["15m", "30m", "1h"])
print("✅ Data acquisition completed!")

🚀 Starting data acquisition process...
🔄 Starting data acquisition...
📊 Processing 15m timeframe...
   Creating 140161 15m candles...
   ✅ Created sample data for 15m: 140161 rows
📊 Processing 30m timeframe...
   Creating 70081 30m candles...
   ✅ Created sample data for 30m: 70081 rows
📊 Processing 1h timeframe...
   Creating 35041 1h candles...
   ✅ Created sample data for 1h: 35041 rows
✅ Data acquisition completed!
