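# Part 1: Data Exploration

This notebook starts by loading the raw stock price data with `StockDataLoader`, then creates the prediction targets and splits the data into train/validation/test sets.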


In [1]:
import matplotlib
import seaborn
import pandas
import sklearn

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 1. Load Data



In [3]:
# Build the project's data loader, pointed at the training CSV under ../data.
# NOTE(review): this import presumably relies on the sys.path entry added in the
# setup cell, which is why it is not in the top import cell -- confirm.
from load_data import StockDataLoader

loader = StockDataLoader(
    data_dir="../data",
    train_file="train.csv",
)

# Read the raw table, then report its dimensions and column names
df = loader.load_data()
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head()  # last expression: rendered as the cell's preview of the first rows


Loading training data...
Reading from: ..\data\train.csv
✓ Loaded 21,033,522 rows
✓ Data loaded: 21,033,522 rows, 5000 unique tickers
  Date range: 1962-01-02 00:00:00 to 2024-09-23 00:00:00
Dataset shape: (21033522, 9)

Columns: ['Ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']


Unnamed: 0,Ticker,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,ticker_1,1962-01-02,0.0,0.265828,0.261788,0.261788,25600.0,0.0,0.0
1,ticker_1,1962-01-03,0.0,0.263808,0.261788,0.261788,28800.0,0.0,0.0
2,ticker_1,1962-01-04,0.0,0.263404,0.26098,0.26098,21600.0,0.0,0.0
3,ticker_1,1962-01-05,0.0,0.259364,0.255324,0.255324,46400.0,0.0,0.0
4,ticker_1,1962-01-08,0.0,0.259364,0.255728,0.256536,29600.0,0.0,0.0


## 2. Process Data and Create Targets


In [4]:
# Run the loader's processing pipeline and unpack the three resulting splits.
# NOTE(review): the recorded output of this cell reports Train at 27.42% and
# Validation at 27.28% of rows although 70/15/15 is requested here -- confirm
# how StockDataLoader.process applies these ratios.
datasets = loader.process(train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)

train_df, val_df, test_df = datasets['train'], datasets['val'], datasets['test']

# Row counts per split
print(f"\nTrain set: {len(train_df):,} rows")
print(f"Val set: {len(val_df):,} rows")
print(f"Test set: {len(test_df):,} rows")

# Share of label 1 ("Higher") and label 0 ("Lower") within the training split
print("\nTarget Distribution (Train):")
target_dist = train_df['target'].value_counts()
n_train = len(train_df)
n_higher = target_dist.get(1, 0)  # .get falls back to 0 when a label is absent
n_lower = target_dist.get(0, 0)
print(f"↑ (Higher): {n_higher:,} ({n_higher/n_train*100:.2f}%)")
print(f"↓ (Lower): {n_lower:,} ({n_lower/n_train*100:.2f}%)")


STOCK DATA LOADING AND PREPROCESSING
Loading training data...
Reading from: ..\data\train.csv
✓ Loaded 21,033,522 rows
✓ Data loaded: 21,033,522 rows, 5000 unique tickers
  Date range: 1962-01-02 00:00:00 to 2024-09-23 00:00:00

Handling missing values...
✓ No missing values found
  Removing 807,778 rows with invalid prices
✓ Removed 807,778 rows with invalid data
✓ Final dataset: 20,225,744 rows, 5000 unique tickers

Creating target labels (prediction horizon: 30 trading days)...
✓ Created target labels
  Removed 150,000 rows without future prices (last 30 days per ticker)
  Final dataset: 20,075,744 rows
  Target distribution:
    ↑ (Higher): 10,848,766 (54.04%)
    ↓ (Lower): 9,226,978 (45.96%)

Splitting data into train/validation/test sets...
  Ratios: Train=70.0%, Val=15.0%, Test=15.0%
✓ Data split completed:
  Train: 5,505,350 rows (27.42%)
    Date range: 1962-01-02 00:00:00 to 2005-10-25 00:00:00
  Validation: 5,475,700 rows (27.28%)
    Date range: 2005-10-26 00:00:00 to 2015