<a href="https://colab.research.google.com/github/Legajo/Colab-Notebooks/blob/main/HW3_Q4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
COMPLETE SOLUTION: Question 3 - Unique Correct Predictions from Decision Tree
HW3 - Stock Prediction Analysis

This is a complete standalone solution that includes:
1. Data loading from Google Drive
2. Data preprocessing
3. Creating pred0-pred4 (if not already done)
4. Training Decision Tree and creating pred5_clf_10
5. Counting unique correct predictions on TEST set
"""

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("QUESTION 3: DECISION TREE UNIQUE PREDICTIONS ANALYSIS")
print("="*70)

# ============================================================================
# STEP 0: LOAD DATA FROM GOOGLE DRIVE
# ============================================================================

print("\nSTEP 0: Loading Data from Google Drive")
print("="*70)

# Mount Google Drive
print("\n✓ Mounting Google Drive...")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("✓ Google Drive mounted")
except:
    print("⚠️  Not in Colab environment or Drive already mounted")

# Install required packages
print("\n✓ Installing packages...")
import subprocess
import sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown", "-q"])

# Check if file exists, if not download it
import os

# Try Google Drive first (permanent storage)
drive_path = "/content/drive/MyDrive/Colab Notebooks/stocks_df_combined_2025_06_13.parquet.brotli"
content_path = "/content/stocks_df_combined_2025_06_13.parquet.brotli"

if os.path.exists(drive_path):
    print(f"\n✓ File found in Google Drive")
    file_path = drive_path
elif os.path.exists(content_path):
    print(f"\n✓ File found in /content/")
    file_path = content_path
else:
    print(f"\n⚠️  File not found. Downloading to /content/...")
    import gdown
    # Download using the file ID from the Google Drive link
    file_id = "12yZljSdCu1rD7pnJ3Mf1BvHzw8oWy7Xr"
    url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(url, content_path, quiet=False)
    file_path = content_path

    if os.path.exists(content_path):
        file_size = os.path.getsize(content_path) / (1024*1024)
        print(f"✓ File downloaded successfully! Size: {file_size:.2f} MB")
    else:
        raise FileNotFoundError("❌ Download failed! Please check your internet connection.")

print(f"\n✓ Using file: {file_path}")

# Load the parquet file
print(f"\nLoading data from: {file_path}")

try:
    df = pd.read_parquet(file_path)
    print(f"✓ Data loaded successfully!")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {len(df.columns)}")
except Exception as e:
    print(f"❌ Error loading file: {e}")
    raise

print(f"\nFirst few columns: {list(df.columns[:10])}")
print(f"\nDataframe info:")
print(df.info())




QUESTION 3: DECISION TREE UNIQUE PREDICTIONS ANALYSIS

STEP 0: Loading Data from Google Drive

✓ Mounting Google Drive...
Mounted at /content/drive
✓ Google Drive mounted

✓ Installing packages...

✓ File found in Google Drive

✓ Using file: /content/drive/MyDrive/Colab Notebooks/stocks_df_combined_2025_06_13.parquet.brotli

Loading data from: /content/drive/MyDrive/Colab Notebooks/stocks_df_combined_2025_06_13.parquet.brotli
✓ Data loaded successfully!
  Shape: (230262, 203)
  Columns: 203

First few columns: ['Open', 'High', 'Low', 'Close_x', 'Volume', 'Dividends', 'Stock Splits', 'Ticker', 'Year', 'Month']

Dataframe info:
<class 'pandas.core.frame.DataFrame'>
Index: 230262 entries, 0 to 5700
Columns: 203 entries, Open to growth_btc_usd_365d
dtypes: datetime64[ns](3), float64(129), int32(64), int64(5), object(2)
memory usage: 302.2+ MB
None


In [3]:

# ============================================================================
# CONFIGURATION: Identify key columns
# ============================================================================

print("\n" + "="*70)
print("CONFIGURATION: Identifying Key Columns")
print("="*70)

# Look for target column (common names for stock prediction)
possible_target_cols = ['is_positive_growth_30d_future', 'label', 'target', 'y',
                        'direction', 'return_direction', 'price_direction', 'up_down', 'class']
TARGET_COL = None

for col in possible_target_cols:
    if col in df.columns:
        TARGET_COL = col
        break

if TARGET_COL is None:
    # Try to find any column with 'future' or 'label' or 'target' in name
    target_candidates = [col for col in df.columns if any(keyword in col.lower()
                        for keyword in ['future', 'label', 'target', 'positive_growth'])]
    if target_candidates:
        TARGET_COL = target_candidates[0]
        print(f"⚠️  Auto-selected target column: {TARGET_COL}")
    else:
        print("❌ Could not automatically detect target column!")
        print(f"Available columns: {df.columns.tolist()}")
        TARGET_COL = input("Please enter the target column name: ")

print(f"✓ Target column identified: {TARGET_COL}")

# Look for split column
possible_split_cols = ['split', 'dataset', 'data_split', 'set_type']
SPLIT_COL = None

for col in possible_split_cols:
    if col in df.columns:
        SPLIT_COL = col
        break

if SPLIT_COL:
    print(f"✓ Split column found: {SPLIT_COL}")
    print(f"  Split values: {df[SPLIT_COL].unique()}")
else:
    print("⚠️  No split column found. Will create train/val/test split...")
    # Create train/val/test split
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    df['split'] = 'train'
    df.loc[val_df.index, 'split'] = 'val'
    df.loc[test_df.index, 'split'] = 'test'

    SPLIT_COL = 'split'
    print(f"✓ Created split column: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")

# ============================================================================
# CHECK FOR EXISTING PREDICTIONS (pred0-pred4) FROM QUESTION 2
# ============================================================================

print("\n" + "="*70)
print("CHECKING FOR EXISTING PREDICTION COLUMNS FROM QUESTION 2")
print("="*70)

pred_cols = ['pred0_manual_cci', 'pred1_manual_prev_g1', 'pred2_manual_prev_g1_and_snp',
             'pred3_manual_dgs10_5', 'pred4_manual_dgs10_fedfunds']
missing_preds = [col for col in pred_cols if col not in df.columns]

if missing_preds:
    print(f"⚠️  Missing prediction columns: {missing_preds}")
    print("\n📋 Creating predictions from Question 2 rules...")

    # Create pred0-pred4 using the exact rules from Question 2
    print("\nCreating pred0_manual_cci: (cci > 200)")
    df['pred0_manual_cci'] = (df['cci'] > 200).astype(int)
    print(f"  ✓ pred0: {df['pred0_manual_cci'].sum()} positive predictions")

    print("\nCreating pred1_manual_prev_g1: (growth_30d > 1)")
    df['pred1_manual_prev_g1'] = (df['growth_30d'] > 1).astype(int)
    print(f"  ✓ pred1: {df['pred1_manual_prev_g1'].sum()} positive predictions")

    print("\nCreating pred2_manual_prev_g1_and_snp: (growth_30d > 1) & (growth_snp500_30d > 1)")
    df['pred2_manual_prev_g1_and_snp'] = ((df['growth_30d'] > 1) & (df['growth_snp500_30d'] > 1)).astype(int)
    print(f"  ✓ pred2: {df['pred2_manual_prev_g1_and_snp'].sum()} positive predictions")

    print("\nCreating pred3_manual_dgs10_5: (DGS10 <= 4) & (DGS5 <= 1)")
    df['pred3_manual_dgs10_5'] = ((df['DGS10'] <= 4) & (df['DGS5'] <= 1)).astype(int)
    print(f"  ✓ pred3: {df['pred3_manual_dgs10_5'].sum()} positive predictions")

    print("\nCreating pred4_manual_dgs10_fedfunds: (DGS10 > 4) & (FEDFUNDS <= 4.795)")
    df['pred4_manual_dgs10_fedfunds'] = ((df['DGS10'] > 4) & (df['FEDFUNDS'] <= 4.795)).astype(int)
    print(f"  ✓ pred4: {df['pred4_manual_dgs10_fedfunds'].sum()} positive predictions")

    print("\n✓ All Question 2 predictions created successfully!")
else:
    print(f"✓ All prediction columns found: {pred_cols}")

# Rename to simpler pred0-pred4 for convenience in later steps
df['pred0'] = df['pred0_manual_cci']
df['pred1'] = df['pred1_manual_prev_g1']
df['pred2'] = df['pred2_manual_prev_g1_and_snp']
df['pred3'] = df['pred3_manual_dgs10_5']
df['pred4'] = df['pred4_manual_dgs10_fedfunds']

print("\n✓ Created simplified column names (pred0-pred4) for analysis")






CONFIGURATION: Identifying Key Columns
✓ Target column identified: is_positive_growth_30d_future
⚠️  No split column found. Will create train/val/test split...
✓ Created split column: train=161183, val=34539, test=34540

CHECKING FOR EXISTING PREDICTION COLUMNS FROM QUESTION 2
⚠️  Missing prediction columns: ['pred0_manual_cci', 'pred1_manual_prev_g1', 'pred2_manual_prev_g1_and_snp', 'pred3_manual_dgs10_5', 'pred4_manual_dgs10_fedfunds']

📋 Creating predictions from Question 2 rules...

Creating pred0_manual_cci: (cci > 200)
  ✓ pred0: 6330 positive predictions

Creating pred1_manual_prev_g1: (growth_30d > 1)
  ✓ pred1: 136985 positive predictions

Creating pred2_manual_prev_g1_and_snp: (growth_30d > 1) & (growth_snp500_30d > 1)
  ✓ pred2: 104969 positive predictions

Creating pred3_manual_dgs10_5: (DGS10 <= 4) & (DGS5 <= 1)
  ✓ pred3: 26238 positive predictions

Creating pred4_manual_dgs10_fedfunds: (DGS10 > 4) & (FEDFUNDS <= 4.795)
  ✓ pred4: 43741 positive predictions

✓ All Questi

In [4]:
# ============================================================================
# STEP 1: PREPARE DATA AND TRAIN DECISION TREE
# ============================================================================

print("\n" + "="*70)
print("STEP 1: TRAIN DECISION TREE AND GENERATE PREDICTIONS")
print("="*70)

# Identify feature columns
exclude_cols = [TARGET_COL, SPLIT_COL] + pred_cols + [col for col in df.columns if 'Unnamed' in col]
# Also exclude Date and any other datetime columns
exclude_cols += ['Date', 'Year', 'Quarter']
feature_cols = [col for col in df.columns if col not in exclude_cols]

# Additional filter: remove any datetime columns
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
feature_cols = [col for col in feature_cols if col not in datetime_cols]

print(f"\nFeature columns: {len(feature_cols)}")
print(f"Sample features: {feature_cols[:5]}")
print(f"Excluded columns: {len(exclude_cols)}")

# Create masks for splits
split_values = df[SPLIT_COL].unique()
print(f"\nSplit values in data: {split_values}")

# Flexible split value matching
train_mask = df[SPLIT_COL].str.lower().isin(['train', 'training']) if df[SPLIT_COL].dtype == 'object' else df[SPLIT_COL] == 'train'
val_mask = df[SPLIT_COL].str.lower().isin(['val', 'validation', 'valid']) if df[SPLIT_COL].dtype == 'object' else df[SPLIT_COL] == 'val'
test_mask = df[SPLIT_COL].str.lower().isin(['test', 'testing']) if df[SPLIT_COL].dtype == 'object' else df[SPLIT_COL] == 'test'

print(f"\nDataset sizes:")
print(f"  Train: {train_mask.sum()}")
print(f"  Val: {val_mask.sum()}")
print(f"  Test: {test_mask.sum()}")

# Combine train and validation
train_val_mask = train_mask | val_mask
print(f"  Train+Val: {train_val_mask.sum()}")

# Prepare features and target
X_train_val = df.loc[train_val_mask, feature_cols].copy()
y_train_val = df.loc[train_val_mask, TARGET_COL].copy()

X_all = df[feature_cols].copy()
y_all = df[TARGET_COL].copy()

print(f"\nInitial shapes:")
print(f"  X_train_val: {X_train_val.shape}")
print(f"  X_all: {X_all.shape}")

# Handle non-numeric columns
print("\nPreprocessing features...")

# Check for any remaining datetime columns
datetime_cols_in_X = X_all.select_dtypes(include=['datetime64']).columns.tolist()
if datetime_cols_in_X:
    print(f"  Removing datetime columns: {datetime_cols_in_X}")
    X_train_val = X_train_val.drop(columns=datetime_cols_in_X)
    X_all = X_all.drop(columns=datetime_cols_in_X)

# Convert categorical columns
for col in X_all.columns:
    if X_all[col].dtype == 'object':
        print(f"  Converting categorical column: {col}")
        X_all[col] = pd.Categorical(X_all[col]).codes
        X_train_val[col] = pd.Categorical(X_train_val[col]).codes

print(f"\nFinal shapes after preprocessing:")
print(f"  X_train_val: {X_train_val.shape}")
print(f"  X_all: {X_all.shape}")

# Handle missing values
if X_train_val.isnull().any().any():
    print("  Handling missing values...")
    X_train_val = X_train_val.fillna(X_train_val.median())
    X_all = X_all.fillna(X_all.median())

# Handle infinite values
X_train_val = X_train_val.replace([np.inf, -np.inf], np.nan).fillna(0)
X_all = X_all.replace([np.inf, -np.inf], np.nan).fillna(0)

print("✓ Preprocessing complete")

# Train Decision Tree
print("\nTraining Decision Tree Classifier...")
print("  Parameters: max_depth=10, random_state=42")

clf_10 = DecisionTreeClassifier(max_depth=10, random_state=42)
clf_10.fit(X_train_val, y_train_val)

print("✓ Model trained successfully!")

# Generate predictions
print("\nGenerating predictions on entire dataset...")
predictions_clf_10 = clf_10.predict(X_all)
df['pred5_clf_10'] = predictions_clf_10

print("✓ Predictions stored in 'pred5_clf_10' column")

# Calculate accuracies
train_val_acc = (df.loc[train_val_mask, 'pred5_clf_10'] == df.loc[train_val_mask, TARGET_COL]).mean()
test_acc = (df.loc[test_mask, 'pred5_clf_10'] == df.loc[test_mask, TARGET_COL]).mean()

print(f"\nModel Performance:")
print(f"  Train+Val Accuracy: {train_val_acc:.4f} ({train_val_acc*100:.2f}%)")
print(f"  Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")



STEP 1: TRAIN DECISION TREE AND GENERATE PREDICTIONS

Feature columns: 203
Sample features: ['Open', 'High', 'Low', 'Close_x', 'Volume']
Excluded columns: 10

Split values in data: ['test' 'val' 'train']

Dataset sizes:
  Train: 4920
  Val: 8450
  Test: 216892
  Train+Val: 13370

Initial shapes:
  X_train_val: (13370, 203)
  X_all: (230262, 203)

Preprocessing features...
  Converting categorical column: Ticker
  Converting categorical column: ticker_type

Final shapes after preprocessing:
  X_train_val: (13370, 203)
  X_all: (230262, 203)
  Handling missing values...
✓ Preprocessing complete

Training Decision Tree Classifier...
  Parameters: max_depth=10, random_state=42
✓ Model trained successfully!

Generating predictions on entire dataset...
✓ Predictions stored in 'pred5_clf_10' column

Model Performance:
  Train+Val Accuracy: 1.0000 (100.00%)
  Test Accuracy: 0.9996 (99.96%)


In [5]:

# ============================================================================
# STEP 2: IDENTIFY UNIQUE CORRECT PREDICTIONS
# ============================================================================

print("\n" + "="*70)
print("STEP 2: IDENTIFY UNIQUE CORRECT PREDICTIONS")
print("="*70)

# Create correctness indicators
df['pred5_correct'] = (df['pred5_clf_10'] == df[TARGET_COL])
df['pred0_incorrect'] = (df['pred0'] != df[TARGET_COL])
df['pred1_incorrect'] = (df['pred1'] != df[TARGET_COL])
df['pred2_incorrect'] = (df['pred2'] != df[TARGET_COL])
df['pred3_incorrect'] = (df['pred3'] != df[TARGET_COL])
df['pred4_incorrect'] = (df['pred4'] != df[TARGET_COL])

print("✓ Created correctness indicator columns")

# Create unique correctness column
df['only_pred5_is_correct'] = (
    df['pred5_correct'] &
    df['pred0_incorrect'] &
    df['pred1_incorrect'] &
    df['pred2_incorrect'] &
    df['pred3_incorrect'] &
    df['pred4_incorrect']
)

print(f"✓ Created 'only_pred5_is_correct' column")
print(f"  Total records (all data) where ONLY pred5 is correct: {df['only_pred5_is_correct'].sum()}")

# ============================================================================
# STEP 3: COUNT UNIQUE CORRECT PREDICTIONS ON TEST SET
# ============================================================================

print("\n" + "="*70)
print("STEP 3: COUNT UNIQUE CORRECT PREDICTIONS ON TEST SET")
print("="*70)

# Convert to integer
df['only_pred5_is_correct_int'] = df['only_pred5_is_correct'].astype(int)

# Filter to TEST set
df_test = df[test_mask].copy()
print(f"\n✓ Filtered to TEST dataset: {len(df_test)} records")

# Count unique correct predictions
unique_correct_count = df_test['only_pred5_is_correct_int'].sum()

# ============================================================================
# DISPLAY FINAL ANSWER
# ============================================================================

print("\n" + "🎯"*35)
print("║" + " "*68 + "║")
print("║" + " "*20 + "FINAL ANSWER - QUESTION 3" + " "*23 + "║")
print("║" + " "*68 + "║")
print("║  Number of TEST records where pred5_clf_10 is uniquely correct:  ║")
print("║  (correct while all pred0-pred4 are incorrect)                   ║")
print("║" + " "*68 + "║")
print(f"║{unique_correct_count:^68}║")
print("║" + " "*68 + "║")
print("🎯"*35)

# ============================================================================
# ADDITIONAL ANALYSIS
# ============================================================================

print("\n" + "="*70)
print("ADDITIONAL ANALYSIS")
print("="*70)

print(f"\n📊 TEST Set Statistics:")
print(f"  Total TEST records: {len(df_test)}")
print(f"  Records where only pred5 is correct: {unique_correct_count}")
print(f"  Percentage: {(unique_correct_count/len(df_test)*100):.2f}%")

print(f"\n📈 Accuracy Comparison on TEST Set:")
for pred_col in ['pred0', 'pred1', 'pred2', 'pred3', 'pred4', 'pred5_clf_10']:
    correct = (df_test[pred_col] == df_test[TARGET_COL]).sum()
    accuracy = correct / len(df_test)
    print(f"  {pred_col:15s}: {accuracy:.4f} ({accuracy*100:5.2f}%) - {correct:4d}/{len(df_test)} correct")

print(f"\n📋 Breakdown on TEST Set:")
print(f"  pred5_clf_10 correct: {df_test['pred5_correct'].sum()}")
print(f"  pred0 incorrect: {df_test['pred0_incorrect'].sum()}")
print(f"  pred1 incorrect: {df_test['pred1_incorrect'].sum()}")
print(f"  pred2 incorrect: {df_test['pred2_incorrect'].sum()}")
print(f"  pred3 incorrect: {df_test['pred3_incorrect'].sum()}")
print(f"  pred4 incorrect: {df_test['pred4_incorrect'].sum()}")

# Sample records
print(f"\n📝 Sample TEST records where ONLY pred5_clf_10 is correct (first 5):")
sample_unique = df_test[df_test['only_pred5_is_correct']].head()
if len(sample_unique) > 0:
    display_cols = [TARGET_COL, 'pred0', 'pred1', 'pred2', 'pred3', 'pred4', 'pred5_clf_10']
    print(sample_unique[display_cols])
else:
    print("  No records found")

print("\n✅ ANALYSIS COMPLETE!")
print("="*70)



STEP 2: IDENTIFY UNIQUE CORRECT PREDICTIONS
✓ Created correctness indicator columns
✓ Created 'only_pred5_is_correct' column
  Total records (all data) where ONLY pred5 is correct: 38847

STEP 3: COUNT UNIQUE CORRECT PREDICTIONS ON TEST SET

✓ Filtered to TEST dataset: 216892 records

🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯
║                                                                    ║
║                    FINAL ANSWER - QUESTION 3                       ║
║                                                                    ║
║  Number of TEST records where pred5_clf_10 is uniquely correct:  ║
║  (correct while all pred0-pred4 are incorrect)                   ║
║                                                                    ║
║                               36583                                ║
║                                                                    ║
🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯

ADDITIONAL ANALYSIS

📊 TEST Set Statistics:
  Total TEST records: 216892
  R

In [7]:
"""
COMPLETE SOLUTION: Question 4 - Hyperparameter Tuning for Decision Tree
HW3 - Stock Prediction Analysis

This solution:
1. Loads data and predictions from previous questions
2. Tunes max_depth from 1 to 20
3. Finds optimal depth based on TEST precision
4. Creates pred6_clf_best with optimized model
5. Compares all predictions (pred0-pred6)
"""

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import precision_score, accuracy_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("QUESTION 4: DECISION TREE HYPERPARAMETER TUNING")
print("="*80)

# ============================================================================
# STEP 0: VERIFY DATA AND PREDICTIONS EXIST
# ============================================================================

print("\nSTEP 0: Verifying Prerequisites")
print("="*80)

# Check if we're continuing from Question 3
if 'df' not in dir() or 'new_df' not in dir():
    print("⚠️  Data not found. Please run Questions 2 and 3 first!")
    print("   This question builds on the previous work.")

QUESTION 4: DECISION TREE HYPERPARAMETER TUNING

STEP 0: Verifying Prerequisites
⚠️  Data not found. Please run Questions 2 and 3 first!
   This question builds on the previous work.
