<a href="https://colab.research.google.com/github/Legajo/Colab-Notebooks/blob/main/HW3_Q3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
SIMPLIFIED SOLUTION: Question 3 - Unique Correct Predictions from Decision Tree
HW3 - Stock Prediction Analysis

This solution loads preprocessed data from Question 2 outputs (HW3_outputs folder)
"""

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("QUESTION 3: DECISION TREE UNIQUE PREDICTIONS ANALYSIS")
print("="*70)

# ============================================================================
# STEP 1: LOAD DATA FROM HW3_OUTPUTS FOLDER
# ============================================================================

print("\nSTEP 1: Loading Data from HW3_outputs folder")
print("="*70)

# Mount Google Drive
print("\n✓ Mounting Google Drive...")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("✓ Google Drive mounted")
except:
    print("⚠️  Not in Colab environment or Drive already mounted")

# Load the preprocessed data from Question 2
input_file = "/content/drive/MyDrive/Colab Notebooks/HW3_outputs/new_df_with_predictions.csv"

print(f"\n✓ Loading data from: {input_file}")
df = pd.read_csv(input_file)

print(f"✓ Data loaded successfully!")
print(f"  Shape: {df.shape}")
print(f"  Columns: {len(df.columns)}")

# ============================================================================
# STEP 2: PREPARE FEATURES AND TRAIN DECISION TREE
# ============================================================================

print("\n" + "="*70)
print("STEP 2: TRAIN DECISION TREE (pred5_clf_10)")
print("="*70)

# Define feature columns (exclude predictions, target, and metadata)
exclude_cols = ['Date', 'Year', 'Quarter', 'Month', 'Weekday', 'Ticker', 'ticker_type',
                'split', 'is_positive_growth_30d_future',
                'pred0_manual_cci', 'pred1_manual_prev_g1', 'pred2_manual_prev_g1_and_snp',
                'pred3_manual_dgs10_5', 'pred4_manual_dgs10_fedfunds',
                'is_correct_pred0', 'is_correct_pred1', 'is_correct_pred2',
                'is_correct_pred3', 'is_correct_pred4']

# Get feature columns
feature_cols = [col for col in df.columns if col not in exclude_cols and
                not col.startswith('Unnamed')]

print(f"\n✓ Feature columns: {len(feature_cols)}")
print(f"  Sample features: {feature_cols[:5]}")

# Prepare train/val and test data
train_val_mask = df['split'].isin(['train', 'validation'])
test_mask = df['split'] == 'test'

print(f"\n✓ Dataset sizes:")
print(f"  Train+Val: {train_val_mask.sum()}")
print(f"  Test: {test_mask.sum()}")

# Prepare X and y
X_train_val = df.loc[train_val_mask, feature_cols].copy()
y_train_val = df.loc[train_val_mask, 'is_positive_growth_30d_future'].copy()

X_all = df[feature_cols].copy()
y_all = df['is_positive_growth_30d_future'].copy()

# Handle missing and infinite values
print("\n✓ Preprocessing features...")
X_train_val = X_train_val.replace([np.inf, -np.inf], np.nan).fillna(0)
X_all = X_all.replace([np.inf, -np.inf], np.nan).fillna(0)

# Train Decision Tree
print("\n✓ Training Decision Tree Classifier (max_depth=10)...")
clf_10 = DecisionTreeClassifier(max_depth=10, random_state=42)
clf_10.fit(X_train_val, y_train_val)

# Generate predictions
predictions_clf_10 = clf_10.predict(X_all)
df['pred5_clf_10'] = predictions_clf_10

print("✓ Model trained and predictions generated!")

# Calculate accuracies
train_val_acc = (df.loc[train_val_mask, 'pred5_clf_10'] == df.loc[train_val_mask, 'is_positive_growth_30d_future']).mean()
test_acc = (df.loc[test_mask, 'pred5_clf_10'] == df.loc[test_mask, 'is_positive_growth_30d_future']).mean()

print(f"\n✓ Model Performance:")
print(f"  Train+Val Accuracy: {train_val_acc:.4f} ({train_val_acc*100:.2f}%)")
print(f"  Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")

# ============================================================================
# STEP 3: IDENTIFY UNIQUE CORRECT PREDICTIONS
# ============================================================================

print("\n" + "="*70)
print("STEP 3: IDENTIFY UNIQUE CORRECT PREDICTIONS")
print("="*70)

# Create correctness indicators
df['pred5_correct'] = (df['pred5_clf_10'] == df['is_positive_growth_30d_future'])
df['pred0_incorrect'] = (df['pred0_manual_cci'] != df['is_positive_growth_30d_future'])
df['pred1_incorrect'] = (df['pred1_manual_prev_g1'] != df['is_positive_growth_30d_future'])
df['pred2_incorrect'] = (df['pred2_manual_prev_g1_and_snp'] != df['is_positive_growth_30d_future'])
df['pred3_incorrect'] = (df['pred3_manual_dgs10_5'] != df['is_positive_growth_30d_future'])
df['pred4_incorrect'] = (df['pred4_manual_dgs10_fedfunds'] != df['is_positive_growth_30d_future'])

print("✓ Created correctness indicator columns")

# Create unique correctness column
df['only_pred5_is_correct'] = (
    df['pred5_correct'] &
    df['pred0_incorrect'] &
    df['pred1_incorrect'] &
    df['pred2_incorrect'] &
    df['pred3_incorrect'] &
    df['pred4_incorrect']
)

print(f"✓ Created 'only_pred5_is_correct' column")

# ============================================================================
# STEP 4: COUNT UNIQUE CORRECT PREDICTIONS ON TEST SET
# ============================================================================

print("\n" + "="*70)
print("STEP 4: COUNT UNIQUE CORRECT PREDICTIONS ON TEST SET")
print("="*70)

# Filter to TEST set
df_test = df[test_mask].copy()
print(f"\n✓ Filtered to TEST dataset: {len(df_test)} records")

# Count unique correct predictions
unique_correct_count = df_test['only_pred5_is_correct'].sum()

# ============================================================================
# DISPLAY FINAL ANSWER
# ============================================================================

print("\n" + "🎯"*35)
print("║" + " "*68 + "║")
print("║" + " "*20 + "FINAL ANSWER - QUESTION 3" + " "*23 + "║")
print("║" + " "*68 + "║")
print("║  Number of TEST records where pred5_clf_10 is uniquely correct:  ║")
print("║  (correct while all pred0-pred4 are incorrect)                   ║")
print("║" + " "*68 + "║")
print(f"║{unique_correct_count:^68}║")
print("║" + " "*68 + "║")
print("🎯"*35)

# ============================================================================
# ADDITIONAL ANALYSIS
# ============================================================================

print("\n" + "="*70)
print("ADDITIONAL ANALYSIS")
print("="*70)

print(f"\n📊 TEST Set Statistics:")
print(f"  Total TEST records: {len(df_test)}")
print(f"  Records where only pred5 is correct: {unique_correct_count}")
print(f"  Percentage: {(unique_correct_count/len(df_test)*100):.2f}%")

print(f"\n📈 Accuracy Comparison on TEST Set:")
pred_cols = ['pred0_manual_cci', 'pred1_manual_prev_g1', 'pred2_manual_prev_g1_and_snp',
             'pred3_manual_dgs10_5', 'pred4_manual_dgs10_fedfunds', 'pred5_clf_10']
for pred_col in pred_cols:
    correct = (df_test[pred_col] == df_test['is_positive_growth_30d_future']).sum()
    accuracy = correct / len(df_test)
    print(f"  {pred_col:30s}: {accuracy:.4f} ({accuracy*100:5.2f}%) - {correct:4d}/{len(df_test)} correct")

print(f"\n📋 Breakdown on TEST Set:")
print(f"  pred5_clf_10 correct: {df_test['pred5_correct'].sum()}")
print(f"  pred0 incorrect: {df_test['pred0_incorrect'].sum()}")
print(f"  pred1 incorrect: {df_test['pred1_incorrect'].sum()}")
print(f"  pred2 incorrect: {df_test['pred2_incorrect'].sum()}")
print(f"  pred3 incorrect: {df_test['pred3_incorrect'].sum()}")
print(f"  pred4 incorrect: {df_test['pred4_incorrect'].sum()}")

# ============================================================================
# STEP 5: SAVE RESULTS TO HW3_OUTPUTS
# ============================================================================

print("\n" + "="*70)
print("STEP 5: SAVING RESULTS")
print("="*70)

output_dir = "/content/drive/MyDrive/Colab Notebooks/HW3_outputs/"

# Save test set with pred5 predictions
output_file_1 = output_dir + "test_set_with_pred5.csv"
df_test.to_csv(output_file_1, index=False)
print(f"\n✓ Saved test set with pred5: {output_file_1}")

# Save final answer
final_answer_df = pd.DataFrame([{
    'Question': 'Question 3',
    'Description': 'Number of TEST records where ONLY pred5_clf_10 is correct',
    'Answer': unique_correct_count
}])
output_file_2 = output_dir + "final_answer_q3.csv"
final_answer_df.to_csv(output_file_2, index=False)
print(f"✓ Saved final answer: {output_file_2}")

print("\n✅ ANALYSIS COMPLETE!")
print("="*70)

QUESTION 3: DECISION TREE UNIQUE PREDICTIONS ANALYSIS

STEP 1: Loading Data from HW3_outputs folder

✓ Mounting Google Drive...
Mounted at /content/drive
✓ Google Drive mounted

✓ Loading data from: /content/drive/MyDrive/Colab Notebooks/HW3_outputs/new_df_with_predictions.csv
✓ Data loaded successfully!
  Shape: (191795, 270)
  Columns: 270

STEP 2: TRAIN DECISION TREE (pred5_clf_10)

✓ Feature columns: 251
  Sample features: ['Open', 'High', 'Low', 'Close_x', 'Volume']

✓ Dataset sizes:
  Train+Val: 160387
  Test: 31408

✓ Preprocessing features...

✓ Training Decision Tree Classifier (max_depth=10)...
✓ Model trained and predictions generated!

✓ Model Performance:
  Train+Val Accuracy: 1.0000 (100.00%)
  Test Accuracy: 1.0000 (100.00%)

STEP 3: IDENTIFY UNIQUE CORRECT PREDICTIONS
✓ Created correctness indicator columns
✓ Created 'only_pred5_is_correct' column

STEP 4: COUNT UNIQUE CORRECT PREDICTIONS ON TEST SET

✓ Filtered to TEST dataset: 31408 records

🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯🎯