<a href="https://colab.research.google.com/github/Legajo/Colab-Notebooks/blob/main/HW3_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# ============================================================================
# HOMEWORK: Time Series Modeling with Decision Trees
# Based on: [2025]_Module_3_Colab_Time_Series_Modeling.ipynb
# Location: My Drive\Colab-Notebooks
# ============================================================================

# ============================================================================
# STAGE 0: SETUP AND DATA LOADING
# ============================================================================

print("=" * 70)
print("STAGE 0: SETUP AND DATA LOADING")
print("=" * 70)

# Import libraries FIRST (before anything else)
print("\n✓ Importing libraries...")
import pandas as pd
import numpy as np
import datetime
import calendar
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score

print("✓ Libraries imported successfully")

# Mount Google Drive
print("\n✓ Mounting Google Drive...")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("✓ Google Drive mounted")
except:
    print("⚠️  Not in Colab environment or Drive already mounted")

# Install required packages
print("\n✓ Installing packages...")
!pip install gdown -q

# Check if file exists, if not download it
import os

# Try Google Drive first (permanent storage)
drive_path = "/content/drive/MyDrive/Colab-Notebooks/stocks_df_combined_2025_06_13.parquet.brotli"
content_path = "/content/stocks_df_combined_2025_06_13.parquet.brotli"

if os.path.exists(drive_path):
    print(f"\n✓ File found in Google Drive")
    file_path = drive_path
elif os.path.exists(content_path):
    print(f"\n✓ File found in /content/")
    file_path = content_path
else:
    print(f"\n⚠️  File not found. Downloading to /content/...")
    !gdown https://drive.google.com/file/d/1mb0ae2M5AouSDlqcUnIwaHq7avwGNrmB/view?usp=sharing --fuzzy -O /content/
    file_path = content_path

    if os.path.exists(content_path):
        file_size = os.path.getsize(content_path) / (1024*1024)
        print(f"✓ File downloaded successfully! Size: {file_size:.2f} MB")
    else:
        raise FileNotFoundError("❌ Download failed! Please check your internet connection.")

# Load data
print(f"\nLoading data from: {file_path}")
df_full = pd.read_parquet(file_path)
print(f"✓ Loaded df_full: {df_full.shape}")

# Truncate to 25 years (2000+)
df = df_full[df_full.Date >= '2000-01-01'].copy()
print(f"✓ Truncated to df: {df.shape}")

# Define variable lists (from Module 3)
GROWTH = [g for g in df.keys() if (g.find('growth_')==0)&(g.find('future')<0)]
OHLCV = ['Open','High','Low','Close','Adj Close_x','Volume']
CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type']
TO_PREDICT = [g for g in df.keys() if (g.find('future')>=0)]
TO_DROP = ['Year','Date','index_x', 'index_y', 'index', 'Quarter','Adj Close_y'] + CATEGORICAL + OHLCV

# Create custom features
df['ln_volume'] = df.Volume.apply(lambda x: np.log(x) if x > 0 else 0)

CUSTOM_NUMERICAL = ['SMA10', 'SMA20', 'growing_moving_average', 'high_minus_low_relative','volatility', 'ln_volume']

TECHNICAL_INDICATORS = ['adx', 'adxr', 'apo', 'aroon_1','aroon_2', 'aroonosc',
 'bop', 'cci', 'cmo','dx', 'macd', 'macdsignal', 'macdhist', 'macd_ext',
 'macdsignal_ext', 'macdhist_ext', 'macd_fix', 'macdsignal_fix',
 'macdhist_fix', 'mfi', 'minus_di', 'mom', 'plus_di', 'dm', 'ppo',
 'roc', 'rocp', 'rocr', 'rocr100', 'rsi', 'slowk', 'slowd', 'fastk',
 'fastd', 'fastk_rsi', 'fastd_rsi', 'trix', 'ultosc', 'willr',
 'ad', 'adosc', 'obv', 'atr', 'natr', 'ht_dcperiod', 'ht_dcphase',
 'ht_phasor_inphase', 'ht_phasor_quadrature', 'ht_sine_sine', 'ht_sine_leadsine',
 'ht_trendmod', 'avgprice', 'medprice', 'typprice', 'wclprice']

TECHNICAL_PATTERNS = [g for g in df.keys() if g.find('cdl')>=0]

MACRO = ['gdppot_us_yoy', 'gdppot_us_qoq', 'cpi_core_yoy', 'cpi_core_mom', 'FEDFUNDS',
 'DGS1', 'DGS5', 'DGS10']

NUMERICAL = GROWTH + TECHNICAL_INDICATORS + TECHNICAL_PATTERNS + CUSTOM_NUMERICAL + MACRO

print(f"\n✓ Features defined: {len(NUMERICAL)} numerical, {len(CATEGORICAL)} categorical")





STAGE 0: SETUP AND DATA LOADING

✓ Importing libraries...
✓ Libraries imported successfully

✓ Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Google Drive mounted

✓ Installing packages...

✓ File found in /content/

Loading data from: /content/stocks_df_combined_2025_06_13.parquet.brotli
✓ Loaded df_full: (230262, 203)
✓ Truncated to df: (191795, 203)

✓ Features defined: 184 numerical, 4 categorical


In [18]:

# ============================================================================
# STAGE 1: QUESTION 1 - DUMMIES FOR MONTH AND WEEK-OF-MONTH
# ============================================================================

print("\n" + "="*70)
print("STAGE 1: QUESTION 1 - Month and Week-of-Month Dummies")
print("="*70)

# Step 1: Define week_of_month function (CALENDAR-BASED APPROACH)
# Reference: https://stackoverflow.com/questions/3806473/
def week_of_month(tgtdate):
    """
    Calculate week of month based on calendar weeks.
    Finds the first day where (day - weekday) > 0, then calculates week number.
    """
    if isinstance(tgtdate, pd.Timestamp):
        tgtdate = tgtdate.to_pydatetime()

    days_this_month = calendar.monthrange(tgtdate.year, tgtdate.month)[1]

    # Find the first day where (day - weekday) > 0
    for i in range(1, days_this_month + 1):
        d = datetime.datetime(tgtdate.year, tgtdate.month, i)
        if d.day - d.weekday() > 0:
            startdate = d
            break
    else:
        # If no such day found, use first day of month
        startdate = datetime.datetime(tgtdate.year, tgtdate.month, 1)

    # Calculate week number using modulo 7 approach
    return (tgtdate - startdate).days // 7 + 1

# Step 2: Create month_wom feature
df['Date'] = pd.to_datetime(df['Date'])
df.loc[:, 'Month'] = df['Date'].dt.strftime('%B')
df.loc[:, 'Weekday'] = df['Date'].dt.day_name().astype(str)
df['calendar_wom'] = df['Date'].apply(week_of_month)
df['month_wom'] = df['Month'] + '_w' + df['calendar_wom'].astype(str)

print(f"\n✓ Created month_wom feature (calendar-based)")
print(f"  Total unique combinations: {df['month_wom'].nunique()}")
print(f"\n  Sample values:")
print(df['month_wom'].value_counts().head(10))

# Step 3: Update CATEGORICAL list
CATEGORICAL = ['Month', 'Weekday', 'Ticker', 'ticker_type', 'month_wom']
print(f"\n✓ Updated CATEGORICAL: {CATEGORICAL}")

# Step 4: Generate dummy variables
print("\n✓ Generating dummy variables...")
dummy_variables = pd.get_dummies(df[CATEGORICAL], dtype='int32')
DUMMIES = dummy_variables.keys().to_list()

print(f"  Total dummy variables: {len(DUMMIES)}")

# Filter month_wom dummies
month_wom_dummies = [col for col in DUMMIES if col.startswith('month_wom_')]
print(f"  month_wom dummies: {len(month_wom_dummies)}")

# Concatenate dummies with original dataframe
df_with_dummies = pd.concat([df, dummy_variables], axis=1)

# Step 5-8: Calculate correlations and find highest absolute correlation
print("\n✓ Calculating correlations...")
correlation_matrix = df_with_dummies[DUMMIES + ['is_positive_growth_30d_future']].corr()
correlations = correlation_matrix['is_positive_growth_30d_future']

month_wom_correlations = correlations[month_wom_dummies]

# Create dataframe with abs_corr column
month_wom_corr_df = pd.DataFrame({
    'feature': month_wom_correlations.index,
    'correlation': month_wom_correlations.values
})
month_wom_corr_df['abs_corr'] = month_wom_corr_df['correlation'].abs()
month_wom_corr_df = month_wom_corr_df.sort_values('abs_corr', ascending=False)

# Display results
print("\n" + "="*70)
print("Top 10 month_wom dummies by absolute correlation:")
print("="*70)
print(month_wom_corr_df.head(10).to_string(index=False))

max_abs_corr = month_wom_corr_df['abs_corr'].iloc[0]
best_feature = month_wom_corr_df['feature'].iloc[0]

print("\n" + "="*70)
print("🎯 ANSWER FOR QUESTION 1:")
print("="*70)
print(f"Feature: {best_feature}")
print(f"Correlation: {month_wom_corr_df['correlation'].iloc[0]:.6f}")
print(f"\n✅ ABSOLUTE CORRELATION (3 decimals): {max_abs_corr:.3f}")
print("="*70)

# Update new_df for next stages
new_df = df_with_dummies.copy()



STAGE 1: QUESTION 1 - Month and Week-of-Month Dummies


  df.loc[:, 'Month'] = df['Date'].dt.strftime('%B')
  df.loc[:, 'Weekday'] = df['Date'].dt.day_name().astype(str)



✓ Created month_wom feature (calendar-based)
  Total unique combinations: 72

  Sample values:
month_wom
June_w2        3889
May_w2         3874
May_w3         3873
June_w1        3867
February_w1    3863
January_w2     3858
February_w2    3842
May_w1         3836
March_w2       3813
March_w1       3812
Name: count, dtype: int64

✓ Updated CATEGORICAL: ['Month', 'Weekday', 'Ticker', 'ticker_type', 'month_wom']

✓ Generating dummy variables...
  Total dummy variables: 127
  month_wom dummies: 72

✓ Calculating correlations...

Top 10 month_wom dummies by absolute correlation:
               feature  correlation  abs_corr
 month_wom_November_w3     0.024099  0.024099
  month_wom_October_w4     0.022408  0.022408
  month_wom_October_w3     0.020813  0.020813
 month_wom_November_w2     0.019720  0.019720
  month_wom_January_w2    -0.017660  0.017660
  month_wom_January_w4    -0.016240  0.016240
 month_wom_February_w0    -0.015121  0.015121
month_wom_September_w2    -0.014433  0.014433
   

In [19]:
# ============================================================================
# STAGE 2: ADD MORE HANDCRAFTED RULES (TODO 2)
# ============================================================================

print("\n" + "="*70)
print("STAGE 2: Define Handcrafted Prediction Rules")
print("="*70)

# Original manual predictions from Module 3
new_df['pred0_manual_cci'] = (new_df.cci > 200).astype(int)
new_df['pred1_manual_prev_g1'] = (new_df.growth_30d > 1).astype(int)
new_df['pred2_manual_prev_g1_and_snp'] = ((new_df['growth_30d'] > 1) & (new_df['growth_snp500_30d'] > 1)).astype(int)

# TODO 2: Add more handcrafted rules
# Rule 3: RSI oversold (RSI < 30) suggests potential upward movement
new_df['pred3_manual_rsi_oversold'] = (new_df.rsi < 30).astype(int)

# Rule 4: Positive momentum with volume increase
new_df['pred4_manual_mom_volume'] = ((new_df.mom > 0) & (new_df.ln_volume > new_df.ln_volume.rolling(30).mean())).astype(int)

# Rule 5: October/November positive bias (from correlation analysis)
new_df['pred5_manual_oct_nov'] = ((new_df.Month == 'October') | (new_df.Month == 'November')).astype(int)

# Rule 6: Strong uptrend (SMA10 > SMA20 and positive growth)
new_df['pred6_manual_sma_uptrend'] = ((new_df.SMA10 > new_df.SMA20) & (new_df.growth_30d > 0)).astype(int)

# Rule 7: MACD bullish crossover
new_df['pred7_manual_macd_cross'] = (new_df.macd > new_df.macdsignal).astype(int)

PREDICTIONS = [k for k in new_df.keys() if k.startswith('pred')]
print(f"\n✓ Created {len(PREDICTIONS)} prediction rules:")
for pred in PREDICTIONS:
    print(f"  - {pred}")

# Generate is_correct columns for all predictions
for pred in PREDICTIONS:
    part1 = pred.split('_')[0]
    new_df[f'is_correct_{part1}'] = (new_df[pred] == new_df.is_positive_growth_30d_future).astype(int)

IS_CORRECT = [k for k in new_df.keys() if k.startswith('is_correct_')]



STAGE 2: Define Handcrafted Prediction Rules

✓ Created 8 prediction rules:
  - pred0_manual_cci
  - pred1_manual_prev_g1
  - pred2_manual_prev_g1_and_snp
  - pred3_manual_rsi_oversold
  - pred4_manual_mom_volume
  - pred5_manual_oct_nov
  - pred6_manual_sma_uptrend
  - pred7_manual_macd_cross


In [20]:
# ============================================================================
# STAGE 3: TEMPORAL SPLIT (from Module 3)
# ============================================================================

print("\n" + "="*70)
print("STAGE 3: Temporal Train/Validation/Test Split")
print("="*70)

def temporal_split(df, min_date, max_date, train_prop=0.7, val_prop=0.15, test_prop=0.15):
    train_end = min_date + pd.Timedelta(days=(max_date - min_date).days * train_prop)
    val_end = train_end + pd.Timedelta(days=(max_date - min_date).days * val_prop)

    split_labels = []
    for date in df['Date']:
        if date <= train_end:
            split_labels.append('train')
        elif date <= val_end:
            split_labels.append('validation')
        else:
            split_labels.append('test')

    df['split'] = split_labels
    return df

min_date_df = new_df.Date.min()
max_date_df = new_df.Date.max()

new_df = temporal_split(new_df, min_date=min_date_df, max_date=max_date_df)

split_counts = new_df['split'].value_counts()
print(f"\n✓ Split distribution:")
for split in ['train', 'validation', 'test']:
    count = split_counts[split]
    pct = count / len(new_df) * 100
    print(f"  {split}: {count} ({pct:.1f}%)")



STAGE 3: Temporal Train/Validation/Test Split

✓ Split distribution:
  train: 129730 (67.6%)
  validation: 30657 (16.0%)
  test: 31408 (16.4%)


In [21]:
# ============================================================================
# STAGE 4: EVALUATE MANUAL PREDICTIONS ON TEST SET
# ============================================================================

print("\n" + "="*70)
print("STAGE 4: Evaluate Manual Predictions (Test Set)")
print("="*70)

print(f"\n{'Prediction':<35} {'Precision':<15} {'Count (pred=1)'}")
print("-" * 65)

for i, is_correct_col in enumerate(IS_CORRECT):
    prediction_col = PREDICTIONS[i]
    filter_pred = (new_df.split == 'test') & (new_df[prediction_col] == 1)

    if filter_pred.sum() > 0:
        precision = new_df[filter_pred][is_correct_col].mean()
        count = filter_pred.sum()
        print(f"{prediction_col:<35} {precision:.4f}          {count}")
    else:
        print(f"{prediction_col:<35} N/A (no predictions)")


STAGE 4: Evaluate Manual Predictions (Test Set)

Prediction                          Precision       Count (pred=1)
-----------------------------------------------------------------
pred0_manual_cci                    0.5579          794
pred1_manual_prev_g1                0.5418          17991
pred2_manual_prev_g1_and_snp        0.5225          13367
pred3_manual_rsi_oversold           0.5357          911
pred4_manual_mom_volume             0.5517          7401
pred5_manual_oct_nov                0.6762          5518
pred6_manual_sma_uptrend            0.5409          17488
pred7_manual_macd_cross             0.5532          15809


In [24]:
# ============================================================================
# STAGE 5: TRAIN DECISION TREE MODELS (TODO 3)
# ============================================================================

print("\n" + "="*70)
print("STAGE 5: Train Decision Tree Models")
print("="*70)

# Prepare data
features_list = NUMERICAL + DUMMIES
to_predict = 'is_positive_growth_30d_future'

train_df = new_df[new_df.split.isin(['train', 'validation'])].copy(deep=True)
test_df = new_df[new_df.split.isin(['test'])].copy(deep=True)

X_train = train_df[features_list + [to_predict, 'Date', 'Ticker']]
X_test = test_df[features_list + [to_predict, 'Date', 'Ticker']]

# Clean data
pd.options.mode.chained_assignment = None
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

y_train = X_train[to_predict]
y_test = X_test[to_predict]

del X_train[to_predict]
del X_test[to_predict]

print(f"\n✓ X_train: {X_train.shape}")
print(f"✓ X_test: {X_test.shape}")

# Function to train and evaluate decision tree
def train_and_evaluate_tree(X_train, y_train, X_test, y_test, max_depth, name):
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    clf.fit(X_train.drop(['Date', 'Ticker'], axis=1), y_train)

    y_pred = clf.predict(X_test.drop(['Date', 'Ticker'], axis=1))

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    print(f"\n{name} (depth={max_depth}):")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")

    return clf, y_pred, accuracy, precision

# Train models with different depths
print("\n" + "-"*70)
print("Training Decision Trees with different depths:")
print("-"*70)

clf_5, pred_5, acc_5, prec_5 = train_and_evaluate_tree(X_train, y_train, X_test, y_test, 5, "Tree depth 5")
clf_10, pred_10, acc_10, prec_10 = train_and_evaluate_tree(X_train, y_train, X_test, y_test, 10, "Tree depth 10")
clf_20, pred_20, acc_20, prec_20 = train_and_evaluate_tree(X_train, y_train, X_test, y_test, 20, "Tree depth 20")

# Store predictions separately (don't add to X_test to avoid feature mismatch)
test_predictions = {
    'pred_tree_clf5': pred_5,
    'pred_tree_clf10': pred_10,
    'pred_tree_clf20': pred_20
}

print(f"\n✓ Predictions stored for later analysis")



STAGE 5: Train Decision Tree Models

✓ X_train: (160387, 313)
✓ X_test: (31408, 313)

----------------------------------------------------------------------
Training Decision Trees with different depths:
----------------------------------------------------------------------

Tree depth 5 (depth=5):
  Accuracy:  0.5991
  Precision: 0.6278

Tree depth 10 (depth=10):
  Accuracy:  0.5550
  Precision: 0.5871

Tree depth 20 (depth=20):
  Accuracy:  0.5399
  Precision: 0.5828

✓ Predictions stored for later analysis


In [25]:
# ============================================================================
# STAGE 6: TODO 3 - HYPERPARAMETER TUNING
# ============================================================================

print("\n" + "="*70)
print("STAGE 6: Hyperparameter Tuning (Find Best Depth)")
print("="*70)

# Split train into train and validation
train_only_df = new_df[new_df.split == 'train'].copy()
valid_df = new_df[new_df.split == 'validation'].copy()

X_train_only = train_only_df[features_list + [to_predict, 'Date', 'Ticker']]
X_valid = valid_df[features_list + [to_predict, 'Date', 'Ticker']]

X_train_only.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train_only.fillna(0, inplace=True)
X_valid.fillna(0, inplace=True)

y_train_only = X_train_only[to_predict]
y_valid = X_valid[to_predict]

del X_train_only[to_predict]
del X_valid[to_predict]

# Test depths from 1 to 20
depths = range(1, 21)
results = []

print("\nTesting depths 1-20 on validation set:")
print(f"\n{'Depth':<8} {'Accuracy':<12} {'Precision':<12}")
print("-" * 35)

for depth in depths:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_train_only.drop(['Date', 'Ticker'], axis=1), y_train_only)

    y_pred_valid = clf.predict(X_valid.drop(['Date', 'Ticker'], axis=1))

    accuracy = accuracy_score(y_valid, y_pred_valid)
    precision = precision_score(y_valid, y_pred_valid)

    results.append({
        'depth': depth,
        'accuracy': accuracy,
        'precision': precision
    })

    print(f"{depth:<8} {accuracy:<12.4f} {precision:<12.4f}")

results_df = pd.DataFrame(results)

# Find best depth by accuracy
best_idx_acc = results_df['accuracy'].idxmax()
best_depth_acc = results_df.loc[best_idx_acc, 'depth']
best_accuracy = results_df.loc[best_idx_acc, 'accuracy']

# Find best depth by precision
best_idx_prec = results_df['precision'].idxmax()
best_depth_prec = results_df.loc[best_idx_prec, 'depth']
best_precision = results_df.loc[best_idx_prec, 'precision']

print("\n" + "="*70)
print("🎯 HYPERPARAMETER TUNING RESULTS:")
print("="*70)
print(f"Best depth by Accuracy:  {int(best_depth_acc)} (accuracy: {best_accuracy:.4f})")
print(f"Best depth by Precision: {int(best_depth_prec)} (precision: {best_precision:.4f})")
print("="*70)

# Visualize results
fig = go.Figure()
fig.add_trace(go.Scatter(x=results_df['depth'], y=results_df['accuracy'],
                         mode='lines+markers', name='Accuracy'))
fig.add_trace(go.Scatter(x=results_df['depth'], y=results_df['precision'],
                         mode='lines+markers', name='Precision'))
fig.update_layout(title='Decision Tree Performance vs Depth',
                  xaxis_title='Tree Depth',
                  yaxis_title='Score',
                  hovermode='x')
fig.show()

# Train best model on full train+validation and evaluate on test
print(f"\n✓ Training final model with best depth ({int(best_depth_acc)}) on full train+validation...")
clf_best, pred_best, acc_best, prec_best = train_and_evaluate_tree(
    X_train, y_train, X_test, y_test, int(best_depth_acc), f"Best Tree"
)

print("\n✅ All stages completed!")


STAGE 6: Hyperparameter Tuning (Find Best Depth)

Testing depths 1-20 on validation set:

Depth    Accuracy     Precision   
-----------------------------------
1        0.5622       0.6279      
2        0.5622       0.6279      
3        0.5623       0.6276      
4        0.5090       0.6238      
5        0.5248       0.6407      
6        0.5453       0.6431      
7        0.5052       0.6197      
8        0.5155       0.6223      
9        0.5092       0.6180      
10       0.5037       0.6245      
11       0.4942       0.6184      
12       0.5118       0.6375      
13       0.5364       0.6587      
14       0.5341       0.6505      
15       0.5049       0.6391      
16       0.5030       0.6293      
17       0.5025       0.6352      
18       0.5317       0.6455      
19       0.5088       0.6363      
20       0.5033       0.6361      

🎯 HYPERPARAMETER TUNING RESULTS:
Best depth by Accuracy:  3 (accuracy: 0.5623)
Best depth by Precision: 13 (precision: 0.6587)



✓ Training final model with best depth (3) on full train+validation...

Best Tree (depth=3):
  Accuracy:  0.5511
  Precision: 0.5511

✅ All stages completed!
