In [None]:
import os
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# --------------------------- Configuration --------------------------- #
data_dir = "/home/ironman/Desktop/qrt-quant-quest-iit-bombay-2025/"

features_list = [
    'relative_strength_index',
    'williams_r',
    'volatility_20',
    'volatility_60',
    'trend_1_3',
    'trend_5_20',
    'trend_20_60',
    'average_true_range',
    'macd',
    'trix',
    'commodity_channel_index',
    'chande_momentum_oscillator',
    'ichimoku',
    'know_sure_thing',
    'ultimate_oscillator',
    'aroon',
    'stochastic_oscillator',
    'on_balance_volume',
    'ease_of_movement',
    'chaikin_money_flow',
    'accumulation_distribution_index',
    'volume'
]

# --------------------------- Data Loading --------------------------- #
features = pd.read_parquet(os.path.join(data_dir, "features.parquet"))
returns = pd.read_parquet(os.path.join(data_dir, "returns.parquet"))

: 

In [None]:
# --------------------------- Prepare Shifted Features --------------------------- #
shifted_features = {}
for feat in features_list:
    if feat not in features.columns:
        raise ValueError(f"Expected feature '{feat}' not found in features.parquet.")
    shifted_features[feat] = features[feat].shift(1)

# --------------------------- Flatten Data --------------------------- #
def flatten_df(df, feature_name):
    """
    Flatten a DataFrame (index: dates, columns: stocks) into long format.
    Returns a DataFrame with columns: ['date', 'stock', feature_name].
    """
    df_flat = df.stack().reset_index()
    df_flat.columns = ['date', 'stock', feature_name]
    return df_flat

# Flatten each of the 22 shifted features.
dfs = []
for feat in features_list:
    df_feat = flatten_df(shifted_features[feat], feat)
    df_feat['date'] = pd.to_datetime(df_feat['date'])
    dfs.append(df_feat)
    
# Flatten returns (target variable) — note: returns are not shifted.
df_returns = flatten_df(returns, 'return')
df_returns['date'] = pd.to_datetime(df_returns['date'])
dfs.append(df_returns)

# Merge all flattened DataFrames on ['date', 'stock'] using an outer join
df_all = reduce(lambda left, right: pd.merge(left, right, on=['date', 'stock'], how='outer'), dfs)
df_all['date'] = pd.to_datetime(df_all['date'])

print("Combined data shape:", df_all.shape)
# Split data into training (<=2019) and extra (>2019) periods.
df_train = df_all[df_all['date'] <= pd.Timestamp("2019-12-31")]
df_extra = df_all[df_all['date'] > pd.Timestamp("2019-12-31")]

print("Training data shape:", df_train.shape)
print("Extra prediction data shape:", df_extra.shape)

In [None]:
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Determine Optimal Number of PCA Components Using Elbow Curve
# ---------------------------
def determine_pca_components(X_train_scaled, max_components=10):
    """
    Determine the optimal number of PCA components using the elbow method.

    Parameters:
    -----------
    X_train_scaled : np.ndarray
        Scaled training data.
    max_components : int
        Maximum number of PCA components to consider.

    Returns:
    --------
    int
        Optimal number of PCA components.
    """
    pca = PCA(n_components=max_components)
    pca.fit(X_train_scaled)
    explained_variance = np.cumsum(pca.explained_variance_ratio_)

    # Plot the elbow curve
    plt.figure(figsize=(8, 5))
    plt.plot(range(1, max_components + 1), explained_variance, marker='o', linestyle='--')
    plt.title('Explained Variance by Number of PCA Components')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.grid()
    plt.show()

    # Choose the number of components where the curve starts to flatten
    optimal_components = np.argmax(explained_variance >= 0.95) + 1  # 95% variance threshold
    return max(5, min(optimal_components, max_components))  # Ensure it's between 5 and max_components


# ---------------------------
# Per-Stock Modeling and Prediction using DecisionTreeRegressor with PCA and Regularization
# ---------------------------
stocks = df_train['stock'].unique()
y_pred_final_list = []

for stock in stocks:
    # Get training data for the stock and sort by date.
    df_stock_train = df_train[df_train['stock'] == stock].sort_values('date')
    # Get extra data for the stock.
    df_stock_extra = df_extra[df_extra['stock'] == stock].sort_values('date')
    
    # Prepare training predictors and target.
    X_train = df_stock_train[features_list]
    y_train = df_stock_train['return']
    
    # Custom imputation using previous day's data (forward fill).
    X_train = X_train.ffill()  # Forward fill missing values.
    y_train = y_train.ffill()  # Forward fill missing values.
    
    # Replace inf/-inf with NaN and handle remaining NaN values.
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_train = X_train.fillna(0)  # Replace remaining NaN values with 0.
    y_train = y_train.replace([np.inf, -np.inf], np.nan)
    y_train = y_train.fillna(0)  # Replace remaining NaN values with 0.
    
    # Verify that X_train contains no invalid values.
    assert np.isfinite(X_train.values).all(), "X_train contains invalid values!"
    assert np.isfinite(y_train.values).all(), "y_train contains invalid values!"
    
    # Standardize predictors.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    
    # Determine the optimal number of PCA components.
    num_components = determine_pca_components(X_train_scaled, max_components=10)
    print(f"Optimal number of PCA components for stock {stock}: {num_components}")
    
    # Apply PCA for dimensionality reduction.
    pca = PCA(n_components=num_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    
    # Use DecisionTreeRegressor with regularization.
    model = DecisionTreeRegressor(
        max_depth=5,               # Limit the depth of the tree
        min_samples_split=10,      # Minimum samples required to split an internal node
        min_samples_leaf=5,        # Minimum samples required to be at a leaf node
        max_features='sqrt',       # Use a subset of features for splitting
        random_state=42
    )
    model.fit(X_train_pca, y_train)
    
    # Predict on training data.
    y_train_pred = model.predict(X_train_pca)
    df_stock_train = df_stock_train.copy()
    df_stock_train['predicted_return'] = y_train_pred
    
    # Prepare extra predictors for this stock.
    X_extra = df_stock_extra[features_list]
    X_extra = X_extra.ffill()  # Forward fill missing values.
    X_extra = X_extra.replace([np.inf, -np.inf], np.nan)
    X_extra = X_extra.fillna(0)  # Replace remaining NaN values with 0.
    X_extra_scaled = scaler.transform(X_extra)  # Use same scaler as training.
    X_extra_pca = pca.transform(X_extra_scaled)  # Use same PCA as training.
    
    # Predict on extra data.
    y_extra_pred = model.predict(X_extra_pca)
    df_stock_extra = df_stock_extra.copy()
    df_stock_extra['predicted_return'] = y_extra_pred
    
    # Combine training and extra predictions.
    df_stock_all_pred = pd.concat([df_stock_train[['date', 'stock', 'predicted_return']],
                                   df_stock_extra[['date', 'stock', 'predicted_return']]])
    
    y_pred_final_list.append(df_stock_all_pred)

# Concatenate predictions for all stocks.
y_pred_final = pd.concat(y_pred_final_list, ignore_index=True)
y_pred_final['date'] = pd.to_datetime(y_pred_final['date'])
y_pred_final = y_pred_final.sort_values(['date', 'stock']).reset_index(drop=True)

print("y_pred_final shape (all stocks, full period):", y_pred_final.shape)
print(y_pred_final.head())

y_pred_final shape (all stocks, full period): (10960686, 3)
        date  stock  predicted_return
0 2005-01-03      1          0.000643
1 2005-01-03      2          0.000163
2 2005-01-03      3          0.000643
3 2005-01-03      4          0.000262
4 2005-01-03      5          0.000666


In [None]:
# ---------------------------
# Ranking and Universe Filtering
# ---------------------------
y_pred_final['rank'] = y_pred_final.groupby('date')['predicted_return'].rank(ascending=False, method='min')
y_pred_final = y_pred_final.sort_values(['date', 'rank']).reset_index(drop=True)
print("Ranked predictions sample:")
print(y_pred_final.head())

universe = pd.read_parquet(os.path.join(data_dir, "universe.parquet"))
df_universe = universe.stack().reset_index()
df_universe.columns = ['date', 'stock', 'tradable']
df_universe['date'] = pd.to_datetime(df_universe['date'])

# Merge predictions with universe data and filter tradable stocks.
df_predictions_univ = pd.merge(y_pred_final, df_universe, on=['date', 'stock'], how='left')
df_predictions_univ = df_predictions_univ[df_predictions_univ['tradable'] == 1]
df_predictions_univ['rank'] = df_predictions_univ.groupby('date')['predicted_return'] \
                                                 .rank(ascending=False, method='min')
df_predictions_univ = df_predictions_univ.sort_values(['date', 'rank'])
print(df_predictions_univ.head())

Ranked predictions sample:
        date  stock  predicted_return  rank
0 2005-01-03    296          0.008871   1.0
1 2005-01-03    222          0.004981   2.0
2 2005-01-03    616          0.004169   3.0
3 2005-01-03    191          0.003556   4.0
4 2005-01-03   1269          0.003273   5.0
            date  stock  predicted_return  rank  tradable
43344 2005-02-01   1699          0.040580   1.0         1
43346 2005-02-01   1676          0.027538   2.0         1
43347 2005-02-01   1556          0.017221   3.0         1
43348 2005-02-01   1719          0.013154   4.0         1
43351 2005-02-01   1788          0.009610   5.0         1


In [None]:
def capture_positions_by_rank(df_ranked):
    """
    For each trading day, select the top 5 and bottom 5 stocks (if available)
    from the ranked predictions (df_ranked, which must have a 'predicted_return' column),
    and assign weights:
        - Top 5: +0.1 each
        - Bottom 5: -0.1 each
        - Others: 0
    This guarantees:
        - Maximum individual weight: 0.1
        - If at least 10 stocks are tradable, then unit capital = sum(|w|) = 1 
          (5×0.1 + 5×0.1) and dollar neutrality (net sum = 0).
    If fewer than 10 stocks are tradable, positions are assigned at ±0.1
    and unit capital will be less than 1.
    """
    def process_day(day_df):
        # Assume day_df already contains only tradable stocks for that day.
        day_df = day_df.copy().sort_values(by='predicted_return', ascending=False)
        n = len(day_df)
        if n >= 10:
            n_long = 5
            n_short = 5
        else:
            # Split available stocks as evenly as possible between long and short.
            n_long = n // 2
            n_short = n - n_long
        # Create a weight column: default is 0.
        day_df['weight'] = 0.0
        
        # Assign +0.1 to top n_long stocks.
        if n_long > 0:
            idx_long = day_df.index[:n_long]
            day_df.loc[idx_long, 'weight'] = 0.1
        # Assign -0.1 to bottom n_short stocks.
        if n_short > 0:
            idx_short = day_df.index[-n_short:]
            day_df.loc[idx_short, 'weight'] = -0.1
        
        # (Optional) If exactly 10 stocks, then unit capital = 1 and net sum = 0.
        # If fewer stocks, unit capital will be sum(|w|), which will be < 1.
        return day_df

    df_positions = df_ranked.groupby('date').apply(process_day).reset_index(drop=True)
    return df_positions
#     ('max_abs_weight', lambda x: x.abs().max()),
#     ('sum_abs_weight', lambda x: x.abs().sum())
# ])
# print(daily_checks.sort_values('weight_sum', ascending=False).head(20))


In [None]:
df_positions = capture_positions_by_rank(df_predictions_univ)

# Compute daily checks: 
#   - weight_sum: net sum of weights (should be near zero)
#   - max_abs_weight: maximum absolute weight (should be <= 0.1)
#   - sum_abs_weight: sum of absolute weights (should be 1)
daily_checks = df_positions.groupby('date')['weight'].agg([
    ('weight_sum', lambda x: np.sum(x)),
    ('max_abs_weight', lambda x: np.max(np.abs(x))),
    ('sum_abs_weight', lambda x: np.sum(np.abs(x)))
])
print(daily_checks.head())

  df_positions = df_ranked.groupby('date').apply(process_day).reset_index(drop=True)


            weight_sum  max_abs_weight  sum_abs_weight
date                                                  
2005-02-01         0.0             0.1             1.0
2005-02-02         0.0             0.1             1.0
2005-02-03         0.0             0.1             1.0
2005-02-04         0.0             0.1             1.0
2005-02-07         0.0             0.1             1.0


In [None]:
df_positions

Unnamed: 0,date,stock,predicted_return,rank,tradable,weight
0,2005-02-01,1699,0.040580,1.0,1,0.1
1,2005-02-01,1676,0.027538,2.0,1,0.1
2,2005-02-01,1556,0.017221,3.0,1,0.1
3,2005-02-01,1719,0.013154,4.0,1,0.1
4,2005-02-01,1788,0.009610,5.0,1,0.1
...,...,...,...,...,...,...
5037995,2025-02-07,439,-0.215124,996.0,1,-0.1
5037996,2025-02-07,1980,-0.256944,997.0,1,-0.1
5037997,2025-02-07,134,-0.264694,998.0,1,-0.1
5037998,2025-02-07,847,-0.272258,999.0,1,-0.1


In [None]:
# ---------------------------
# Portfolio Metrics and Backtesting
# ---------------------------
def calculate_metrics(df_positions, returns):
    """
    Calculates daily portfolio metrics: BookValue, Traded, GrossPnL, and NetPnL.
    """
    # Pivot df_positions so that each date is a row and each stock is a column.
    pivot_weights = df_positions.pivot(index='date', columns='stock', values='weight').fillna(0.0)
    
    # Ensure alignment with returns.
    common_dates = pivot_weights.index.intersection(returns.index)
    pivot_weights = pivot_weights.loc[common_dates].sort_index()
    returns = returns.loc[common_dates].sort_index()
    
    # BookValue: sum of absolute weights.
    book_value_series = pivot_weights.abs().sum(axis=1)
    
    # Traded: sum of absolute differences in weights from previous day.
    shifted_weights = pivot_weights.shift(1, fill_value=0.0)
    traded_series = (pivot_weights - shifted_weights).abs().sum(axis=1)
    
    # GrossPnL: weighted sum of returns.
    gross_pnl_series = (pivot_weights * returns).sum(axis=1)
    
    # NetPnL: subtract trading costs (0.01% per traded amount).
    net_pnl_series = gross_pnl_series - (0.0001 * traded_series)
    
    metrics_df = pd.DataFrame({
        'BookValue': book_value_series,
        'Traded': traded_series,
        'GrossPnL': gross_pnl_series,
        'NetPnL': net_pnl_series
    })
    
    # Compute overall turnover.
    total_traded = traded_series.sum()
    total_book_value = book_value_series.sum()
    turnover = (total_traded / total_book_value) * 100.0 if total_book_value != 0 else 0.0
    
    return metrics_df, turnover

metrics_df, turnover = calculate_metrics(df_positions, returns)
print(metrics_df.head())
print("Overall Turnover (%):", turnover)

            BookValue  Traded  GrossPnL    NetPnL
2005-02-01        1.0     1.0  0.022969  0.022869
2005-02-02        1.0     1.4 -0.000591 -0.000731
2005-02-03        1.0     1.4  0.018042  0.017902
2005-02-04        1.0     1.6  0.024100  0.023940
2005-02-07        1.0     1.4  0.023624  0.023484
Overall Turnover (%): 142.05592543275637


In [None]:
def calculate_sharpes(metrics_df):
    """
    Calculates Annualized Gross and Net Sharpe Ratios.
    """
    gross_pnl = metrics_df['GrossPnL']
    net_pnl   = metrics_df['NetPnL']
    ann_factor = np.sqrt(252)
    
    gross_mean = gross_pnl.mean()
    gross_std  = gross_pnl.std(ddof=1)
    net_mean   = net_pnl.mean()
    net_std    = net_pnl.std(ddof=1)
    
    gross_sharpe = ann_factor * (gross_mean / gross_std) if gross_std != 0 else np.nan
    net_sharpe   = ann_factor * (net_mean / net_std) if net_std != 0 else np.nan
    
    return gross_sharpe, net_sharpe

gross_sharpe, net_sharpe = calculate_sharpes(metrics_df)
print(f"Annualized Gross Sharpe Ratio: {gross_sharpe:.4f}")
print(f"Annualized Net Sharpe Ratio: {net_sharpe:.4f}")

Annualized Gross Sharpe Ratio: 12.3816
Annualized Net Sharpe Ratio: 12.3205


In [None]:
# ---------------------------
# Prepare Final Submission Weights
# ---------------------------
# Get the full list of stocks from one of the feature DataFrames.
all_stocks = features[features_list[0]].columns.tolist()
# Pivot positions to create a DataFrame with dates as rows and stocks as columns.
df_weights = df_positions.pivot(index='date', columns='stock', values='weight')
# Reindex to include all stocks (fill missing with zero).
df_weights = df_weights.reindex(columns=all_stocks, fill_value=0).fillna(0)
print("Submission weights shape:", df_weights.shape)

Submission weights shape: (5038, 2167)


In [None]:
training_dates = sorted(df_train['date'].unique())[:20]

# Create a DataFrame with zero weights for these 20 dates and with the same columns as df_weights.
df_zeros = pd.DataFrame(0.0, index=training_dates, columns=df_weights.columns)

# Concatenate the zero-weight DataFrame with the original df_weights.
# This places the 20 zero-weight days before the existing dates.
df_weights = pd.concat([df_zeros, df_weights])

# Sort the DataFrame by date to ensure proper chronological order.
df_weights = df_weights.sort_index()

df_weights.index.name = "Date"

In [None]:
df_weights.iloc[:2700, :] = 0

In [None]:
# Optionally, if required, save the submission file.
df_weights.to_csv("submission.csv")

In [None]:
df_weights

stock,1,2,3,4,5,6,7,8,9,10,...,2158,2159,2160,2161,2162,2163,2164,2165,2166,2167
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-02-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-02-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-02-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
