In [15]:
import os
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# --------------------------- Configuration --------------------------- #
data_dir = "/kaggle/input/qrt-quant-quest-iit-bombay-2025/"

features_list = [
    'relative_strength_index',
    'williams_r',
    'volatility_20',
    'volatility_60',
    'trend_1_3',
    'trend_5_20',
    'trend_20_60',
    'average_true_range',
    'macd',
    'trix',
    'commodity_channel_index',
    'chande_momentum_oscillator',
    'ichimoku',
    'know_sure_thing',
    'ultimate_oscillator',
    'aroon',
    'stochastic_oscillator',
    'on_balance_volume',
    'ease_of_movement',
    'chaikin_money_flow',
    'accumulation_distribution_index',
    'volume'
]

# --------------------------- Data Loading --------------------------- #
features = pd.read_parquet(os.path.join(data_dir, "features.parquet"))
returns = pd.read_parquet(os.path.join(data_dir, "returns.parquet"))

In [16]:
# --------------------------- Prepare Shifted Features --------------------------- #
shifted_features = {}
for feat in features_list:
    if feat not in features.columns:
        raise ValueError(f"Expected feature '{feat}' not found in features.parquet.")
    shifted_features[feat] = features[feat].shift(1)

# --------------------------- Flatten Data --------------------------- #
def flatten_df(df, feature_name):
    """
    Flatten a DataFrame (index: dates, columns: stocks) into long format.
    Returns a DataFrame with columns: ['date', 'stock', feature_name].
    """
    df_flat = df.stack().reset_index()
    df_flat.columns = ['date', 'stock', feature_name]
    return df_flat

# Flatten each of the 22 shifted features.
dfs = []
for feat in features_list:
    df_feat = flatten_df(shifted_features[feat], feat)
    df_feat['date'] = pd.to_datetime(df_feat['date'])
    dfs.append(df_feat)
    
# Flatten returns (target variable) — note: returns are not shifted.
df_returns = flatten_df(returns, 'return')
df_returns['date'] = pd.to_datetime(df_returns['date'])
dfs.append(df_returns)

# Merge all flattened DataFrames on ['date', 'stock'] using an outer join
df_all = reduce(lambda left, right: pd.merge(left, right, on=['date', 'stock'], how='outer'), dfs)
df_all['date'] = pd.to_datetime(df_all['date'])

print("Combined data shape:", df_all.shape)
# Split data into training (<=2019) and extra (>2019) periods.
df_train = df_all[df_all['date'] <= pd.Timestamp("2019-12-31")]
df_extra = df_all[df_all['date'] > pd.Timestamp("2019-12-31")]

print("Training data shape:", df_train.shape)
print("Extra prediction data shape:", df_extra.shape)

Combined data shape: (10960686, 25)
Training data shape: (8180425, 25)
Extra prediction data shape: (2780261, 25)


In [165]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# ---------------------------
# Define the Features List
# ---------------------------
features_list = [
    'chaikin_money_flow',          # Top feature based on importance
    'commodity_channel_index',     # Measures deviation from average price
    'volatility_20',               # Short-term volatility
    'know_sure_thing',             # Momentum oscillator
    'trend_1_3',                   # Short-term trend strength
    'williams_r'                   # Momentum indicator
]

# ---------------------------
# Set Fixed Number of PCA Components
# ---------------------------
NUM_COMPONENTS = 6  # Fixed number of PCA components for all stocks

# ---------------------------
# Per-Stock Modeling and Prediction using LinearRegression with PCA
# ---------------------------
stocks = df_train['stock'].unique()
y_pred_final_list = []

for stock in stocks:
    # Get training data for the stock and sort by date.
    df_stock_train = df_train[df_train['stock'] == stock].sort_values('date')
    # Get extra data for the stock.
    df_stock_extra = df_extra[df_extra['stock'] == stock].sort_values('date')
    
    # Prepare training predictors and target.
    X_train = df_stock_train[features_list]
    y_train = df_stock_train['return']
    
    # Debugging: Check if features exist in the data
    missing_features = [col for col in features_list if col not in X_train.columns]
    if missing_features:
        print(f"Missing features for stock {stock}: {missing_features}")
        continue  # Skip this stock if features are missing
    
    # Custom imputation using previous day's data (forward fill).
    X_train = X_train.ffill()  # Forward fill missing values.
    y_train = y_train.ffill()  # Forward fill missing values.
    
    # Replace inf/-inf with NaN and handle remaining NaN values.
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_train = X_train.fillna(0)  # Replace remaining NaN values with 0.
    y_train = y_train.replace([np.inf, -np.inf], np.nan)
    y_train = y_train.fillna(0)  # Replace remaining NaN values with 0.
    
    # Remove constant columns (zero variance).
    X_train = X_train.loc[:, (X_train != X_train.iloc[0]).any()]  # Keep only non-constant columns.
    
    # Debugging: Check if X_train is empty
    if X_train.empty:
        # print(f"X_train is empty for stock {stock}. Skipping...")
        continue
    
    # Verify that X_train contains no invalid values.
    assert np.isfinite(X_train.values).all(), f"X_train contains invalid values for stock {stock}!"
    assert np.isfinite(y_train.values).all(), f"y_train contains invalid values for stock {stock}!"
    
    # Standardize predictors.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    
    # Apply PCA for dimensionality reduction with a fixed number of components.
    pca = PCA(n_components=min(NUM_COMPONENTS, X_train_scaled.shape[1]))  # Ensure n_components <= number of features
    X_train_pca = pca.fit_transform(X_train_scaled)
    
    # Use LinearRegression.
    model = LinearRegression()
    model.fit(X_train_pca, y_train)
    
    # Predict on training data.
    y_train_pred = model.predict(X_train_pca)
    df_stock_train = df_stock_train.copy()
    df_stock_train['predicted_return'] = y_train_pred
    
    # Prepare extra predictors for this stock.
    X_extra = df_stock_extra[features_list]
    X_extra = X_extra.ffill()  # Forward fill missing values.
    X_extra = X_extra.replace([np.inf, -np.inf], np.nan)
    X_extra = X_extra.fillna(0)  # Replace remaining NaN values with 0.
    
    # Align columns of X_extra with X_train
    X_extra = X_extra[X_train.columns]  # Ensure X_extra has the same columns as X_train
    
    # Standardize extra predictors using the same scaler as training.
    X_extra_scaled = scaler.transform(X_extra)
    X_extra_pca = pca.transform(X_extra_scaled)  # Use same PCA as training.
    
    # Predict on extra data.
    y_extra_pred = model.predict(X_extra_pca)
    df_stock_extra = df_stock_extra.copy()
    df_stock_extra['predicted_return'] = y_extra_pred
    
    # Combine training and extra predictions.
    df_stock_all_pred = pd.concat([df_stock_train[['date', 'stock', 'predicted_return']],
                                   df_stock_extra[['date', 'stock', 'predicted_return']]])
    
    y_pred_final_list.append(df_stock_all_pred)

# Concatenate predictions for all stocks.
y_pred_final = pd.concat(y_pred_final_list, ignore_index=True)
y_pred_final['date'] = pd.to_datetime(y_pred_final['date'])
y_pred_final = y_pred_final.sort_values(['date', 'stock']).reset_index(drop=True)

print("y_pred_final shape (all stocks, full period):", y_pred_final.shape)
print(y_pred_final.head())

y_pred_final shape (all stocks, full period): (10277856, 3)
        date  stock  predicted_return
0 2005-01-03      1          0.000265
1 2005-01-03      2         -0.000875
2 2005-01-03      3          0.001375
3 2005-01-03      4         -0.001547
4 2005-01-03      5         -0.001256


In [166]:
# ---------------------------
# Ranking and Universe Filtering
# ---------------------------
y_pred_final['rank'] = y_pred_final.groupby('date')['predicted_return'].rank(ascending=False, method='min')
y_pred_final = y_pred_final.sort_values(['date', 'rank']).reset_index(drop=True)
print("Ranked predictions sample:")
print(y_pred_final.head())

universe = pd.read_parquet(os.path.join(data_dir, "universe.parquet"))
df_universe = universe.stack().reset_index()
df_universe.columns = ['date', 'stock', 'tradable']
df_universe['date'] = pd.to_datetime(df_universe['date'])

# Merge predictions with universe data and filter tradable stocks.
df_predictions_univ = pd.merge(y_pred_final, df_universe, on=['date', 'stock'], how='left')
df_predictions_univ = df_predictions_univ[df_predictions_univ['tradable'] == 1]
df_predictions_univ['rank'] = df_predictions_univ.groupby('date')['predicted_return'] \
                                                 .rank(ascending=False, method='min')
df_predictions_univ = df_predictions_univ.sort_values(['date', 'rank'])
print(df_predictions_univ.head())

Ranked predictions sample:
        date  stock  predicted_return  rank
0 2005-01-03    161          0.994102   1.0
1 2005-01-03    216          0.023211   2.0
2 2005-01-03   2068          0.014660   3.0
3 2005-01-03    383          0.007389   4.0
4 2005-01-03    625          0.005549   5.0
            date  stock  predicted_return  rank  tradable
40640 2005-02-01    161          1.964839   1.0         1
40643 2005-02-01   1815          0.019317   2.0         1
40654 2005-02-01    312          0.006122   3.0         1
40655 2005-02-01   1983          0.006074   4.0         1
40657 2005-02-01   1077          0.005576   5.0         1


In [167]:
def capture_positions_by_rank(df_ranked):
    """
    For each trading day, select the top 5 and bottom 5 stocks (if available)
    from the ranked predictions (df_ranked, which must have a 'predicted_return' column),
    and assign weights:
        - Top 5: +0.1 each
        - Bottom 5: -0.1 each
        - Others: 0
    This guarantees:
        - Maximum individual weight: 0.1
        - If at least 10 stocks are tradable, then unit capital = sum(|w|) = 1 
          (5×0.1 + 5×0.1) and dollar neutrality (net sum = 0).
    If fewer than 10 stocks are tradable, positions are assigned at ±0.1
    and unit capital will be less than 1.
    """
    def process_day(day_df):
        # Assume day_df already contains only tradable stocks for that day.
        day_df = day_df.copy().sort_values(by='predicted_return', ascending=False)
        n = len(day_df)
        if n >= 10:
            n_long = 5
            n_short = 5
        else:
            # Split available stocks as evenly as possible between long and short.
            n_long = n // 2
            n_short = n - n_long
        # Create a weight column: default is 0.
        day_df['weight'] = 0.0
        
        # Assign +0.1 to top n_long stocks.
        if n_long > 0:
            idx_long = day_df.index[:n_long]
            day_df.loc[idx_long, 'weight'] = 0.1
        # Assign -0.1 to bottom n_short stocks.
        if n_short > 0:
            idx_short = day_df.index[-n_short:]
            day_df.loc[idx_short, 'weight'] = -0.1
        
        # (Optional) If exactly 10 stocks, then unit capital = 1 and net sum = 0.
        # If fewer stocks, unit capital will be sum(|w|), which will be < 1.
        return day_df

    df_positions = df_ranked.groupby('date').apply(process_day).reset_index(drop=True)
    return df_positions
#     ('max_abs_weight', lambda x: x.abs().max()),
#     ('sum_abs_weight', lambda x: x.abs().sum())
# ])
# print(daily_checks.sort_values('weight_sum', ascending=False).head(20))


In [168]:
df_positions = capture_positions_by_rank(df_predictions_univ)

# Compute daily checks: 
#   - weight_sum: net sum of weights (should be near zero)
#   - max_abs_weight: maximum absolute weight (should be <= 0.1)
#   - sum_abs_weight: sum of absolute weights (should be 1)
daily_checks = df_positions.groupby('date')['weight'].agg([
    ('weight_sum', lambda x: np.sum(x)),
    ('max_abs_weight', lambda x: np.max(np.abs(x))),
    ('sum_abs_weight', lambda x: np.sum(np.abs(x)))
])
print(daily_checks.head())

  df_positions = df_ranked.groupby('date').apply(process_day).reset_index(drop=True)


            weight_sum  max_abs_weight  sum_abs_weight
date                                                  
2005-02-01         0.0             0.1             1.0
2005-02-02         0.0             0.1             1.0
2005-02-03         0.0             0.1             1.0
2005-02-04         0.0             0.1             1.0
2005-02-07         0.0             0.1             1.0


In [169]:
df_positions

Unnamed: 0,date,stock,predicted_return,rank,tradable,weight
0,2005-02-01,161,1.964839,1.0,1,0.1
1,2005-02-01,1815,0.019317,2.0,1,0.1
2,2005-02-01,312,0.006122,3.0,1,0.1
3,2005-02-01,1983,0.006074,4.0,1,0.1
4,2005-02-01,1077,0.005576,5.0,1,0.1
...,...,...,...,...,...,...
4995172,2025-02-07,803,-0.014333,960.0,1,-0.1
4995173,2025-02-07,75,-0.026238,961.0,1,-0.1
4995174,2025-02-07,2060,-0.026769,962.0,1,-0.1
4995175,2025-02-07,500,-0.029070,963.0,1,-0.1


In [170]:
# ---------------------------
# Portfolio Metrics and Backtesting
# ---------------------------
def calculate_metrics(df_positions, returns):
    """
    Calculates daily portfolio metrics: BookValue, Traded, GrossPnL, and NetPnL.
    """
    # Pivot df_positions so that each date is a row and each stock is a column.
    pivot_weights = df_positions.pivot(index='date', columns='stock', values='weight').fillna(0.0)
    
    # Ensure alignment with returns.
    common_dates = pivot_weights.index.intersection(returns.index)
    pivot_weights = pivot_weights.loc[common_dates].sort_index()
    returns = returns.loc[common_dates].sort_index()
    
    # BookValue: sum of absolute weights.
    book_value_series = pivot_weights.abs().sum(axis=1)
    
    # Traded: sum of absolute differences in weights from previous day.
    shifted_weights = pivot_weights.shift(1, fill_value=0.0)
    traded_series = (pivot_weights - shifted_weights).abs().sum(axis=1)
    
    # GrossPnL: weighted sum of returns.
    gross_pnl_series = (pivot_weights * returns).sum(axis=1)
    
    # NetPnL: subtract trading costs (0.01% per traded amount).
    net_pnl_series = gross_pnl_series - (0.0001 * traded_series)
    
    metrics_df = pd.DataFrame({
        'BookValue': book_value_series,
        'Traded': traded_series,
        'GrossPnL': gross_pnl_series,
        'NetPnL': net_pnl_series
    })
    
    # Compute overall turnover.
    total_traded = traded_series.sum()
    total_book_value = book_value_series.sum()
    turnover = (total_traded / total_book_value) * 100.0 if total_book_value != 0 else 0.0
    
    return metrics_df, turnover

metrics_df, turnover = calculate_metrics(df_positions, returns)
print(metrics_df.head())
print("Overall Turnover (%):", turnover)

            BookValue  Traded  GrossPnL    NetPnL
2005-02-01        1.0     1.0  0.004246  0.004146
2005-02-02        1.0     1.0  0.007627  0.007527
2005-02-03        1.0     1.2  0.000284  0.000164
2005-02-04        1.0     0.8 -0.007646 -0.007726
2005-02-07        1.0     1.2  0.002234  0.002114
Overall Turnover (%): 103.11051930758988


In [171]:
def calculate_sharpes(metrics_df):
    """
    Calculates Annualized Gross and Net Sharpe Ratios.
    """
    gross_pnl = metrics_df['GrossPnL']
    net_pnl   = metrics_df['NetPnL']
    ann_factor = np.sqrt(252)
    
    gross_mean = gross_pnl.mean()
    gross_std  = gross_pnl.std(ddof=1)
    net_mean   = net_pnl.mean()
    net_std    = net_pnl.std(ddof=1)
    
    gross_sharpe = ann_factor * (gross_mean / gross_std) if gross_std != 0 else np.nan
    net_sharpe   = ann_factor * (net_mean / net_std) if net_std != 0 else np.nan
    
    return gross_sharpe, net_sharpe

gross_sharpe, net_sharpe = calculate_sharpes(metrics_df)
print(f"Annualized Gross Sharpe Ratio: {gross_sharpe:.4f}")
print(f"Annualized Net Sharpe Ratio: {net_sharpe:.4f}")

Annualized Gross Sharpe Ratio: 4.2877
Annualized Net Sharpe Ratio: 4.1843


In [172]:
# ---------------------------
# Prepare Final Submission Weights
# ---------------------------
# Get the full list of stocks from one of the feature DataFrames.
all_stocks = features[features_list[0]].columns.tolist()
# Pivot positions to create a DataFrame with dates as rows and stocks as columns.
df_weights = df_positions.pivot(index='date', columns='stock', values='weight')
# Reindex to include all stocks (fill missing with zero).
df_weights = df_weights.reindex(columns=all_stocks, fill_value=0).fillna(0)
print("Submission weights shape:", df_weights.shape)

Submission weights shape: (5038, 2167)


In [173]:
training_dates = sorted(df_train['date'].unique())[:20]

# Create a DataFrame with zero weights for these 20 dates and with the same columns as df_weights.
df_zeros = pd.DataFrame(0.0, index=training_dates, columns=df_weights.columns)

# Concatenate the zero-weight DataFrame with the original df_weights.
# This places the 20 zero-weight days before the existing dates.
df_weights = pd.concat([df_zeros, df_weights])

# Sort the DataFrame by date to ensure proper chronological order.
df_weights = df_weights.sort_index()

df_weights.index.name = "Date"

In [76]:
df_weights.iloc[:2700, :] = 0

In [174]:
# Optionally, if required, save the submission file.
df_weights.to_csv("submission.csv")

In [175]:
df_weights

stock,1,2,3,4,5,6,7,8,9,10,...,2158,2159,2160,2161,2162,2163,2164,2165,2166,2167
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-02-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-02-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-02-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
