<a href="https://colab.research.google.com/github/Kathy42xu/DL_TA/blob/main/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import os # Needed to work with file paths and check existence
import time # Optional: to time execution

# --- Configuration ---
# Folder that holds the factor_<n>.csv files. './' is the current working
# directory, i.e. the CSV files are expected to sit next to this notebook.
data_folder_path = './'

# Number of files and the pieces of their names: factor_1.csv ... factor_10.csv
num_files = 10
file_prefix = 'factor_'
file_suffix = '.csv'

# Working state for the load
all_data_parts = []    # DataFrames loaded so far, in file order
column_names = None    # Column names taken from the header row of factor_1.csv
skipped_files = []     # Files 2..num_files that were missing, unreadable or malformed
combined_data = None   # Stays None unless loading and concatenation both succeed

print("--- Starting Data Combination ---")
start_load_time = time.time()
print(f"Attempting to load files from {data_folder_path}...")
print("Expecting header ONLY in factor_1.csv")

if not os.path.isdir(data_folder_path):
    print(f"ERROR: Directory not found: {data_folder_path}")
    print("Please ensure the factor CSV files have been uploaded to that directory.")
else:
    # --- Load the first file (factor_1.csv) WITH header ---
    file_name_1 = f"{file_prefix}1{file_suffix}"
    file_path_1 = os.path.join(data_folder_path, file_name_1)

    if os.path.isfile(file_path_1):
        print(f"  Loading {file_path_1} (with header)...")
        try:
            df_first = pd.read_csv(file_path_1)
        except Exception as e:
            # Deliberately broad: any read/parse failure is reported and the
            # cell carries on to its summary instead of aborting the notebook.
            print(f"    ERROR loading header file {file_name_1}: {e}")
        else:
            all_data_parts.append(df_first)
            print(f"    Successfully loaded {file_name_1}. Shape: {df_first.shape}")
            header = df_first.columns.tolist()
            if header:
                column_names = header
                print(f"    Stored {len(column_names)} column names.")
            else:
                # column_names stays None, which is treated as a failure below.
                print("    Warning: No column names found in header file.")
    else:
        print(f"  ERROR: Header file not found - {file_path_1}. Cannot proceed.")

    # --- Load subsequent files (factor_2.csv to factor_10.csv) WITHOUT header ---
    if column_names is not None:  # Proceed only if factor_1 was loaded successfully
        for i in range(2, num_files + 1):
            file_name = f"{file_prefix}{i}{file_suffix}"
            file_path = os.path.join(data_folder_path, file_name)

            if not os.path.isfile(file_path):
                print(f"  WARNING: File not found - {file_path}. Skipping.")
                skipped_files.append(file_name)
                continue

            print(f"  Loading {file_path} (WITHOUT header)...")
            try:
                df_temp = pd.read_csv(file_path, header=None)
            except Exception as e:
                # Best effort: report the failure and continue with the other files.
                print(f"    ERROR loading {file_name}: {e}")
                skipped_files.append(file_name)
                continue

            # The part can only be stacked under factor_1 if it has the same width.
            if len(df_temp.columns) != len(column_names):
                print(f"    WARNING: {file_name} has {len(df_temp.columns)} columns, but header has {len(column_names)}. Skipping this file.")
                skipped_files.append(file_name)
                continue

            # Header-less parts get the names read from factor_1.csv so that
            # pd.concat stacks them column by column.
            df_temp.columns = column_names
            all_data_parts.append(df_temp)

    # --- Concatenate ---
    if not all_data_parts:
        print("\nERROR: No data parts were loaded. Cannot concatenate.")
    elif column_names is None:
        print("\nERROR: Could not load header from factor_1.csv. Cannot proceed.")
    else:
        print("\nConcatenating all loaded data parts...")
        combined_data = pd.concat(all_data_parts, ignore_index=True)
        print("Concatenation complete!")

        # Sanity check: the combined frame must have exactly the header's columns.
        if len(combined_data.columns) != len(column_names):
            print("ERROR: Final column count mismatch after assigning names. Check logic.")
            combined_data = None  # Invalidate data if something went wrong
        else:
            print("\nShape of the final combined DataFrame:")
            print(combined_data.shape)
            print("\nFirst 5 rows:")
            print(combined_data.head())
            print("\nLast 5 rows:")
            print(combined_data.tail())

            # Make partial loads visible instead of silently returning fewer rows.
            if skipped_files:
                print(f"\nWARNING: {len(skipped_files)} of {num_files} files were skipped: {skipped_files}. 'combined_data' is incomplete.")

# --- End of Data Combination Cell ---
load_end_time = time.time()
print("\n--- Data Combination Finished ---")
if combined_data is not None:
    print(f"Variable 'combined_data' created. Shape: {combined_data.shape}")
    print(f"Time taken: {load_end_time - start_load_time:.2f} seconds")
else:
    print("Variable 'combined_data' could not be created due to errors.")

# --- Next Steps ---
# If 'combined_data' was created successfully, you can now use it
# in the next cell for preprocessing and model fitting.
# Example for next cell:
# if combined_data is not None:
#     data = preprocess_combined_data(combined_data.copy(), ...) # Use .copy() if preprocess modifies inplace
#     # ... rest of the OLS code ...
# else:
#     print("Cannot proceed without combined_data.")

--- Starting Data Combination ---
Attempting to load files from ./...
Expecting header ONLY in factor_1.csv
  Loading ./factor_1.csv (with header)...
    Successfully loaded factor_1.csv. Shape: (33700, 138)
    Stored 138 column names.
  Loading ./factor_2.csv (WITHOUT header)...
  Loading ./factor_3.csv (WITHOUT header)...
  Loading ./factor_4.csv (WITHOUT header)...
  Loading ./factor_5.csv (WITHOUT header)...
  Loading ./factor_6.csv (WITHOUT header)...
  Loading ./factor_7.csv (WITHOUT header)...
  Loading ./factor_8.csv (WITHOUT header)...
  Loading ./factor_9.csv (WITHOUT header)...
  Loading ./factor_10.csv (WITHOUT header)...

Concatenating all loaded data parts...
Concatenation complete!

Shape of the final combined DataFrame:
(336740, 138)

First 5 rows:
         date  ticker    target  market_return  excess_market_return  \
0  2002-05-01   47080 -0.277193      -0.000009             -0.004017   
1  2002-05-01   17040 -0.384848      -0.000009             -0.004017   
2  2002-

In [16]:
# Import necessary libraries (needed again in a new cell)
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings
import time

# Silence every warning category for the rest of the session. Note that this
# also hides any warnings raised while fitting the model further down.
warnings.simplefilter("ignore")

# --- Configuration ---
# Location and naming scheme of the factor CSV files (same values as the
# data combination cell; adjust the path if the files live in a subfolder).
data_folder_path = './'
num_files = 10
file_prefix, file_suffix = 'factor_', '.csv'

# Column roles in the combined frame
date_col = 'date'
ticker_col = 'ticker'
target_col = 'target'

# Regressors used by the model; the meanings below are assumptions about the
# data set and should be confirmed against its documentation.
feature_cols = ['size_rnk', 'BPR', 'mom12']  # size, book-to-price, 12-month momentum

# Sample split, in calendar years. The fitting loop in this cell trains
# through the year before each test year and does not read train_end_year.
train_start_year, train_end_year = 2002, 2015
test_start_year, test_end_year = 2019, 2020


# --- Functions ---
# (Data loading and combination happens in the previous cell, which creates the 'combined_data' DataFrame.)

# MODIFIED PREPROCESSING FUNCTION - ADDED pd.to_numeric
def preprocess_combined_data(df, date_col, ticker_col, target_col, feature_cols):
    """Clean the combined frame and return a date-indexed modelling table.

    Steps, in order:
      1. Parse ``date_col`` to datetime, drop rows whose date cannot be parsed
         and make it the index (or accept an existing DatetimeIndex).
      2. Keep only ``ticker_col``, ``target_col`` and ``feature_cols``.
      3. Coerce the feature and target columns to numeric; values that cannot
         be parsed become NaN. Infinite values are also turned into NaN.
      4. Drop rows whose target is NaN.
      5. Fill feature NaNs with the median of the same calendar month, then
         with 0 where a whole month has no value for that feature.

    Progress and problems are reported with ``print``; the input frame is not
    modified.

    Args:
        df: Combined pandas DataFrame (or None).
        date_col: Name of the date column.
        ticker_col: Name of the identifier column (kept as-is).
        target_col: Name of the dependent variable column.
        feature_cols: List of feature column names.

    Returns:
        The processed DataFrame, or None when the input is unusable, required
        columns are missing, or no row has a valid target.
    """
    print("--- Starting Data Preprocessing ---")
    if df is None:
        print("ERROR: Input DataFrame is None.")
        return None
    if not isinstance(df, pd.DataFrame):
        print("ERROR: Input is not a pandas DataFrame.")
        return None

    print(f"Input shape: {df.shape}")
    feature_cols = list(feature_cols)
    required_cols = [ticker_col, target_col] + feature_cols

    # Copy only the columns that are actually used: this avoids duplicating a
    # wide frame and gives an independent object that is safe to modify.
    present_cols = [c for c in [date_col] + required_cols if c in df.columns]
    df_processed = df[present_cols].copy()

    # Convert date column if it's not the index already
    if date_col in df_processed.columns:
        print(f"Converting '{date_col}' to datetime...")
        df_processed[date_col] = pd.to_datetime(df_processed[date_col], errors='coerce')
        df_processed = df_processed.dropna(subset=[date_col]).set_index(date_col)
    elif not isinstance(df_processed.index, pd.DatetimeIndex):
        print(f"ERROR: DataFrame index is not a DatetimeIndex and '{date_col}' column not found.")
        return None
    print("Date index set.")

    missing_cols = [col for col in required_cols if col not in df_processed.columns]
    if missing_cols:
        print(f"ERROR: Missing required columns: {missing_cols}")
        return None

    # --- Force Feature Columns to Numeric (Convert non-numeric to NaN) ---
    print("Attempting to convert feature columns to numeric...")
    coerced_count = 0
    for col in feature_cols:
        numeric_values = pd.to_numeric(df_processed[col], errors='coerce')
        # NaNs that exist only after coercion were non-numeric entries.
        newly_nan = numeric_values.isna().sum() - df_processed[col].isna().sum()
        if newly_nan > 0:
            print(f"  Found {newly_nan} non-numeric value(s) in '{col}'. Coercing to NaN.")
            coerced_count += newly_nan
        df_processed[col] = numeric_values
    if coerced_count > 0:
        print(f"Coerced a total of {coerced_count} non-numeric entries to NaN across features.")

    # The target gets the same treatment so that unparseable entries are
    # dropped with the other missing targets instead of reaching the model.
    numeric_target = pd.to_numeric(df_processed[target_col], errors='coerce')
    bad_targets = numeric_target.isna().sum() - df_processed[target_col].isna().sum()
    if bad_targets > 0:
        print(f"  Found {bad_targets} non-numeric value(s) in '{target_col}'. Coercing to NaN.")
    df_processed[target_col] = numeric_target

    # Handle infinite values (do this *after* to_numeric)
    df_processed = df_processed.replace([np.inf, -np.inf], np.nan)

    # --- Handle TARGET Variable Missing Values ---
    initial_rows = len(df_processed)
    n_target_na_initial = df_processed[target_col].isna().sum()
    if n_target_na_initial > 0:
        print(f"Found {n_target_na_initial} missing values in '{target_col}'. Dropping rows...")
    df_processed = df_processed.dropna(subset=[target_col])
    rows_after_target_drop = len(df_processed)
    rows_dropped = initial_rows - rows_after_target_drop
    if rows_dropped > 0:
        print(f"Dropped {rows_dropped} rows due to missing '{target_col}'.")
    print(f"Shape after handling target NaNs: {df_processed.shape}")
    if rows_after_target_drop == 0:
        print("ERROR: All rows dropped due to missing target. Cannot proceed.")
        return None

    # --- Handle FEATURE Missing Values (Median then 0 Imputation) ---
    print("Applying Median+0 imputation to features (including coerced NaNs)...")
    month_key = df_processed.index.to_period('M')  # one group per calendar month
    imputed_count_zero = 0
    for col in feature_cols:
        values = df_processed[col].to_numpy(dtype=float)
        is_missing = np.isnan(values)
        if not is_missing.any():
            continue
        # Work on plain arrays: the date index contains many duplicates, so
        # positional filling avoids any label alignment.
        monthly_median = (
            df_processed.groupby(month_key)[col].transform('median').to_numpy(dtype=float)
        )
        values = np.where(is_missing, monthly_median, values)
        # A month in which the feature is entirely missing has no median.
        still_missing = np.isnan(values)
        n_remaining_na = int(still_missing.sum())
        if n_remaining_na > 0:
            values[still_missing] = 0.0
            imputed_count_zero += n_remaining_na
        # Assign the column back explicitly; an in-place fillna on a selected
        # column does not reliably update the parent frame.
        df_processed[col] = values
    print(f"Imputation complete. Filled {imputed_count_zero} values with 0 after median imputation.")

    # Final check for NaNs
    nans_in_features = df_processed[feature_cols].isna().sum().sum()
    if nans_in_features > 0:
        print(f"WARNING: {nans_in_features} NaNs still remain in feature columns after imputation!")

    print(f"Preprocessing complete. Final shape: {df_processed.shape}")
    if not df_processed.empty:
        print(f"Final data date range: {df_processed.index.min().strftime('%Y-%m-%d')} to {df_processed.index.max().strftime('%Y-%m-%d')}")
    return df_processed

def calculate_r2_oos(y_true, y_pred):
    """Return the out-of-sample R-squared, 1 - sum((y - y_hat)^2) / sum(y^2).

    The denominator is the raw sum of squared true values (no demeaning), so
    the score compares the predictions with a constant forecast of zero.

    Args:
        y_true: Realised values; any array-like (list, ndarray, Series).
        y_pred: Predicted values with the same number of elements.

    Returns:
        The score as a float, or NaN when the sum of squared true values is
        below 1e-10 (including empty input).

    Raises:
        ValueError: If the two inputs do not have the same number of elements.
    """
    # Flatten to 1-D float arrays so that values are compared position by
    # position: no index alignment for Series and no accidental broadcasting
    # of an (n, 1) column against an (n,) vector.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )

    denominator = float(np.sum(actual ** 2))
    if denominator < 1e-10:
        return np.nan
    numerator = float(np.sum((actual - predicted) ** 2))
    return 1 - (numerator / denominator)

# --- Main Execution ---
# Expanding-window evaluation: for every test year, a Huber regression is
# fitted on all data from train_start_year up to the year before the test
# year, and scored on the test year with calculate_r2_oos.
print("\n--- Starting OLS-3+H Model Fitting ---")
start_time = time.time()

# 'combined_data' is expected to exist from the data combination cell.
if 'combined_data' not in globals():
    print("ERROR: 'combined_data' DataFrame not found. Please run the data combination cell first.")
    data = None
else:
    # 1. Preprocess the existing combined data (returns None on failure)
    data = preprocess_combined_data(combined_data, date_col, ticker_col, target_col, feature_cols)

# 2. Proceed with model fitting if preprocessing was successful
if data is not None and not data.empty:
    all_predictions = []
    all_true_values = []
    test_year_r2 = {}

    # Calendar year of every row, computed once and reused for all masks.
    row_years = data.index.year
    available_years = sorted(row_years.unique())
    actual_test_years = [y for y in available_years if test_start_year <= y <= test_end_year]

    if not actual_test_years:
        print(f"\nERROR: No data available in the specified test period ({test_start_year}-{test_end_year}) after preprocessing. Cannot proceed.")
    else:
        print(f"\nStarting OLS-3+H annual refitting from {min(actual_test_years)} to {max(actual_test_years)}...")
        print(f"Using Training Start Year: {train_start_year}")

        for current_test_year in actual_test_years:
            loop_start_time = time.time()
            print(f"  Processing test year: {current_test_year}")

            current_train_end_year = current_test_year - 1
            if current_train_end_year < train_start_year:
                print(f"    Skipping year {current_test_year}: Training end year ({current_train_end_year}) is before training start year ({train_start_year}).")
                continue

            train_mask = (row_years >= train_start_year) & (row_years <= current_train_end_year)
            test_mask = (row_years == current_test_year)
            train_df = data.loc[train_mask]
            test_df = data.loc[test_mask]

            if train_df.empty or test_df.empty:
                print(f"    Skipping year {current_test_year}: Not enough data for train ({len(train_df)}) / test ({len(test_df)}) split.")
                continue

            # Preprocessing should already have imputed every feature; this
            # guard catches a frame that bypassed it.
            if train_df[feature_cols].isna().any().any() or test_df[feature_cols].isna().any().any():
                print(f"    ERROR: NaNs detected in features for year {current_test_year} before scaling, after preprocessing! Skipping.")
                continue

            X_train = train_df[feature_cols].values
            y_train = train_df[target_col].values
            X_test = test_df[feature_cols].values
            y_test = test_df[target_col].values

            # The scaler is fitted on the training window only and then
            # applied to the test year, so no test information leaks in.
            scaler = StandardScaler()
            try:
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)
            except ValueError as e:
                print(f"    ERROR during scaling for year {current_test_year}: {e}. Skipping year.")
                continue

            huber_model = HuberRegressor(fit_intercept=True, max_iter=300, tol=1e-4)
            try:
                # Last line of defence: never fit or predict on NaNs.
                if np.isnan(X_train_scaled).any() or np.isnan(y_train).any():
                    print(f"    ERROR: NaNs detected before fitting model for year {current_test_year}. Skipping.")
                    continue
                if np.isnan(X_test_scaled).any():
                    print(f"    ERROR: NaNs detected in test features before prediction for year {current_test_year}. Skipping.")
                    continue

                huber_model.fit(X_train_scaled, y_train)
                predictions = huber_model.predict(X_test_scaled)
                r2_this_year = calculate_r2_oos(y_test, predictions)

                # Record the year only once everything above has succeeded,
                # so the pooled lists and the per-year table stay in sync.
                all_predictions.extend(predictions)
                all_true_values.extend(y_test)
                test_year_r2[current_test_year] = r2_this_year
                print(f"    Year {current_test_year} R2_OOS: {r2_this_year:.4f}. Time: {time.time() - loop_start_time:.2f}s")

            except Exception as e:
                # Best effort: a failing year is reported and the loop moves on.
                print(f"    ERROR during model fitting/prediction for year {current_test_year}: {e}")

        print("\n--- Overall Results ---")
        if all_true_values:
            # Label the result with the years that actually produced
            # predictions, which can be fewer than the years attempted.
            scored_years = sorted(test_year_r2)
            overall_r2_oos = calculate_r2_oos(np.array(all_true_values), np.array(all_predictions))
            print(f"Overall Out-of-Sample R-squared (R2_OOS) for period {scored_years[0]}-{scored_years[-1]}: {overall_r2_oos:.4f}")
            print("(Note: This result is based on your data range and cannot be directly compared to the paper's 0.16%)")
            unscored_years = [y for y in actual_test_years if y not in test_year_r2]
            if unscored_years:
                print(f"WARNING: No predictions were produced for test year(s): {unscored_years}")
            print("\nR2_OOS per test year:")
            for year in scored_years:
                print(f"  {year}: {test_year_r2[year]:.4f}")
        else:
            print("\nNo valid predictions were generated for the test period.")

else:
    print("\nData preprocessing failed or resulted in empty DataFrame. Cannot proceed with model fitting.")

end_time = time.time()
print(f"\nTotal execution time for this cell: {end_time - start_time:.2f} seconds")


--- Starting OLS-3+H Model Fitting ---
--- Starting Data Preprocessing ---
Input shape: (336740, 138)
Converting 'date' to datetime...
Date index set.
Attempting to convert feature columns to numeric...
  Found 1 non-numeric value(s) in 'BPR'. Coercing to NaN.
Coerced a total of 1 non-numeric entries to NaN across features.
Found 2196 missing values in 'target'. Dropping rows...
Dropped 2196 rows due to missing 'target'.
Shape after handling target NaNs: (334544, 5)
Applying Median+0 imputation to features (including coerced NaNs)...
Imputation complete. Filled 10078 values with 0 after median imputation.
Preprocessing complete. Final shape: (334544, 5)
Final data date range: 2002-05-01 to 2021-08-01

Starting OLS-3+H annual refitting from 2019 to 2020...
Using Training Start Year: 2002
  Processing test year: 2019
    Year 2019 R2_OOS: -0.0010. Time: 1.41s
  Processing test year: 2020
    Year 2020 R2_OOS: -0.0011. Time: 2.01s

--- Overall Results ---
Overall Out-of-Sample R-squared 