In [1]:
# Install the third-party packages this notebook relies on via the %conda line magic.
# NOTE(review): statsmodels is not imported in any cell visible here — confirm it is still needed.
%conda install pandas
%conda install statsmodels
%conda install scikit-learn

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.7.4
  latest version: 23.11.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.11.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.7.4
  latest version: 23.11.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.11.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.7.4
  latest version

In [61]:
from pathlib import Path

import pandas as pd

# Single place to repoint the notebook at another copy of the data files.
DATA_DIR = Path('/Users/jingzhao/Desktop/FE PW1/Data')

# Both CSVs are later joined on their shared 'Date' column.
industries_data = pd.read_csv(DATA_DIR / '17 Industry Portfolios.CSV')
financial_data = pd.read_csv(DATA_DIR / 'Financial Data.CSV')

In [62]:
# HML: Lasso coefficient
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Join the industry portfolio columns with the HML factor on 'Date'.
merged_data = pd.merge(industries_data, financial_data[['Date', 'HML']], on='Date')
X = merged_data.drop(columns=['Date', 'HML'])
y = merged_data['HML']

# NOTE(review): the scaler is fitted on the full sample before the split, so the
# held-out rows influence the scaling; train_test_split also shuffles rows by
# default. Confirm both are intended for this dataset.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Using LASSO regression with cross-validation to find the best alpha
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)

# Predicting and evaluating the model on the held-out split
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Extracting the coefficients and the features LASSO kept (non-zero coefficient)
lasso_coefs = lasso.coef_
selected_features = X.columns[np.abs(lasso_coefs) > 0]

# Only the last expression of a cell is rendered, so these diagnostics are
# printed explicitly instead of being left as a bare (discarded) tuple.
print('Selected features:', list(selected_features))
print('Test MSE:', mse)
print('Optimal alpha:', lasso.alpha_)

# Creating a DataFrame to display the coefficients and their corresponding features
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso_coefs
})

# Filtering out the features with zero coefficients; the sorted view is the cell output
significant_coefficients = coefficients[np.abs(coefficients['Coefficient']) > 0]
significant_coefficients.sort_values(by='Coefficient', ascending=False)


Unnamed: 0,Feature,Coefficient
12,Trans,1.69793
8,Steel,1.63172
15,Finan,1.592452
4,Durbl,0.792943
2,Oil,0.557602
13,Utils,0.554351
11,Cars,0.486595
3,Clths,0.475323
0,Food,0.252767
7,Cnstr,-0.031482


In [63]:
# Optimal alpha: the `alpha_` attribute of the LassoCV model fitted above,
# kept under its own name and shown as the cell output.
optimal_alpha = lasso.alpha_
optimal_alpha

0.0034997229923687725

In [64]:
# Best regressors: the five non-zero LASSO coefficients with the largest magnitude.
# Rank row labels by |coefficient| (largest first), then pull those rows in that order.
labels_by_magnitude = significant_coefficients['Coefficient'].abs().sort_values(ascending=False).index
top_5_features = significant_coefficients.reindex(labels_by_magnitude).head(5)
top_5_features

Unnamed: 0,Feature,Coefficient
16,Other,-2.065798
10,Machn,-1.819663
12,Trans,1.69793
8,Steel,1.63172
15,Finan,1.592452


In [65]:
# HML Rolling Regression
from datetime import timedelta
import pandas as pd
from sklearn.linear_model import LinearRegression

def rolling_regression(data, selected_features, train_years, test_years, target='HML'):
    """Fit an OLS model on rolling calendar-year windows and predict the next window.

    Starting from the first year found in ``data['Date']``, each iteration trains
    on ``train_years`` calendar years, predicts the following ``test_years``
    years, then advances the window start by ``test_years``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date' (parsed with format '%Y%m'), the ``target`` column
        and every column in ``selected_features``. Rows are assumed to be in
        chronological order, because the first/last year are taken by position.
        The frame is not modified.
    selected_features : list of str
        Regressor column names; coefficients are reported in this order.
    train_years, test_years : int
        Length of the training and testing windows in calendar years.
    target : str, default 'HML'
        Name of the dependent-variable column.

    Returns
    -------
    pandas.DataFrame
        One row per window with 'train_start', 'train_end', 'test_end',
        'coefficients' (array) and 'predicted_<target>' (array).

    Raises
    ------
    ValueError
        If ``train_years`` or ``test_years`` is not positive (a non-positive
        ``test_years`` would otherwise never advance the window).
    """
    if train_years <= 0 or test_years <= 0:
        raise ValueError('train_years and test_years must be positive')

    # Parse into a local Series so the caller's 'Date' column is left untouched.
    dates = pd.to_datetime(data['Date'], format='%Y%m')
    unique_years = dates.dt.year.unique()

    results = []
    start_year = unique_years[0]

    # Stop once the end of the test window would run past the last year present.
    while start_year + train_years + test_years <= unique_years[-1]:
        # Window bounds are half-open: [train_start, train_end) and [train_end, test_end).
        train_start = pd.Timestamp(year=start_year, month=1, day=1)
        train_end = train_start + pd.DateOffset(years=train_years)
        test_end = train_end + pd.DateOffset(years=test_years)

        train_data = data[(dates >= train_start) & (dates < train_end)]
        test_data = data[(dates >= train_end) & (dates < test_end)]

        # Fit on the training window, then predict the test window.
        model = LinearRegression().fit(train_data[selected_features], train_data[target])
        y_pred = model.predict(test_data[selected_features])

        results.append({
            'train_start': train_start,
            'train_end': train_end,
            'test_end': test_end,
            'coefficients': model.coef_,
            f'predicted_{target}': y_pred
        })

        # Non-overlapping test windows: advance by the test horizon.
        start_year += test_years

    return pd.DataFrame(results)

# Regressors used in every rolling window (set these to suit your dataset).
selected_features = ['Trans', 'Steel', 'Machn', 'Finan', 'Other']

# One rolling regression per training-window length; the test horizon is always 5 years.
results_5_year, results_10_year, results_20_year = (
    rolling_regression(merged_data, selected_features, train_years=window_years, test_years=5)
    for window_years in (5, 10, 20)
)

In [66]:
# Beta and HML
def extract_betas_and_predictions(results):
    """Flatten rolling-regression output into a beta table and a prediction table.

    Parameters
    ----------
    results : pandas.DataFrame
        One row per window with 'train_start', 'train_end', 'test_end',
        'coefficients' (iterable of slopes) and 'predicted_HML' (iterable of
        predictions).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(betas_df, predicted_HML_df)``: the first has one row per window with
        columns 'beta_0', 'beta_1', ...; the second has one row per predicted
        value, tagged with the window's 'test_end'.
    """
    beta_rows = []
    prediction_rows = []

    for _, window in results.iterrows():
        # Window bounds first, then one beta_<i> entry per coefficient.
        beta_row = {
            'train_start': window['train_start'],
            'train_end': window['train_end'],
            'test_end': window['test_end'],
        }
        for position, slope in enumerate(window['coefficients']):
            beta_row[f'beta_{position}'] = slope
        beta_rows.append(beta_row)

        # One record per predicted value in this window.
        prediction_rows.extend(
            {'test_end': window['test_end'], 'predicted_HML': value}
            for value in window['predicted_HML']
        )

    return pd.DataFrame(beta_rows), pd.DataFrame(prediction_rows)

# Split each scheme's results into its beta table and its prediction table.
(
    (betas_5_year, predicted_HML_5_year),
    (betas_10_year, predicted_HML_10_year),
    (betas_20_year, predicted_HML_20_year),
) = map(extract_betas_and_predictions, (results_5_year, results_10_year, results_20_year))

In [67]:
# Residual
def calculate_residuals(data, predicted_HML, train_years, test_years, target='HML'):
    """Compute out-of-sample residuals (actual minus predicted) window by window.

    The test windows are rebuilt with the same year arithmetic that
    ``rolling_regression`` uses (calendar-year ``pd.DateOffset`` bounds), and
    the stacked predictions are consumed positionally, one window at a time.
    A fixed ``365 * test_years`` day span would fall short of whole calendar
    years by the number of leap days, dropping late-December rows and
    misaligning every later prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date' (parsed with format '%Y%m') and the ``target``
        column. Rows are assumed to be in chronological order. Not modified.
    predicted_HML : pandas.DataFrame
        Stacked predictions in a 'predicted_<target>' column, ordered window
        after window.
    train_years, test_years : int
        Window lengths in calendar years; must match those used to produce
        the predictions.
    target : str, default 'HML'
        Name of the actual-value column in ``data``.

    Returns
    -------
    pandas.DataFrame
        Single column 'residuals', in the same order as the predictions.

    Raises
    ------
    ValueError
        If ``test_years`` is not positive, or if a window has more actual rows
        than remaining predictions.
    """
    if test_years <= 0:
        raise ValueError('test_years must be positive')

    # Parse into a local Series so the caller's 'Date' column is left untouched.
    dates = pd.to_datetime(data['Date'], format='%Y%m')
    unique_years = dates.dt.year.unique()
    prediction_column = f'predicted_{target}'

    residuals = []
    predicted_index = 0
    start_year = unique_years[0]

    while start_year + train_years + test_years <= unique_years[-1]:
        # Half-open test window [test_start, test_end) spanning whole calendar years.
        test_start = pd.Timestamp(year=start_year + train_years, month=1, day=1)
        test_end = test_start + pd.DateOffset(years=test_years)

        in_window = (dates >= test_start) & (dates < test_end)
        actual = data.loc[in_window, target].to_numpy()

        # Take this window's share of the stacked predictions by position.
        window_predictions = (
            predicted_HML[prediction_column]
            .iloc[predicted_index:predicted_index + len(actual)]
            .to_numpy()
        )
        if len(window_predictions) != len(actual):
            raise ValueError(
                'predictions do not cover the test window starting '
                f'{test_start.date()}: expected {len(actual)}, got {len(window_predictions)}'
            )
        residuals.extend(actual - window_predictions)

        predicted_index += len(actual)
        start_year += test_years

    return pd.DataFrame({'residuals': residuals})

# Residuals per rolling scheme: each prediction table is paired with its training-window length.
residuals_5_year, residuals_10_year, residuals_20_year = (
    calculate_residuals(merged_data, predictions, window_years, 5)
    for predictions, window_years in (
        (predicted_HML_5_year, 5),
        (predicted_HML_10_year, 10),
        (predicted_HML_20_year, 20),
    )
)

In [68]:
# Residual summary: descriptive statistics of the residuals for each scheme.
residual_summary_5_year, residual_summary_10_year, residual_summary_20_year = (
    frame.describe() for frame in (residuals_5_year, residuals_10_year, residuals_20_year)
)

(residual_summary_5_year, residual_summary_10_year, residual_summary_20_year)

(         residuals
 count  1080.000000
 mean     -0.031597
 std       2.475316
 min     -13.710342
 25%      -1.332233
 50%      -0.075307
 75%       1.360415
 max      10.027822,
          residuals
 count  1020.000000
 mean     -0.098707
 std       2.148893
 min     -13.030419
 25%      -1.326080
 50%      -0.103802
 75%       1.141364
 max       9.186277,
         residuals
 count  900.000000
 mean    -0.130205
 std      2.086146
 min     -8.466385
 25%     -1.372565
 50%     -0.163566
 75%      1.103427
 max      8.381515)

In [69]:
# Sum of squared residuals (SSE) for each time scheme.
sse_5_year = residuals_5_year['residuals'].pow(2).sum()
sse_10_year = residuals_10_year['residuals'].pow(2).sum()
sse_20_year = residuals_20_year['residuals'].pow(2).sum()

(sse_5_year, sse_10_year, sse_20_year)

(6612.31704619941, 4715.415162630034, 3927.7100470840132)

In [70]:
# out-of-sample R-squared
def calculate_out_of_sample_r_squared(data, predicted_HML, residuals, train_years, test_years, target='HML'):
    """Compute a pooled out-of-sample R-squared over all rolling test windows.

    R-squared is ``1 - RSS / TSS``. RSS sums the squared residuals of every
    test window. TSS sums, per window, the squared deviations of the actual
    values from that window's own mean. Test windows use calendar-year
    ``pd.DateOffset`` bounds, the same arithmetic ``rolling_regression`` uses,
    so residuals stay aligned with the windows they came from.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date' (parsed with format '%Y%m') and the ``target``
        column. Rows are assumed to be in chronological order. Not modified.
    predicted_HML : pandas.DataFrame
        Unused; kept so existing calls keep working.
    residuals : pandas.DataFrame
        Stacked residuals in a 'residuals' column, ordered window after window.
    train_years, test_years : int
        Window lengths in calendar years.
    target : str, default 'HML'
        Name of the actual-value column in ``data``.

    Returns
    -------
    float
        The out-of-sample R-squared, or NaN when the total sum of squares is
        zero (for example when no test window fits in the data).

    Raises
    ------
    ValueError
        If ``test_years`` is not positive.
    """
    if test_years <= 0:
        raise ValueError('test_years must be positive')

    # Parse into a local Series so the caller's 'Date' column is left untouched.
    dates = pd.to_datetime(data['Date'], format='%Y%m')
    unique_years = dates.dt.year.unique()

    total_sum_squares = 0
    residual_sum_squares = 0
    predicted_index = 0
    start_year = unique_years[0]

    while start_year + train_years + test_years <= unique_years[-1]:
        # Half-open test window [test_start, test_end) spanning whole calendar years.
        test_start = pd.Timestamp(year=start_year + train_years, month=1, day=1)
        test_end = test_start + pd.DateOffset(years=test_years)

        in_window = (dates >= test_start) & (dates < test_end)
        actual = data.loc[in_window, target].to_numpy()

        # TSS is measured around this window's own mean.
        total_sum_squares += np.sum((actual - np.mean(actual)) ** 2)
        window_residuals = residuals['residuals'].iloc[predicted_index:predicted_index + len(actual)]
        residual_sum_squares += np.sum(window_residuals ** 2)

        predicted_index += len(actual)
        start_year += test_years

    # Avoid a division by zero when there is no variation (or no window) to explain.
    if total_sum_squares == 0:
        return float('nan')
    return 1 - (residual_sum_squares / total_sum_squares)

# Out-of-sample R-squared per rolling scheme (test horizon fixed at 5 years).
r_squared_5_year, r_squared_10_year, r_squared_20_year = (
    calculate_out_of_sample_r_squared(merged_data, predictions, scheme_residuals, window_years, 5)
    for predictions, scheme_residuals, window_years in (
        (predicted_HML_5_year, residuals_5_year, 5),
        (predicted_HML_10_year, residuals_10_year, 10),
        (predicted_HML_20_year, residuals_20_year, 20),
    )
)

(r_squared_5_year, r_squared_10_year, r_squared_20_year)

(0.4973857240612507, 0.42530172773361963, 0.4024258908893674)

In [71]:
# MOM Select Regressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Merging the datasets on the 'Date' column (industry columns plus the MOM factor)
merged_data = pd.merge(industries_data, financial_data[['Date', 'MOM']], on='Date')

# Splitting the data into features (X) and target (y)
X = merged_data.drop(columns=['Date', 'MOM'])
y = merged_data['MOM']

# Standardizing the features
# NOTE(review): the scaler is fitted on the full sample before the split, so the
# held-out rows influence the scaling; train_test_split also shuffles rows by
# default. Confirm both are intended for this dataset.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Using LASSO regression with cross-validation to find the best alpha
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)

# Predicting and evaluating the model on the held-out split
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Extracting the coefficients and the features LASSO kept (non-zero coefficient)
lasso_coefs = lasso.coef_
selected_features = X.columns[np.abs(lasso_coefs) > 0]

# Only the last expression of a cell is rendered, so these diagnostics are
# printed explicitly instead of being left as a bare (discarded) tuple.
print('Selected features:', list(selected_features))
print('Test MSE:', mse)
print('Optimal alpha:', lasso.alpha_)

# Creating a DataFrame to display the coefficients and their corresponding features
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso_coefs
})

# Filtering out the features with zero coefficients
significant_coefficients = coefficients[np.abs(coefficients['Coefficient']) > 0]

# Five largest coefficients by magnitude; this table is the cell output.
# (The earlier unassigned sort_values call was never rendered and is dropped.)
top_5_features = significant_coefficients.reindex(significant_coefficients.Coefficient.abs().sort_values(ascending=False).index).head(5)
top_5_features

Unnamed: 0,Feature,Coefficient
15,Finan,-1.873595
8,Steel,-1.302492
16,Other,1.178256
3,Clths,-0.881954
7,Cnstr,0.806181


In [72]:
# MOM Rolling Regression
from datetime import timedelta
import pandas as pd
from sklearn.linear_model import LinearRegression

def rolling_regression(data, selected_features, train_years, test_years, target='MOM'):
    """Fit an OLS model on rolling calendar-year windows and predict the next window.

    Starting from the first year found in ``data['Date']``, each iteration trains
    on ``train_years`` calendar years, predicts the following ``test_years``
    years, then advances the window start by ``test_years``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date' (parsed with format '%Y%m'), the ``target`` column
        and every column in ``selected_features``. Rows are assumed to be in
        chronological order, because the first/last year are taken by position.
        The frame is not modified.
    selected_features : list of str
        Regressor column names; coefficients are reported in this order.
    train_years, test_years : int
        Length of the training and testing windows in calendar years.
    target : str, default 'MOM'
        Name of the dependent-variable column.

    Returns
    -------
    pandas.DataFrame
        One row per window with 'train_start', 'train_end', 'test_end',
        'coefficients' (array) and 'predicted_<target>' (array).

    Raises
    ------
    ValueError
        If ``train_years`` or ``test_years`` is not positive (a non-positive
        ``test_years`` would otherwise never advance the window).
    """
    if train_years <= 0 or test_years <= 0:
        raise ValueError('train_years and test_years must be positive')

    # Parse into a local Series so the caller's 'Date' column is left untouched.
    dates = pd.to_datetime(data['Date'], format='%Y%m')
    unique_years = dates.dt.year.unique()

    results = []
    start_year = unique_years[0]

    # Stop once the end of the test window would run past the last year present.
    while start_year + train_years + test_years <= unique_years[-1]:
        # Window bounds are half-open: [train_start, train_end) and [train_end, test_end).
        train_start = pd.Timestamp(year=start_year, month=1, day=1)
        train_end = train_start + pd.DateOffset(years=train_years)
        test_end = train_end + pd.DateOffset(years=test_years)

        train_data = data[(dates >= train_start) & (dates < train_end)]
        test_data = data[(dates >= train_end) & (dates < test_end)]

        # Fit on the training window, then predict the test window.
        model = LinearRegression().fit(train_data[selected_features], train_data[target])
        y_pred = model.predict(test_data[selected_features])

        results.append({
            'train_start': train_start,
            'train_end': train_end,
            'test_end': test_end,
            'coefficients': model.coef_,
            f'predicted_{target}': y_pred
        })

        # Non-overlapping test windows: advance by the test horizon.
        start_year += test_years

    return pd.DataFrame(results)

# Regressors used in every rolling window (set these to suit your dataset).
selected_features = ['Clths', 'Steel', 'Cnstr', 'Finan', 'Other']

# One rolling regression per training-window length; the test horizon is always 5 years.
results_5_year, results_10_year, results_20_year = (
    rolling_regression(merged_data, selected_features, train_years=window_years, test_years=5)
    for window_years in (5, 10, 20)
)

In [73]:
# Beta and MOM
def extract_betas_and_predictions(results):
    """Flatten rolling-regression output into a beta table and a prediction table.

    Parameters
    ----------
    results : pandas.DataFrame
        One row per window with 'train_start', 'train_end', 'test_end',
        'coefficients' (iterable of slopes) and 'predicted_MOM' (iterable of
        predictions).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(betas_df, predicted_MOM_df)``: the first has one row per window with
        columns 'beta_0', 'beta_1', ...; the second has one row per predicted
        value, tagged with the window's 'test_end'.
    """
    beta_rows = []
    prediction_rows = []

    for _, window in results.iterrows():
        # Window bounds first, then one beta_<i> entry per coefficient.
        beta_row = {
            'train_start': window['train_start'],
            'train_end': window['train_end'],
            'test_end': window['test_end'],
        }
        for position, slope in enumerate(window['coefficients']):
            beta_row[f'beta_{position}'] = slope
        beta_rows.append(beta_row)

        # One record per predicted MOM value in this window.
        prediction_rows.extend(
            {'test_end': window['test_end'], 'predicted_MOM': value}
            for value in window['predicted_MOM']
        )

    return pd.DataFrame(beta_rows), pd.DataFrame(prediction_rows)

# Split each scheme's results into its beta table and its prediction table.
(
    (betas_5_year, predicted_MOM_5_year),
    (betas_10_year, predicted_MOM_10_year),
    (betas_20_year, predicted_MOM_20_year),
) = map(extract_betas_and_predictions, (results_5_year, results_10_year, results_20_year))

In [74]:
# Residual
def calculate_residuals(data, predicted_MOM, train_years, test_years, target='MOM'):
    """Compute out-of-sample residuals (actual minus predicted) window by window.

    The test windows are rebuilt with the same year arithmetic that
    ``rolling_regression`` uses (calendar-year ``pd.DateOffset`` bounds), and
    the stacked predictions are consumed positionally, one window at a time.
    A fixed ``365 * test_years`` day span would fall short of whole calendar
    years by the number of leap days, dropping late-December rows and
    misaligning every later prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date' (parsed with format '%Y%m') and the ``target``
        column. Rows are assumed to be in chronological order. Not modified.
    predicted_MOM : pandas.DataFrame
        Stacked predictions in a 'predicted_<target>' column, ordered window
        after window.
    train_years, test_years : int
        Window lengths in calendar years; must match those used to produce
        the predictions.
    target : str, default 'MOM'
        Name of the actual-value column in ``data``.

    Returns
    -------
    pandas.DataFrame
        Single column 'residuals', in the same order as the predictions.

    Raises
    ------
    ValueError
        If ``test_years`` is not positive, or if a window has more actual rows
        than remaining predictions.
    """
    if test_years <= 0:
        raise ValueError('test_years must be positive')

    # Parse into a local Series so the caller's 'Date' column is left untouched.
    dates = pd.to_datetime(data['Date'], format='%Y%m')
    unique_years = dates.dt.year.unique()
    prediction_column = f'predicted_{target}'

    residuals = []
    predicted_index = 0
    start_year = unique_years[0]

    while start_year + train_years + test_years <= unique_years[-1]:
        # Half-open test window [test_start, test_end) spanning whole calendar years.
        test_start = pd.Timestamp(year=start_year + train_years, month=1, day=1)
        test_end = test_start + pd.DateOffset(years=test_years)

        in_window = (dates >= test_start) & (dates < test_end)
        actual = data.loc[in_window, target].to_numpy()

        # Take this window's share of the stacked predictions by position.
        window_predictions = (
            predicted_MOM[prediction_column]
            .iloc[predicted_index:predicted_index + len(actual)]
            .to_numpy()
        )
        if len(window_predictions) != len(actual):
            raise ValueError(
                'predictions do not cover the test window starting '
                f'{test_start.date()}: expected {len(actual)}, got {len(window_predictions)}'
            )
        residuals.extend(actual - window_predictions)

        predicted_index += len(actual)
        start_year += test_years

    return pd.DataFrame({'residuals': residuals})

# Residuals per rolling scheme: each prediction table is paired with its training-window length.
residuals_5_year, residuals_10_year, residuals_20_year = (
    calculate_residuals(merged_data, predictions, window_years, 5)
    for predictions, window_years in (
        (predicted_MOM_5_year, 5),
        (predicted_MOM_10_year, 10),
        (predicted_MOM_20_year, 20),
    )
)

In [75]:
# Residual summaries: descriptive statistics of the residuals for each scheme.
residual_summary_5_year, residual_summary_10_year, residual_summary_20_year = (
    frame.describe() for frame in (residuals_5_year, residuals_10_year, residuals_20_year)
)

(residual_summary_5_year, residual_summary_10_year, residual_summary_20_year)

(         residuals
 count  1080.000000
 mean      0.215367
 std       4.460510
 min     -33.975842
 25%      -1.655809
 50%       0.401647
 75%       2.748763
 max      18.215095,
          residuals
 count  1020.000000
 mean      0.176491
 std       4.087025
 min     -30.014933
 25%      -1.593167
 50%       0.298243
 75%       2.525891
 max      14.201794,
         residuals
 count  900.000000
 mean     0.094183
 std      3.936129
 min    -30.209998
 25%     -1.590653
 50%      0.195849
 75%      2.341193
 max     14.534968)

In [76]:
# Sum of squared residuals (SSE) for each time scheme.
sse_5_year = residuals_5_year['residuals'].pow(2).sum()
sse_10_year = residuals_10_year['residuals'].pow(2).sum()
sse_20_year = residuals_20_year['residuals'].pow(2).sum()

(sse_5_year, sse_10_year, sse_20_year)

(21518.039195379984, 17052.92026996847, 13936.287692297024)

In [77]:
# Out-of-sample R-squared
def calculate_out_of_sample_r_squared(data, predicted_MOM, residuals, train_years, test_years, target='MOM'):
    """Compute a pooled out-of-sample R-squared over all rolling test windows.

    R-squared is ``1 - RSS / TSS``. RSS sums the squared residuals of every
    test window. TSS sums, per window, the squared deviations of the actual
    values from that window's own mean. Test windows use calendar-year
    ``pd.DateOffset`` bounds, the same arithmetic ``rolling_regression`` uses,
    so residuals stay aligned with the windows they came from.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date' (parsed with format '%Y%m') and the ``target``
        column. Rows are assumed to be in chronological order. Not modified.
    predicted_MOM : pandas.DataFrame
        Unused; kept so existing calls keep working.
    residuals : pandas.DataFrame
        Stacked residuals in a 'residuals' column, ordered window after window.
    train_years, test_years : int
        Window lengths in calendar years.
    target : str, default 'MOM'
        Name of the actual-value column in ``data``.

    Returns
    -------
    float
        The out-of-sample R-squared, or NaN when the total sum of squares is
        zero (for example when no test window fits in the data).

    Raises
    ------
    ValueError
        If ``test_years`` is not positive.
    """
    if test_years <= 0:
        raise ValueError('test_years must be positive')

    # Parse into a local Series so the caller's 'Date' column is left untouched.
    dates = pd.to_datetime(data['Date'], format='%Y%m')
    unique_years = dates.dt.year.unique()

    total_sum_squares = 0
    residual_sum_squares = 0
    predicted_index = 0
    start_year = unique_years[0]

    while start_year + train_years + test_years <= unique_years[-1]:
        # Half-open test window [test_start, test_end) spanning whole calendar years.
        test_start = pd.Timestamp(year=start_year + train_years, month=1, day=1)
        test_end = test_start + pd.DateOffset(years=test_years)

        in_window = (dates >= test_start) & (dates < test_end)
        actual = data.loc[in_window, target].to_numpy()

        # TSS is measured around this window's own mean.
        total_sum_squares += np.sum((actual - np.mean(actual)) ** 2)
        window_residuals = residuals['residuals'].iloc[predicted_index:predicted_index + len(actual)]
        residual_sum_squares += np.sum(window_residuals ** 2)

        predicted_index += len(actual)
        start_year += test_years

    # Avoid a division by zero when there is no variation (or no window) to explain.
    if total_sum_squares == 0:
        return float('nan')
    return 1 - (residual_sum_squares / total_sum_squares)

# Out-of-sample R-squared per rolling scheme (test horizon fixed at 5 years).
r_squared_5_year, r_squared_10_year, r_squared_20_year = (
    calculate_out_of_sample_r_squared(merged_data, predictions, scheme_residuals, window_years, 5)
    for predictions, scheme_residuals, window_years in (
        (predicted_MOM_5_year, residuals_5_year, 5),
        (predicted_MOM_10_year, residuals_10_year, 10),
        (predicted_MOM_20_year, residuals_20_year, 20),
    )
)

(r_squared_5_year, r_squared_10_year, r_squared_20_year)

(0.05765979016789835, -0.0396814015954885, -0.038007298136080614)