<a href="https://colab.research.google.com/github/Jundula/Neural-networks/blob/main/ARISOY_ABD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Group Project**

## **Step 1: Load Stock Data**

In [57]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


stock_data_path = "data.csv"  # File uploaded to Colab
stock_data = pd.read_csv(stock_data_path)

# Parse dates and snap every observation to its calendar month-end.
# The factor tables are resampled to month-end before merging, so a stock date
# that falls on the last *trading* day (e.g. the 30th) would otherwise be
# dropped by the inner merge on 'date'. MonthEnd(0) leaves dates that already
# sit on a month-end unchanged.
stock_data['date'] = pd.to_datetime(stock_data['date']) + pd.offsets.MonthEnd(0)

# Filter data: Prices > $1, Exchange codes 1, 2, or 3, Share codes 10 or 11
keep_rows = (
    (stock_data['PRC'].abs() > 1)
    & stock_data['EXCHCD'].isin([1, 2, 3])
    & stock_data['SHRCD'].isin([10, 11])
)
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of stock_data (avoids SettingWithCopyWarning).
filtered_data = stock_data.loc[keep_rows].copy()

# Calculate market equity (ME); abs() because PRC can be stored with a negative sign
# (presumably the CRSP bid/ask-average convention -- confirm against the data source).
filtered_data['ME'] = filtered_data['PRC'].abs() * filtered_data['SHROUT']

# Sort by PERMNO and date so per-stock rolling windows are in time order
filtered_data = filtered_data.sort_values(by=['PERMNO', 'date'])

## **Step 2: Load Fama-French Factors**

In [58]:
ff_factors_path = "F-F_Research_Data_Factors.csv"  # File uploaded to Colab

# Skip the first 3 rows (descriptive text) and use row 4 as the header
ff_factors = pd.read_csv(ff_factors_path, skiprows=3)

# Assign a label to the first column (date column)
ff_factors.columns = ['date'] + list(ff_factors.columns[1:])

# Drop rows where the 'date' column is NaN
ff_factors = ff_factors.dropna(subset=['date'])

# Keep only rows whose date looks like YYYYMM. Cast to str and strip first so the
# check works whether pandas parsed the column as text (possibly padded) or as integers.
date_text = ff_factors['date'].astype(str).str.strip()
is_monthly = date_text.str.isdigit() & (date_text.str.len() == 6)
ff_factors = ff_factors.loc[is_monthly].copy()  # copy: we assign columns below

# Convert date column to datetime format
ff_factors['date'] = pd.to_datetime(date_text[is_monthly], format='%Y%m')

# Rename columns for consistency
ff_factors = ff_factors.rename(columns={'Mkt-RF': 'mkt_excess_return'})

# The factor columns may have been read as text because the file mixes several
# tables; make them numeric. The French data library publishes returns in percent,
# while the stock RET column is combined with RF downstream as a plain difference,
# so convert to decimal here.
# NOTE(review): confirm RET in data.csv is in decimal units.
factor_columns = [col for col in ff_factors.columns if col != 'date']
ff_factors[factor_columns] = (
    ff_factors[factor_columns].apply(pd.to_numeric, errors='coerce') / 100.0
)

# Ensure the dates align with the stock data (month-end timestamps)
ff_factors = ff_factors.set_index('date').resample('ME').last().reset_index()

## **Step 3: Load q-Factor Data**

In [59]:
q_factors_path = "q5_factors_monthly_2023.csv"  # File uploaded to Colab
q_factors = pd.read_csv(q_factors_path)

# Build a first-of-month timestamp from the separate year and month columns
year_text = q_factors['year'].astype(str)
month_text = q_factors['month'].astype(str)
q_factors['date'] = pd.to_datetime(year_text + '-' + month_text + '-01')

# Harmonise names (replace 'R_ME' with your assigned factor), keep only the
# three columns used downstream, then move each timestamp to its month-end
# so the dates line up with the stock data.
q_factors = (
    q_factors
    .rename(columns={'R_MKT': 'mkt_excess_return', 'R_ME': 'allocated_factor'})
    [['date', 'mkt_excess_return', 'allocated_factor']]
    .set_index('date')
    .resample('ME')
    .last()
    .reset_index()
)

## **Step 4: Merge Stock Data with Factors**

In [60]:
# Inner-join the stock panel with both factor tables on the month-end date
merged_data = (
    filtered_data
    .merge(ff_factors, on='date', how='inner')
    .merge(q_factors, on='date', how='inner')
)

# Both factor tables carry 'mkt_excess_return'; when the merge suffixed them,
# keep the first (_x) version under the plain name and discard the pair.
duplicate_market_cols = ['mkt_excess_return_x', 'mkt_excess_return_y']
if all(col in merged_data.columns for col in duplicate_market_cols):
    merged_data['mkt_excess_return'] = merged_data['mkt_excess_return_x']
    merged_data = merged_data.drop(columns=duplicate_market_cols)

# Debug: Print columns after merging
print("Merged Data Columns:", merged_data.columns)

# RET and RF must be numeric before subtraction; unparseable entries become NaN
for numeric_col in ('RET', 'RF'):
    merged_data[numeric_col] = pd.to_numeric(merged_data[numeric_col], errors='coerce')

# Per-stock excess return over the risk-free rate
merged_data['excess_return'] = merged_data['RET'] - merged_data['RF']

Merged Data Columns: Index(['PERMNO', 'date', 'SHRCD', 'EXCHCD', 'PRC', 'RET', 'SHROUT', 'ME',
       'SMB', 'HML', 'RF', 'allocated_factor', 'mkt_excess_return'],
      dtype='object')


## **Step 5: Create a Smaller Subset for Testing**

In [61]:
# Start from the full merged panel (no date restriction is applied here)
subset_data = merged_data

# Randomly select a subset of PERMNOs. A seeded generator makes the sample
# reproducible across Restart & Run All, and the size is capped so the draw
# cannot fail when fewer than SAMPLE_SIZE stocks are available.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
unique_permnos = subset_data['PERMNO'].unique()
sample_rng = np.random.default_rng(SAMPLE_SEED)
sample_permnos = sample_rng.choice(
    unique_permnos,
    size=min(SAMPLE_SIZE, len(unique_permnos)),
    replace=False,
)
subset_data = subset_data[subset_data['PERMNO'].isin(sample_permnos)]

# Verify the subset
print(f"Subset Data Shape: {subset_data.shape}")
print(f"Unique PERMNOs in Subset: {len(subset_data['PERMNO'].unique())}")

Subset Data Shape: (9209, 14)
Unique PERMNOs in Subset: 100


## **Step 6: Rolling Regression Function (Using TensorFlow for GPU Acceleration)**

In [62]:
def rolling_regression_tensorflow(data, window=60):
    """
    Estimate rolling two-factor betas for one stock.

    For every row position ``i >= window`` the regression

        excess_return = a + b_mkt * mkt_excess_return + b_alloc * allocated_factor

    is fitted on the ``window`` rows immediately before ``i`` (rows ``i-window``
    to ``i-1``), so the beta reported for a row uses only earlier rows.

    The function name is kept for existing callers, but the fit is done with
    NumPy least squares: TensorFlow is not imported anywhere in this notebook,
    and solving the least-squares problem directly is numerically safer than
    forming ``(X^T X)^(-1)`` explicitly.

    Parameters
    ----------
    data : pd.DataFrame
        Rows for a single stock in time order, with columns 'date',
        'mkt_excess_return', 'allocated_factor' and 'excess_return'.
    window : int, default 60
        Number of preceding rows used for each regression.

    Returns
    -------
    pd.DataFrame
        Columns 'date', 'beta_mkt', 'beta_allocated', one row per input row
        from position ``window`` onward, carrying the input's index labels.
        A beta is NaN when the window has fewer usable rows than parameters
        or the regressors are collinear. Empty when ``len(data) <= window``.
    """
    regression_columns = ['mkt_excess_return', 'allocated_factor', 'excess_return']
    n_params = 3  # intercept + two slopes

    # Coerce once up front; anything non-numeric becomes NaN and is filtered per window.
    values = (
        data[regression_columns]
        .apply(pd.to_numeric, errors='coerce')
        .to_numpy(dtype=float)
    )

    betas_mkt = []
    betas_allocated = []

    for end in range(window, len(data)):
        block = values[end - window:end]
        # Keep only rows where all three series are finite.
        block = block[np.isfinite(block).all(axis=1)]

        beta_mkt = np.nan
        beta_alloc = np.nan

        # Need at least as many observations as parameters for an identified fit.
        if len(block) >= n_params:
            design = np.column_stack([np.ones(len(block)), block[:, :2]])
            try:
                coef, _, rank, _ = np.linalg.lstsq(design, block[:, 2], rcond=None)
            except np.linalg.LinAlgError as e:
                print(f"Error in regression at index {end}: {e}")
            else:
                # A rank-deficient design means the betas are not identified.
                if rank == n_params:
                    beta_mkt = float(coef[1])
                    beta_alloc = float(coef[2])

        betas_mkt.append(beta_mkt)
        betas_allocated.append(beta_alloc)

    # .iloc makes the positional slice explicit regardless of the index labels.
    return pd.DataFrame({
        'date': data['date'].iloc[window:],
        'beta_mkt': betas_mkt,
        'beta_allocated': betas_allocated
    })


## **Step 7: Apply Rolling Regression to Each Stock in the Subset**

In [63]:
# Run the rolling regression separately for every stock (full 60-row window)
grouped = subset_data.groupby('PERMNO')


def _betas_for_one_stock(stock_rows):
    """Rolling betas for the rows of a single PERMNO."""
    return rolling_regression_tensorflow(stock_rows, window=60)


results_subset = grouped.apply(_betas_for_one_stock)

# Move the outer group key out of the index so the result is a flat table
results_subset = results_subset.reset_index(level=0)
results_subset = results_subset.rename(columns={'level_0': 'PERMNO'})

# Inspect the results
print("Subset Results Shape:", results_subset.shape)
print(results_subset.head())

KeyboardInterrupt: 

## **Step 8: Portfolio Formation (Monthly Deciles)**

In [None]:
# Step 8: Portfolio Formation (Monthly Deciles)
def form_portfolios(data, beta_column, weight_column=None):
    """
    Form 10 decile portfolios from one cross-section, sorted on a beta column.

    Stocks with a valid beta are ranked into deciles 0 (lowest) to 9 (highest)
    with ``pd.qcut``. The return of each decile is the mean of 'RET'
    (equally-weighted) or the ``weight_column``-weighted mean of 'RET'
    (value-weighted). Rows with a missing return (or, when weighting, a missing
    weight) are left out of the average instead of turning it into NaN.

    The input frame is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        One cross-section containing ``beta_column``, 'RET' and, if given,
        ``weight_column``.
    beta_column : str
        Column used for the decile sort.
    weight_column : str or None
        Weights for a value-weighted return; None for equal weights.

    Returns
    -------
    pd.Series
        Always indexed 0..9. All NaN when fewer than 10 distinct betas are
        available or ``pd.qcut`` cannot build the bins; an individual decile
        is NaN when it has no usable rows or zero total weight.
    """
    n_deciles = 10
    decile_index = pd.RangeIndex(n_deciles, name='decile')

    needed_columns = [beta_column, 'RET']
    if weight_column:
        needed_columns.append(weight_column)

    # Work on a coerced copy so the caller's frame is never touched.
    frame = data[needed_columns].apply(pd.to_numeric, errors='coerce')
    frame = frame[frame[beta_column].notna()]

    # Check if there are enough unique beta values for decile ranking
    n_unique_betas = frame[beta_column].nunique()
    if n_unique_betas < n_deciles:
        print(f"Not enough unique beta values ({n_unique_betas}) to form 10 deciles.")
        return pd.Series(np.nan, index=decile_index)

    # Rank stocks into deciles based on beta values
    try:
        deciles = pd.qcut(frame[beta_column], q=n_deciles, labels=False)
    except ValueError as e:
        print(f"Error in pd.qcut: {e}")
        return pd.Series(np.nan, index=decile_index)
    frame = frame.assign(decile=deciles)

    if weight_column:
        # Value-weighted: sum(RET * w) / sum(w) per decile, over complete rows only.
        complete = frame.dropna(subset=['RET', weight_column])
        weighted_sum = (complete['RET'] * complete[weight_column]).groupby(complete['decile']).sum()
        total_weight = complete[weight_column].groupby(complete['decile']).sum()
        # Zero total weight yields NaN rather than a division error.
        portfolio_returns = weighted_sum / total_weight.where(total_weight != 0)
    else:
        # Equally-weighted: plain mean (NaN returns are skipped by pandas).
        portfolio_returns = frame.groupby('decile')['RET'].mean()

    # Guarantee the same 0..9 index for every cross-section so the monthly
    # results stack into one rectangular table.
    portfolio_returns.index = portfolio_returns.index.astype(int)
    return portfolio_returns.reindex(decile_index)

## **Step 9: Monthly Portfolio Returns**

In [None]:
# Attach each stock-month's estimated betas to its return and size data
portfolio_data_subset = results_subset.merge(
    subset_data, on=['PERMNO', 'date'], how='inner'
)

# One decile sort per month, sorting on the allocated-factor beta.
# Equal weights:
monthly_portfolios_equal_weight = portfolio_data_subset.groupby('date').apply(
    lambda month_rows: form_portfolios(
        month_rows, beta_column='beta_allocated', weight_column=None
    )
)

# Market-equity weights:
monthly_portfolios_value_weight = portfolio_data_subset.groupby('date').apply(
    lambda month_rows: form_portfolios(
        month_rows, beta_column='beta_allocated', weight_column='ME'
    )
)

# Inspect the portfolio returns
print("Monthly Portfolios (Equally-Weighted):")
print(monthly_portfolios_equal_weight.head())

print("Monthly Portfolios (Value-Weighted):")
print(monthly_portfolios_value_weight.head())

## **Step 10: Arbitrage Portfolio**

In [None]:
# Step 10: Arbitrage Portfolio -- long the highest-beta decile, short the lowest
long_decile = 9
short_decile = 0

arbitrage_portfolio_equal_weight = monthly_portfolios_equal_weight[long_decile].sub(
    monthly_portfolios_equal_weight[short_decile]
)
arbitrage_portfolio_value_weight = monthly_portfolios_value_weight[long_decile].sub(
    monthly_portfolios_value_weight[short_decile]
)

# Persist both long-short return series
arbitrage_portfolio_equal_weight.to_csv("arbitrage_portfolio_equal_weight.csv")
arbitrage_portfolio_value_weight.to_csv("arbitrage_portfolio_value_weight.csv")

## **Step 11: Performance Metrics**

In [None]:
# Step 11: Performance Metrics
def calculate_metrics(returns):
    """
    Summarise a return series: mean, standard deviation, Sharpe ratio, t-statistic.

    Missing values are dropped first so that the observation count used in the
    t-statistic matches the observations that enter the mean and standard
    deviation (counting NaN months would overstate the t-statistic).

    Parameters
    ----------
    returns : pd.Series or array-like
        Periodic returns; may contain NaN.

    Returns
    -------
    pd.Series
        Keys 'Average Return', 'Standard Deviation', 'Sharpe Ratio',
        't-statistic'. The Sharpe ratio is mean / std with no annualisation
        and no risk-free adjustment. The ratio fields are NaN when fewer than
        two valid observations exist.
    """
    valid_returns = pd.to_numeric(pd.Series(returns), errors='coerce').dropna()
    n_obs = len(valid_returns)

    avg_return = valid_returns.mean()
    std_return = valid_returns.std()  # sample std (ddof=1); NaN when n_obs < 2

    if n_obs < 2:
        sharpe_ratio = np.nan
        t_stat = np.nan
    else:
        sharpe_ratio = avg_return / std_return
        t_stat = avg_return / (std_return / np.sqrt(n_obs))

    return pd.Series({
        'Average Return': avg_return,
        'Standard Deviation': std_return,
        'Sharpe Ratio': sharpe_ratio,
        't-statistic': t_stat
    })

def _arbitrage_metrics_or_nan(returns, heading, no_data_message):
    """Compute and print metrics for a return series, or fall back to all-NaN metrics."""
    has_valid_returns = returns.notna().sum() > 0
    if not has_valid_returns:
        print(no_data_message)
        return pd.Series({
            'Average Return': np.nan,
            'Standard Deviation': np.nan,
            'Sharpe Ratio': np.nan,
            't-statistic': np.nan
        })

    metrics = calculate_metrics(returns)
    print(heading)
    print(metrics)
    return metrics


# Equally-weighted arbitrage portfolio
metrics_equal_weight = _arbitrage_metrics_or_nan(
    arbitrage_portfolio_equal_weight,
    "Metrics for Equally-Weighted Arbitrage Portfolio:",
    "Equally-weighted arbitrage portfolio contains no valid returns.",
)
metrics_equal_weight.to_csv("arbitrage_portfolio_metrics_equal_weight.csv")

# Value-weighted arbitrage portfolio
metrics_value_weight = _arbitrage_metrics_or_nan(
    arbitrage_portfolio_value_weight,
    "Metrics for Value-Weighted Arbitrage Portfolio:",
    "Value-weighted arbitrage portfolio contains no valid returns.",
)
metrics_value_weight.to_csv("arbitrage_portfolio_metrics_value_weight.csv")



## **Step 12: CAPM Alpha Calculation**

In [None]:
def calculate_capm_alpha(portfolio_returns, market_returns):
    """
    Estimate CAPM alpha and beta by OLS of portfolio returns on market returns.

    Fits ``portfolio = alpha + beta * market + e`` on the observation pairs
    where both values are finite. The t-statistic uses the textbook OLS
    standard error of the intercept,

        SE(alpha) = s * sqrt(1/n + mean(x)^2 / Sxx),   s^2 = SSR / (n - 2),

    rather than ``std(residuals) / sqrt(n)``, which ignores both the degrees
    of freedom and the uncertainty coming from the slope estimate.

    Parameters
    ----------
    portfolio_returns : pd.Series or array-like
        Dependent variable.
    market_returns : pd.Series or array-like
        Independent variable, same length and order as ``portfolio_returns``.

    Returns
    -------
    pd.Series
        Keys 'CAPM Alpha', 'CAPM Beta', 't-statistic'. All NaN when fewer than
        three valid pairs exist or the market series has no variation; the
        t-statistic alone is NaN when the fit is exact (zero standard error).
    """
    x = np.asarray(market_returns, dtype=float).ravel()
    y = np.asarray(portfolio_returns, dtype=float).ravel()

    # Use only pairs where both observations are usable.
    usable = np.isfinite(x) & np.isfinite(y)
    x = x[usable]
    y = y[usable]
    n_obs = x.size

    alpha = np.nan
    beta = np.nan
    t_statistic = np.nan

    # Two parameters are estimated, so n - 2 must be positive for the variance estimate.
    if n_obs >= 3:
        x_mean = x.mean()
        y_mean = y.mean()
        x_dev = x - x_mean
        sxx = float(x_dev @ x_dev)
        if sxx > 0:
            beta = float(x_dev @ (y - y_mean)) / sxx
            alpha = y_mean - beta * x_mean
            residuals = y - (alpha + beta * x)
            residual_variance = float(residuals @ residuals) / (n_obs - 2)
            alpha_std_error = np.sqrt(residual_variance * (1.0 / n_obs + x_mean ** 2 / sxx))
            if alpha_std_error > 0:
                t_statistic = alpha / alpha_std_error

    return pd.Series({
        'CAPM Alpha': alpha,
        'CAPM Beta': beta,
        't-statistic': t_statistic
    })

# Merge portfolio returns with Fama-French factors.
# Turn the date index into a column, but only if that has not happened yet:
# re-running this cell must not add a spurious 'index' column, which would
# then be melted below as if it were an extra decile.
if 'date' not in monthly_portfolios_equal_weight.columns:
    monthly_portfolios_equal_weight = monthly_portfolios_equal_weight.reset_index()
if 'date' not in monthly_portfolios_value_weight.columns:
    monthly_portfolios_value_weight = monthly_portfolios_value_weight.reset_index()

# Long format (one row per date and decile) joined with the market excess return
portfolio_returns_with_factors = pd.merge(
    monthly_portfolios_equal_weight.melt(id_vars='date', var_name='decile', value_name='return'),
    ff_factors[['date', 'mkt_excess_return']],
    on='date',
    how='inner'
)


In [None]:
# CAPM regression for each of the 10 equally-weighted decile portfolios
equal_weight_by_decile = portfolio_returns_with_factors.groupby('decile')
capm_results_equal_weight = equal_weight_by_decile.apply(
    lambda decile_rows: calculate_capm_alpha(
        decile_rows['return'],
        decile_rows['mkt_excess_return'],
    )
)

print("CAPM Results for Equally-Weighted Portfolios:")
print(capm_results_equal_weight)

In [None]:
# CAPM regression for each of the 10 value-weighted decile portfolios.
# First reshape to long format and attach the market excess return.
value_weight_long = monthly_portfolios_value_weight.melt(
    id_vars='date', var_name='decile', value_name='return'
)
portfolio_returns_with_factors_value_weight = value_weight_long.merge(
    ff_factors[['date', 'mkt_excess_return']],
    on='date',
    how='inner',
)

capm_results_value_weight = (
    portfolio_returns_with_factors_value_weight
    .groupby('decile')
    .apply(lambda decile_rows: calculate_capm_alpha(
        decile_rows['return'], decile_rows['mkt_excess_return']
    ))
)

print("CAPM Results for Value-Weighted Portfolios:")
print(capm_results_value_weight)

## **Step 13: CAPM Alpha and t-Statistics for Arbitrage**

In [None]:
# Attach the market excess return to each long-short return series
market_excess_by_date = ff_factors[['date', 'mkt_excess_return']]

arbitrage_portfolio_with_factors_equal_weight = (
    arbitrage_portfolio_equal_weight
    .rename('return')
    .reset_index()
    .merge(market_excess_by_date, on='date', how='inner')
)

arbitrage_portfolio_with_factors_value_weight = (
    arbitrage_portfolio_value_weight
    .rename('return')
    .reset_index()
    .merge(market_excess_by_date, on='date', how='inner')
)

# CAPM alpha of the equally-weighted long-short portfolio
equal_weight_arbitrage_returns = arbitrage_portfolio_with_factors_equal_weight['return']
equal_weight_market_returns = arbitrage_portfolio_with_factors_equal_weight['mkt_excess_return']
capm_arbitrage_equal_weight = calculate_capm_alpha(
    equal_weight_arbitrage_returns, equal_weight_market_returns
)

print("CAPM Results for Equally-Weighted Arbitrage Portfolio:")
print(capm_arbitrage_equal_weight)



In [None]:
# CAPM alpha of the value-weighted long-short portfolio
value_weight_arbitrage_returns = arbitrage_portfolio_with_factors_value_weight['return']
value_weight_market_returns = arbitrage_portfolio_with_factors_value_weight['mkt_excess_return']
capm_arbitrage_value_weight = calculate_capm_alpha(
    value_weight_arbitrage_returns, value_weight_market_returns
)

print("CAPM Results for Value-Weighted Arbitrage Portfolio:")
print(capm_arbitrage_value_weight)

## **Step 14: Report Results**

In [None]:
# Average raw return of each decile portfolio over time.
# By this point the monthly tables may carry 'date' as an ordinary column;
# drop it so the mean covers the decile return columns only and no averaged
# timestamp ends up as an extra row in the results table.

# Average raw returns for equally-weighted portfolios
average_raw_returns_equal_weight = (
    monthly_portfolios_equal_weight.drop(columns=['date'], errors='ignore').mean()
)

# Average raw returns for value-weighted portfolios
average_raw_returns_value_weight = (
    monthly_portfolios_value_weight.drop(columns=['date'], errors='ignore').mean()
)

print("Average Raw Returns for Equally-Weighted Portfolios:")
print(average_raw_returns_equal_weight)

print("Average Raw Returns for Value-Weighted Portfolios:")
print(average_raw_returns_value_weight)

In [None]:
# Side-by-side table per weighting scheme: average raw return next to the CAPM estimates
equal_weight_average_column = average_raw_returns_equal_weight.rename('Average Raw Return')
results_equal_weight = pd.concat(
    [equal_weight_average_column, capm_results_equal_weight], axis=1
)

value_weight_average_column = average_raw_returns_value_weight.rename('Average Raw Return')
results_value_weight = pd.concat(
    [value_weight_average_column, capm_results_value_weight], axis=1
)

print("Results for Equally-Weighted Portfolios:")
print(results_equal_weight)

print("Results for Value-Weighted Portfolios:")
print(results_value_weight)

# Write the decile tables to disk
results_equal_weight.to_csv("portfolio_results_equal_weight.csv")
results_value_weight.to_csv("portfolio_results_value_weight.csv")

# Write the CAPM estimates of the two arbitrage portfolios to disk
capm_arbitrage_equal_weight.to_csv("capm_arbitrage_equal_weight.csv")
capm_arbitrage_value_weight.to_csv("capm_arbitrage_value_weight.csv")

## **Step 15: Graphs and Supplementary Tables**

In [None]:
import matplotlib.pyplot as plt

# Time series of the ten equally-weighted decile portfolio returns
plt.figure(figsize=(12, 6))
equal_weight_dates = monthly_portfolios_equal_weight['date']
for decile in range(10):
    decile_returns = monthly_portfolios_equal_weight[decile]
    plt.plot(equal_weight_dates, decile_returns, label=f"Portfolio {decile}")
plt.title("Equally-Weighted Portfolio Returns")
plt.xlabel("Date")
plt.ylabel("Return")
plt.legend()
plt.show()


In [None]:
# Time series of the ten value-weighted decile portfolio returns
plt.figure(figsize=(12, 6))
value_weight_dates = monthly_portfolios_value_weight['date']
for decile in range(10):
    decile_returns = monthly_portfolios_value_weight[decile]
    plt.plot(value_weight_dates, decile_returns, label=f"Portfolio {decile}")
plt.title("Value-Weighted Portfolio Returns")
plt.xlabel("Date")
plt.ylabel("Return")
plt.legend()
plt.show()

In [None]:
# Distribution of the equally-weighted long-short returns (missing months excluded)
equal_weight_arbitrage_valid = arbitrage_portfolio_equal_weight.dropna()
plt.figure(figsize=(8, 6))
plt.hist(equal_weight_arbitrage_valid, bins=30, color='blue', alpha=0.7)
plt.title("Histogram of Equally-Weighted Arbitrage Portfolio Returns")
plt.xlabel("Return")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Distribution of the value-weighted long-short returns (missing months excluded)
value_weight_arbitrage_valid = arbitrage_portfolio_value_weight.dropna()
plt.figure(figsize=(8, 6))
plt.hist(value_weight_arbitrage_valid, bins=30, color='green', alpha=0.7)
plt.title("Histogram of Value-Weighted Arbitrage Portfolio Returns")
plt.xlabel("Return")
plt.ylabel("Frequency")
plt.show()