In [4]:
#Final CODE PARALLEL
import numpy as np
from numpy.linalg import eigh
import time
import matplotlib.pyplot as plt
import math
import pyspark
from pyspark import SparkContext
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix
from scipy import stats


def create_block_matrix_from_numpy(
    np_array: np.ndarray,
    row_block_count: int,
    col_block_count: int,
    sc
    ):
    """
    Create a BlockMatrix from a NumPy array.
    
    Args:
        np_array (np.ndarray): NumPy array to be converted
        row_block_count (int): Number of rows in each block
        col_block_count (int): Number of columns in each block
        sc: SparkContext for parallelization
        
    Returns:
        BlockMatrix: Distributed block matrix representation of input array
    """
    num_rows: int = 0
    num_cols: int = 0
    blocks: list = []
    blocks_rdd: pyspark.RDD = None
    block_dense_matrix: Matrices.dense = None
    
    num_rows, num_cols = np_array.shape

    for j in range(0, num_cols, col_block_count):
        for i in range(0, num_rows, row_block_count):
            block = np_array[i:i + row_block_count, j:j + col_block_count]
            block_dense_matrix = Matrices.dense(block.shape[0], block.shape[1], block.T.flatten())
            blocks.append(((i // row_block_count, j // col_block_count), block_dense_matrix))

    blocks_rdd = sc.parallelize(blocks)
    
    return BlockMatrix(blocks_rdd, row_block_count, col_block_count)
    
def svd(
    gram_mat: pyspark.mllib.linalg.distributed.BlockMatrix,
    row_block_count: int,
    col_block_count: int,
    sc
    ):
    """
    Perform Singular Value Decomposition (SVD) on a Gramian matrix.
    
    Args:
        gram_mat (BlockMatrix): The input Gramian matrix.
        row_block_count (int): Number of rows in each block
        col_block_count (int): Number of columns in each block
    
    Returns:
        tuple[BlockMatrix, BlockMatrix, BlockMatrix]: The U, Sigma-inverse, and V^T matrices as BlockMatrices.
    """
    matrix_size: int = 0
    tol: float = 0.0
    arr: np.ndarray = None
    eigval_V: np.ndarray = None
    eigvecs: np.ndarray = None
    idx: np.ndarray = None
    eigvals_inv: np.ndarray = None
    block_s_inv: BlockMatrix = None
    block_v: BlockMatrix = None
    
    tol = 1e-15

    # Compute and sort eigenvalues and eigenvectors with a numpy Array.
    arr = gram_mat.toLocalMatrix().toArray()
    eigval_V, eigvecs = np.linalg.eigh(arr)
    idx = eigval_V.argsort()[::-1]
    eigval_V = eigval_V[idx]
    eigvecs = eigvecs[:, idx]

    # Invert eigenvalues (set small ones to zero) and form a diagonal matrix.
    eigvals_inv = np.array([1/val if val > tol else 0 for val in eigval_V])
    eigvals_inv = np.diag(eigvals_inv)
    
    # Convert results to block matrices for Spark and transpose the eigenvectors.
    block_s_inv = create_block_matrix_from_numpy(eigvals_inv, int(math.ceil(eigvals_inv.shape[0]/n_workers)), int(math.ceil(eigvals_inv.shape[1]/n_workers)), sc)

    block_v = create_block_matrix_from_numpy(eigvecs, int(math.ceil(eigvecs.shape[0]/n_workers)), int(math.ceil(eigvecs.shape[1]/n_workers)), sc)
    
    return block_v.multiply(block_s_inv).multiply(block_v.transpose())

def betacalc(
    X: pyspark.mllib.linalg.distributed.BlockMatrix,
    Y: pyspark.mllib.linalg.distributed.BlockMatrix,
    row_block_count: int,
    col_block_count: int,
    sc
    ):
    """
    Calculate the beta values for the input matrices X and Y.
    
    Args:
        X (BlockMatrix): The input feature matrix.
        Y (BlockMatrix): The target values matrix.
        row_block_count (int): Number of rows in each block
        col_block_count (int): Number of columns in each block
    
    Returns:
        np.ndarray: The computed beta values.
    """
    XtX: BlockMatrix = None
    XtX_inv: BlockMatrix = None
    beta: BlockMatrix = None

    # Compute gram-matrix
    XtX = X.transpose().multiply(X)

    # Invert XtX using SVD.
    XtX_inv = svd(XtX, row_block_count, col_block_count, sc)

    # Compute and return betas
    beta = XtX_inv.multiply(X.transpose()).multiply(Y)
    return beta

def stat_values(
    X: pyspark.mllib.linalg.distributed.BlockMatrix,
    Y: pyspark.mllib.linalg.distributed.BlockMatrix,
    row_block_count: int,
    col_block_count: int,
    sc
):
    """
    Compute comprehensive linear model statistics
    Returns formatted string output
    
    Args:  
        X (BlockMatrix): The input feature matrix.
        Y (BlockMatrix): The target values matrix.
        row_block_count (int): Number of rows in each block
        col_block_count (int): Number of columns in each block
    Returns:
        str: Formatted string containing OLS statistics
    """
    
    # Calculate beta and convert to local
    
    beta = betacalc(X, Y, row_block_count, col_block_count, sc)
    fitted_values = X.multiply(beta)
    residuals = Y.subtract(fitted_values)
    
    fitted_local = fitted_values.toLocalMatrix().toArray()
    residuals_local = residuals.toLocalMatrix().toArray()
    beta_local = beta.toLocalMatrix().toArray()

    # Get local matrices for calculations
    X_local = X.toLocalMatrix().toArray()
    Y_local = Y.toLocalMatrix().toArray()
    
    # Basic dimensions
    n = X.numRows()
    p = X.numCols()
    degrees_of_freedom = n - p
    
    # Residual statistics
    rss = np.sum(residuals_local ** 2)  # Residual sum of squares
    mse = rss / degrees_of_freedom
    rmse = np.sqrt(mse)
    
    # Calculate XtX and its inverse for standard errors
    XtX = X.transpose().multiply(X)
    XtX_local = XtX.toLocalMatrix().toArray()
    XtX_inv = np.linalg.inv(XtX_local)
    
    # Standard errors and t-values
    beta_std_errors = np.sqrt(np.diag(XtX_inv) * mse)
    t_values = beta_local.flatten() / beta_std_errors
    
    # Calculate p-values for t-statistics
    p_values = 2 * (1 - stats.t.cdf(np.abs(t_values), degrees_of_freedom))
    
    # R-squared statistics
    y_mean = np.mean(Y_local)
    tss = np.sum((Y_local - y_mean) ** 2)  # Total sum of squares
    r_squared = 1 - (rss / tss)
    adj_r_squared = 1 - (1 - r_squared) * ((n - 1) / degrees_of_freedom)
    
    # F-statistic
    model_ss = tss - rss
    f_statistic = (model_ss / (p - 1)) / mse
    f_p_value = 1 - stats.f.cdf(f_statistic, p-1, degrees_of_freedom)
    
    # Residual analysis
    residuals_std = residuals_local / np.sqrt(mse)
    
    # Format output string similar to R's lm()
    output = []
    output.append("Call:")
    output.append("Linear Model Fit\n")
    
    output.append("Residuals:")
    residual_stats = {
        "Min": np.min(residuals_local),
        "1Q": np.percentile(residuals_local, 25),
        "Median": np.median(residuals_local),
        "3Q": np.percentile(residuals_local, 75),
        "Max": np.max(residuals_local)
    }
    output.append("".join(f"{k:>8}" for k in residual_stats.keys()))
    output.append("".join(f"{v:8.4f}" for v in residual_stats.values()))
    output.append("")
    
    # Coefficients table
    output.append("Coefficients:")
    output.append("              Estimate Std. Error t value Pr(>|t|)")
    for i in range(len(beta_local)):
        coef_line = f"Variable{i:2d} {beta_local[i][0]:9.4f} {beta_std_errors[i]:10.4f} {t_values[i]:7.3f} {p_values[i]:8.4f} "
        # Add significance stars like R
        if p_values[i] < 0.001:
            coef_line += "***"
        elif p_values[i] < 0.01:
            coef_line += "** "
        elif p_values[i] < 0.05:
            coef_line += "*  "
        elif p_values[i] < 0.1:
            coef_line += ".  "
        output.append(coef_line)
    output.append("---")
    output.append("Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n")
    
    # Model statistics
    output.append(f"Residual standard error: {np.sqrt(mse):.4f} on {degrees_of_freedom} degrees of freedom")
    output.append(f"Multiple R-squared: {r_squared:.4f}, Adjusted R-squared: {adj_r_squared:.4f}")
    output.append(f"F-statistic: {f_statistic:.2f} on {p-1} and {degrees_of_freedom} DF, p-value: {f_p_value:.4e}\n")
    
    # Additional diagnostic information
    output.append("Additional Statistics:")
    output.append(f"AIC: {n * np.log(rss/n) + 2*p:.4f}")
    output.append(f"BIC: {n * np.log(rss/n) + np.log(n)*p:.4f}")
    output.append(f"RMSE: {rmse:.4f}")
    
    return "\n".join(output)


def calculate_statistics(
    X: pyspark.mllib.linalg.distributed.BlockMatrix,
    Y: pyspark.mllib.linalg.distributed.BlockMatrix,
    row_block_count: int,
    col_block_count: int,
    sc
):
    """
    Compute comprehensive linear model statistics with statsmodels-style output
    
    Args:  
        X (BlockMatrix): The input feature matrix.
        Y (BlockMatrix): The target values matrix.
        row_block_count (int): Number of rows in each block
        col_block_count (int): Number of columns in each block
        sc: Spark context
    Returns:
        float: elapsed_time for calculating betas
        str: Formatted string containing OLS statistics in statsmodels style
    """
    
    # Calculate beta and convert to local
    elapsed_time = 0
    start_time = time.time()
    beta = betacalc(X, Y, row_block_count, col_block_count, sc)
    fitted_values = X.multiply(beta)
    residuals = Y.subtract(fitted_values)
    elapsed_time = elapsed_time + (time.time() - start_time)
    
    # Convert to local arrays for calculations
    fitted_local = fitted_values.toLocalMatrix().toArray()
    residuals_local = residuals.toLocalMatrix().toArray()
    beta_local = beta.toLocalMatrix().toArray()
    X_local = X.toLocalMatrix().toArray()
    Y_local = Y.toLocalMatrix().toArray()
    
    # Basic dimensions
    n = X.numRows()
    k = X.numCols()  
    df_model = k 
    df_resid = n - k
    
    # Residual statistics
    ssr = np.sum(residuals_local ** 2) 
    mse = ssr / df_resid
    
    # Calculate XtX and its inverse for standard errors
    XtX = X.transpose().multiply(X)
    XtX_local = XtX.toLocalMatrix().toArray()
    XtX_inv = np.linalg.inv(XtX_local)
    
    # Standard errors and t-values
    bse = np.sqrt(np.diag(XtX_inv) * mse)
    tvalues = beta_local.flatten() / bse
    pvalues = 2 * (1 - stats.t.cdf(np.abs(tvalues), df_resid))
    
    # Uncentered R-squared statistics
    tss_uncentered = np.sum(Y_local ** 2) 
    r2_uncentered = 1 - (ssr / tss_uncentered)
    adj_r2_uncentered = 1 - (1 - r2_uncentered) * (n / df_resid)
    
    # F-test (uncentered)
    ess = tss_uncentered - ssr  
    fvalue = (ess / df_model) / mse
    fpvalue = stats.f.sf(fvalue, df_model, df_resid)
    
    # Log-likelihood and information criteria
    llf = -n/2 * (1 + np.log(2*np.pi) + np.log(ssr/n))
    aic = -2 * llf + 2 * k
    bic = -2 * llf + np.log(n) * k
    
    # Additional diagnostics
    condition_number = np.sqrt(np.max(np.linalg.eigvals(XtX_local)) / 
                             np.min(np.linalg.eigvals(XtX_local)))
    
    # Residual diagnostics with corrected Jarque-Bera test
    residuals_standardized = residuals_local.flatten() / np.sqrt(mse)
    skew = stats.skew(residuals_standardized)
    kurtosis = stats.kurtosis(residuals_standardized, fisher=True)
    
    # Corrected Jarque-Bera calculation
    jb = n/6 * (skew**2 + (kurtosis**2)/4)
    jbpv = stats.chi2.sf(jb, df=2)  # Chi-square with 2 df
    
    # Omnibus test (D'Agostino K^2)
    k2, p_omnibus = stats.normaltest(residuals_standardized)
    
    # Durbin-Watson
    dw = np.sum(np.diff(residuals_local.flatten()) ** 2) / ssr
    
    # Create summary string similar to statsmodels
    summary = []
    summary.append("                            OLS Regression Results                            ")
    summary.append("===========================================================================")
    summary.append(f"Dep. Variable:                      y   R-squared (uncentered):     {r2_uncentered:>7.3f}")
    summary.append(f"Model:                            OLS   Adj. R-squared (uncentered):{adj_r2_uncentered:>7.3f}")
    summary.append(f"Method:                 Least Squares   F-statistic:                {fvalue:>7.3f}")
    summary.append(f"Date:                             now   Prob (F-statistic):         {fpvalue:>7.3f}")
    summary.append(f"Time:                             now   Log-Likelihood:             {llf:>7.3f}")
    summary.append(f"No. Observations:          {n:>10.0f}   AIC:                        {aic:>7.3f}")
    summary.append(f"Df Residuals:              {df_resid:>10.0f}   BIC:                        {bic:>7.3f}")
    summary.append(f"Df Model:                  {df_model:>10.0f}")
    summary.append("===========================================================================")
    summary.append("                coef    std err          t      P>|t|     [0.025     0.975]")
    summary.append("---------------------------------------------------------------------------")
    
    # Parameter estimates with confidence intervals
    conf_int = np.column_stack((
        beta_local.flatten() - stats.t.ppf(0.975, df_resid) * bse,
        beta_local.flatten() + stats.t.ppf(0.975, df_resid) * bse
    ))
    
    for i in range(k):
        summary.append(f"x{i:<8.0f} {beta_local[i][0]:>10.3f} {bse[i]:>10.3f} {tvalues[i]:>10.3f} {pvalues[i]:>10.3f} {conf_int[i,0]:>10.3f} {conf_int[i,1]:>10.3f}")
    
    summary.append("===========================================================================")
    summary.append(f"Omnibus:                   {k2:>10.3f}   Durbin-Watson:           {dw:>10.3f}")
    summary.append(f"Prob(Omnibus):             {p_omnibus:>10.3f}   Jarque-Bera (JB):        {jb:>10.3f}")
    summary.append(f"Skew:                      {skew:>10.3f}   Prob(JB):                {jbpv:>10.3f}")
    summary.append(f"Kurtosis:                  {kurtosis+3:>10.3f}   Cond. No.                {condition_number:>10.3f}")
    summary.append("===========================================================================")
    
    return elapsed_time, "\n".join(summary)

def run_performance_tests(
    n_features_list: list,
    n_rows: int,
    n_times: int,
    n_workers: int,
    sc):
    """
    Run performance tests and optionally visualize results
    
    Args:
        n_features_list (list): List of feature sizes to test
        n_rows (int): Number of rows in the matrices
        n_times (int): Number of test iterations per feature size
        n_workers (int): Number of Spark workers
        sc: Spark context
        
    Returns:
        tuple: Lists of elapsed_times, avg_elapsed_times, times_per_feature
    """
    elapsed_times = []
    avg_elapsed_times = []
    times_per_feature = []
    
    # Run performance tests
    for n_features in n_features_list:
        row_block_count = math.ceil(n_rows/n_workers)
        col_block_count = math.ceil(n_features/n_workers)
        matX, matY = create_random_block_matrix_and_vector(
            n_rows, n_features, row_block_count, col_block_count, sc
        )
        elapsed_time = 0
        
        for a in range(n_times):
            elapsed_time, stat = calculate_statistics(
                matX, matY, row_block_count, col_block_count, sc
            )
            print(stat)
            
        avg_elapsed_time = elapsed_time / n_times
        time_per_feature = avg_elapsed_time / n_features
        
        elapsed_times.append(elapsed_time)
        avg_elapsed_times.append(avg_elapsed_time)
        times_per_feature.append(time_per_feature)
        
        print(f"Features: {n_features}, Gesamtzeit: {elapsed_time:.5f} Sekunden")
        print(f"Features: {n_features}, durchschnittliche Zeit: {avg_elapsed_time:.5f} Sekunden")
        print(f"Features: {n_features}, durchschnittliche Zeit pro feature: {time_per_feature:.5f} Sekunden")
    
    # Visualization
    # Plot 1: Average Time
    fig2, ax2 = plt.subplots(figsize=(12, 6))
    line2 = ax2.plot(n_features_list, avg_elapsed_times, 'r-', marker='s', 
                    label='Durchschnittliche Zeit')
    ax2.set_xlabel('Anzahl der Features')
    ax2.set_ylabel('Durchschnittliche Zeit (Sekunden)', color='r')
    plt.title('Laufzeitanalyse: Durchschnittliche Zeit', fontsize=16)
    ax2.legend(loc='upper left')
    ax2.grid(True, linestyle='--', alpha=0.7)
    
    for x, y in zip(n_features_list, avg_elapsed_times):
        ax2.annotate(f'{y:.2f}s', (x, y), textcoords="offset points", 
                    xytext=(0,10), ha='center', color='r')
    
    plt.tight_layout()
    plt.show()
    
    # Plot 2: Time per Feature
    fig3, ax3 = plt.subplots(figsize=(12, 6))
    line3 = ax3.plot(n_features_list, times_per_feature, 'g-', marker='s', 
                    label='Zeit pro Feature')
    ax3.set_xlabel('Anzahl der Features')
    ax3.set_ylabel('Zeit pro Feature (Sekunden)', color='g')
    plt.title('Laufzeitanalyse: Zeit pro Feature', fontsize=16)
    ax3.legend(loc='upper right')
    ax3.grid(True, linestyle='--', alpha=0.7)
    
    for x, y in zip(n_features_list, times_per_feature):
        ax3.annotate(f'{y:.3f}s', (x, y), textcoords="offset points", 
                    xytext=(0,10), ha='center', color='g')
    
    plt.tight_layout()
    plt.show()
    
    return elapsed_times, avg_elapsed_times, times_per_feature

def create_random_block_matrix_and_vector(n_rows: int, n_cols: int, row_block_count: int, col_block_count: int, sc, correlation=0.8):
    """
    Erzeugt zufällige BlockMatrizen für Tests mit korrelierten Daten.
    
    Argumente:
        n_rows (int): Anzahl der Zeilen in der Matrix.
        n_cols (int): Anzahl der Spalten in der Matrix.
        row_block_count (int): Anzahl der Zeilen in jedem Block.
        col_block_count (int): Anzahl der Spalten in jedem Block.
        sc: SparkContext für die Parallelisierung.
        correlation (float): Korrelationsfaktor zwischen X und Y.
        
    Rückgabe:
        tuple[BlockMatrix, BlockMatrix]: Korrelierte Merkmalsmatrix X und Zielvektor Y als BlockMatrizen,
        wobei X die Form (n_rows, n_cols) und Y die Form (n_rows, 1) hat.
    """ 
    # Generiere zufällige Matrix X, fülle spaltenweise und transponiere dann zurück
    X_transposed = np.random.rand(n_cols, n_rows)  # Erstellen als transponierte Matrix
    X = X_transposed.T  # Zurück transponieren, um die korrekte Form (n_rows, n_cols) zu erhalten
    
    # Erzeuge korrelierte Matrix Y basierend auf X und dem angegebenen Korrelationsfaktor
    noise = np.random.rand(n_rows, 1) * (1 - correlation)  # Rauschen hinzufügen 
    Y = correlation * X.mean(axis=1).reshape(-1, 1) + noise

    # Konvertiere beide in BlockMatrizen
    X_block_matrix = create_block_matrix_from_numpy(X, row_block_count, col_block_count, sc)
    Y_block_matrix = create_block_matrix_from_numpy(Y, row_block_count, 1, sc)
    
    return X_block_matrix, Y_block_matrix

In [10]:
from pyspark.sql import SparkSession

#Hyperparameter
n_features_list = [5, 8, 10,20,30,40,50,60,100,200,300,400,500,600,700]
n_rows = max(n_features_list) * 30
n_times = 1
n_workers = 4
    # Create Sparksession
spark = SparkSession.builder \
    .appName("Optimized-Cluster-Based-SVD-Regression") \
    .master("spark://spark-master:7077") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.default.parallelism", n_workers * 2) \
    .config("spark.sql.shuffle.partitions", n_workers * 2) \
    .config("spark.storage.memoryFraction", 0.8) \
    .config("spark.memory.fraction", 0.8) \
    .config("spark.executor.cores", 4) \
    .config("spark.task.cpus", 1) \
    .config("spark.rdd.compress", True) \
    .config("spark.broadcast.compress", True) \
    .config("spark.shuffle.compress", True) \
    .getOrCreate()

sc = spark.sparkContext

# run Performance Test
elapsed_times, avg_times, times_per_feature = run_performance_tests(
    n_features_list=n_features_list,
    n_rows=n_rows,
    n_times=n_times,
    n_workers=n_workers,
    sc=sc,
)

                                                                                

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.985
Model:                            OLS   Adj. R-squared (uncentered):  0.985
Method:                 Least Squares   F-statistic:                277320.357
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             28325.478
No. Observations:               21000   AIC:                        -56640.955
Df Residuals:                   20995   BIC:                        -56601.194
Df Model:                           5
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.199      0.001    146.905      0.000      0.196      0.201
x1             0.198      0.001    146.578      0.000      0.195      0.200
x2             0.197      0.001    1

                                                                                

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.986
Model:                            OLS   Adj. R-squared (uncentered):  0.986
Method:                 Least Squares   F-statistic:                181441.304
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             28982.725
No. Observations:               21000   AIC:                        -57949.451
Df Residuals:                   20992   BIC:                        -57885.833
Df Model:                           8
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.124      0.001     91.636      0.000      0.122      0.127
x1             0.125      0.001     91.786      0.000      0.122      0.127
x2             0.124      0.001     

                                                                                

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.986
Model:                            OLS   Adj. R-squared (uncentered):  0.986
Method:                 Least Squares   F-statistic:                145325.454
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             29059.564
No. Observations:               21000   AIC:                        -58099.128
Df Residuals:                   20990   BIC:                        -58019.606
Df Model:                          10
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.099      0.001     72.016      0.000      0.096      0.102
x1             0.100      0.001     72.626      0.000      0.097      0.103
x2             0.098      0.001     

                                                                                

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.986
Model:                            OLS   Adj. R-squared (uncentered):  0.986
Method:                 Least Squares   F-statistic:                76367.400
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             29707.210
No. Observations:               21000   AIC:                        -59374.421
Df Residuals:                   20980   BIC:                        -59215.375
Df Model:                          20
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.050      0.001     36.812      0.000      0.048      0.053
x1             0.052      0.001     37.543      0.000      0.049      0.055
x2             0.051      0.001     3

24/10/25 17:44:16 WARN TaskSetManager: Stage 5256 contains a task of very large size (1321 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:16 WARN TaskSetManager: Stage 5257 contains a task of very large size (1321 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:18 WARN TaskSetManager: Stage 5260 contains a task of very large size (1321 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:18 WARN TaskSetManager: Stage 5261 contains a task of very large size (1321 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:18 WARN TaskSetManager: Stage 5262 contains a task of very large size (1321 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:18 WARN TaskSetManager: Stage 5263 contains a task of very large size (1321 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:21 WARN TaskSetManager: Stage 5284 contains a task of very large size (1321 KiB). The maximum recommended task size is 10

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                51549.630
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             29834.390
No. Observations:               21000   AIC:                        -59608.780
Df Residuals:                   20970   BIC:                        -59370.212
Df Model:                          30
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.033      0.001     23.843      0.000      0.030      0.036
x1             0.035      0.001     25.102      0.000      0.032      0.037
x2             0.032      0.001     2

24/10/25 17:44:24 WARN TaskSetManager: Stage 5364 contains a task of very large size (1649 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:25 WARN TaskSetManager: Stage 5365 contains a task of very large size (1649 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:26 WARN TaskSetManager: Stage 5368 contains a task of very large size (1649 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:27 WARN TaskSetManager: Stage 5369 contains a task of very large size (1649 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:27 WARN TaskSetManager: Stage 5370 contains a task of very large size (1649 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:27 WARN TaskSetManager: Stage 5371 contains a task of very large size (1649 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:30 WARN TaskSetManager: Stage 5392 contains a task of very large size (1649 KiB). The maximum recommended task size is 10

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                38423.034
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             29804.993
No. Observations:               21000   AIC:                        -59529.985
Df Residuals:                   20960   BIC:                        -59211.894
Df Model:                          40
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.026      0.001     18.912      0.000      0.023      0.029
x1             0.024      0.001     17.436      0.000      0.021      0.027
x2             0.025      0.001     1

24/10/25 17:44:33 WARN TaskSetManager: Stage 5472 contains a task of very large size (2141 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:33 WARN TaskSetManager: Stage 5473 contains a task of very large size (2141 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:35 WARN TaskSetManager: Stage 5476 contains a task of very large size (2141 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:35 WARN TaskSetManager: Stage 5477 contains a task of very large size (2141 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:35 WARN TaskSetManager: Stage 5478 contains a task of very large size (2141 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:36 WARN TaskSetManager: Stage 5479 contains a task of very large size (2141 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:39 WARN TaskSetManager: Stage 5500 contains a task of very large size (2141 KiB). The maximum recommended task size is 10

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                31225.178
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             29970.408
No. Observations:               21000   AIC:                        -59840.816
Df Residuals:                   20950   BIC:                        -59443.202
Df Model:                          50
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.018      0.001     12.722      0.000      0.015      0.020
x1             0.021      0.001     15.299      0.000      0.018      0.024
x2             0.020      0.001     1

24/10/25 17:44:43 WARN TaskSetManager: Stage 5581 contains a task of very large size (2469 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:44 WARN TaskSetManager: Stage 5584 contains a task of very large size (2469 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:45 WARN TaskSetManager: Stage 5585 contains a task of very large size (2469 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:45 WARN TaskSetManager: Stage 5586 contains a task of very large size (2469 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:45 WARN TaskSetManager: Stage 5587 contains a task of very large size (2469 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:48 WARN TaskSetManager: Stage 5608 contains a task of very large size (2469 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:49 WARN TaskSetManager: Stage 5615 contains a task of very large size (2469 KiB). The maximum recommended task size is 10

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                25873.927
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             29953.728
No. Observations:               21000   AIC:                        -59787.455
Df Residuals:                   20940   BIC:                        -59310.319
Df Model:                          60
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.015      0.001     11.237      0.000      0.013      0.018
x1             0.017      0.001     12.624      0.000      0.015      0.020
x2             0.017      0.001     1

24/10/25 17:44:52 WARN TaskSetManager: Stage 5688 contains a task of very large size (4110 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:52 WARN TaskSetManager: Stage 5689 contains a task of very large size (4110 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:55 WARN TaskSetManager: Stage 5692 contains a task of very large size (4110 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:55 WARN TaskSetManager: Stage 5693 contains a task of very large size (4110 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:55 WARN TaskSetManager: Stage 5694 contains a task of very large size (4110 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:56 WARN TaskSetManager: Stage 5695 contains a task of very large size (4110 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:44:59 WARN TaskSetManager: Stage 5716 contains a task of very large size (4110 KiB). The maximum recommended task size is 10

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                15661.033
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             30040.351
No. Observations:               21000   AIC:                        -59880.702
Df Residuals:                   20900   BIC:                        -59085.474
Df Model:                         100
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.011      0.001      7.630      0.000      0.008      0.013
x1             0.009      0.001      6.255      0.000      0.006      0.011
x2             0.011      0.001      

24/10/25 17:45:07 WARN TaskSetManager: Stage 5796 contains a task of very large size (8212 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:45:07 WARN TaskSetManager: Stage 5797 contains a task of very large size (8212 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:45:12 WARN TaskSetManager: Stage 5800 contains a task of very large size (8212 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:45:15 WARN TaskSetManager: Stage 5801 contains a task of very large size (8212 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:45:15 WARN TaskSetManager: Stage 5802 contains a task of very large size (8212 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:45:18 WARN TaskSetManager: Stage 5803 contains a task of very large size (8212 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:45:25 WARN TaskSetManager: Stage 5824 contains a task of very large size (8212 KiB). The maximum recommended task size is 10

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                7849.986
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             30123.183
No. Observations:               21000   AIC:                        -59846.366
Df Residuals:                   20800   BIC:                        -58255.911
Df Model:                         200
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.007      0.001      4.710      0.000      0.004      0.009
x1             0.006      0.001      4.358      0.000      0.003      0.009
x2             0.005      0.001      3

24/10/25 17:45:53 WARN TaskSetManager: Stage 5904 contains a task of very large size (12313 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:45:53 WARN TaskSetManager: Stage 5905 contains a task of very large size (12313 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:46:01 WARN TaskSetManager: Stage 5908 contains a task of very large size (12313 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:46:07 WARN TaskSetManager: Stage 5909 contains a task of very large size (12313 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:46:07 WARN TaskSetManager: Stage 5910 contains a task of very large size (12313 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:46:13 WARN TaskSetManager: Stage 5911 contains a task of very large size (12313 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:46:23 WARN TaskSetManager: Stage 5932 contains a task of very large size (12313 KiB). The maximum recommended task siz

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                5206.332
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             30173.600
No. Observations:               21000   AIC:                        -59747.199
Df Residuals:                   20700   BIC:                        -57361.516
Df Model:                         300
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.002      0.001      1.320      0.187     -0.001      0.005
x1             0.003      0.001      1.856      0.063     -0.000      0.005
x2             0.004      0.001      2

24/10/25 17:47:15 WARN TaskSetManager: Stage 6012 contains a task of very large size (16415 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:47:15 WARN TaskSetManager: Stage 6013 contains a task of very large size (16415 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:47:27 WARN TaskSetManager: Stage 6016 contains a task of very large size (16415 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:47:36 WARN TaskSetManager: Stage 6017 contains a task of very large size (16415 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:47:36 WARN TaskSetManager: Stage 6018 contains a task of very large size (16415 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:47:45 WARN TaskSetManager: Stage 6019 contains a task of very large size (16415 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:48:01 WARN TaskSetManager: Stage 6040 contains a task of very large size (16415 KiB). The maximum recommended task siz

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                3933.372
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             30287.231
No. Observations:               21000   AIC:                        -59774.463
Df Residuals:                   20600   BIC:                        -56593.552
Df Model:                         400
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.006      0.001      4.281      0.000      0.003      0.009
x1             0.001      0.001      0.783      0.434     -0.002      0.004
x2             0.004      0.001      3

24/10/25 17:49:16 WARN TaskSetManager: Stage 6120 contains a task of very large size (20516 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:49:16 WARN TaskSetManager: Stage 6121 contains a task of very large size (20516 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:49:31 WARN TaskSetManager: Stage 6124 contains a task of very large size (20516 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:49:45 WARN TaskSetManager: Stage 6125 contains a task of very large size (20516 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:49:46 WARN TaskSetManager: Stage 6126 contains a task of very large size (20516 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:49:58 WARN TaskSetManager: Stage 6127 contains a task of very large size (20516 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:50:18 WARN TaskSetManager: Stage 6148 contains a task of very large size (20516 KiB). The maximum recommended task siz

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                3141.409
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             30312.525
No. Observations:               21000   AIC:                        -59625.050
Df Residuals:                   20500   BIC:                        -55648.911
Df Model:                         500
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.001      0.001      0.674      0.500     -0.002      0.004
x1             0.002      0.001      1.482      0.138     -0.001      0.005
x2             0.002      0.001      1

24/10/25 17:52:09 WARN TaskSetManager: Stage 6228 contains a task of very large size (24618 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:52:10 WARN TaskSetManager: Stage 6229 contains a task of very large size (24618 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:52:31 WARN TaskSetManager: Stage 6232 contains a task of very large size (24618 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:52:50 WARN TaskSetManager: Stage 6233 contains a task of very large size (24618 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:52:51 WARN TaskSetManager: Stage 6234 contains a task of very large size (24618 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:53:09 WARN TaskSetManager: Stage 6235 contains a task of very large size (24618 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:53:37 WARN TaskSetManager: Stage 6256 contains a task of very large size (24618 KiB). The maximum recommended task siz

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared (uncentered):       0.987
Model:                            OLS   Adj. R-squared (uncentered):  0.987
Method:                 Least Squares   F-statistic:                2611.110
Date:                             now   Prob (F-statistic):           0.000
Time:                             now   Log-Likelihood:             30338.753
No. Observations:               21000   AIC:                        -59477.505
Df Residuals:                   20400   BIC:                        -54706.138
Df Model:                         600
                coef    std err          t      P>|t|     [0.025     0.975]
---------------------------------------------------------------------------
x0             0.001      0.001      0.597      0.551     -0.002      0.004
x1             0.002      0.001      1.625      0.104     -0.000      0.005
x2             0.005      0.001      3

24/10/25 17:56:06 WARN TaskSetManager: Stage 6336 contains a task of very large size (28719 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:56:06 WARN TaskSetManager: Stage 6337 contains a task of very large size (28719 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:56:32 WARN TaskSetManager: Stage 6340 contains a task of very large size (28719 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:56:57 WARN TaskSetManager: Stage 6341 contains a task of very large size (28719 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:56:58 WARN TaskSetManager: Stage 6342 contains a task of very large size (28719 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:57:21 WARN TaskSetManager: Stage 6343 contains a task of very large size (28719 KiB). The maximum recommended task size is 1000 KiB.
24/10/25 17:57:53 WARN TaskSetManager: Stage 6364 contains a task of very large size (28719 KiB). The maximum recommended task siz

Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.dumps.
: java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3537)
	at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:100)
	at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:111)
	at org.apache.spark.mllib.api.python.SerDe$DenseMatrixPickler.saveState(PythonMLLibAPI.scala:1429)
	at org.apache.spark.mllib.api.python.SerDeBase$BasePickler.pickle(PythonMLLibAPI.scala:1284)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:297)
	at net.razorvine.pickle.Pickler.save(Pickler.java:185)
	at net.razorvine.pickle.Pickler.dump(Pickler.java:155)
	at net.razorvine.pickle.Pickler.dumps(Pickler.java:140)
	at org.apache.spark.mllib.api.python.SerDeBase.dumps(PythonMLLibAPI.scala:1321)
	at org.apache.spark.mllib.api.python.SerDe.dumps(PythonMLLibAPI.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor47.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
