In [2]:
import os
import sys

# Set the environment variable for PySpark to the current path of the Python interpreter (sys.executable)

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
# Import the required libraries and modules

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix
from pyspark.ml.regression import LinearRegression
from pyspark.ml.functions import array_to_vector
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.mllib.linalg import DenseMatrix
import time
import numpy as np
import seaborn as sns
import pandas as pd
from scipy.linalg import lu, lu_factor, lu_solve # is used for LU decomposition

# Initialize

In [4]:
# Initialization and configuration of the Spark session.

# Set the master node for Spark (running locally with 4 cores on a Spark standalone cluster).
# Using .appName() to set the name for the application, which will be displayed on the user interface.
# With .getOrCreate() an existing Spark session is retrieved, if none exists, one is created
spark = SparkSession.builder \
                    .master('local[4]') \
                    .appName('DBiBD') \
                    .getOrCreate()

In [5]:
# number of rows/samples
n=1000

# number of colums/features
k=3

In [6]:
# Generate random coefficients (betas)
betas=(np.random.rand(k)-0.5)*20
betas

array([-0.97255868, -4.48318715,  0.17640443])

In [7]:
# Generate random covariances (cov)
cov=(np.random.rand(k)-0.5)*20
cov

array([-1.41538752,  7.57379063, -1.66947667])

# generate dataset

In [8]:
# Generate an RDD with n vectors, each containing k entries, where each entry is generated from a standard-normal distribution
# With spark.sparkContext the interface to the Spark cluster is realized. This allows to execute all necessary operations on the cluster
data = RandomRDDs.normalVectorRDD(spark.sparkContext, n, k)

In [9]:
# Applying this function yields an rdd where the first element is a vector with a moderate covariance structure and some added noise 
# for a more realistic setting, while the second element is the target variable.
def createRow(noise):
    x=[]
    x.append(noise[0])
    for i in range(1,len(noise)):
        x.append((x[0]*cov[i])+noise[i])
    x= [float(a) for a in x]
    
    y=0
    for i in range(0,len(x)):
        y+=x[i]*betas[i]
    
    return x,float(y)
    

In [10]:
# Application of the createRow function to each element of the RDD (data)
data=data.map(createRow)

## Covariance

In [11]:
# Create a RowMatrix from the vectors of the RDD (data).
# The RowMatrix offers the advantage that it is optimized for the application of operations from the linear algebra. With it an efficient
# implementation of the calculations can be ensured without the need for additional effort to bring the data into the required format.
dataMatrix=RowMatrix(data.map(lambda x : x[0]))

# Calculation of covariance matrix and conversion to array
dataMatrix.computeCovariance().toArray()

array([[  0.97927137,   7.44326431,  -1.62914619],
       [  7.44326431,  57.58214759, -12.40458928],
       [ -1.62914619, -12.40458928,   3.68981865]])

In [12]:
# A schema is used to define the structure of a data frame.
# The data frame contains the columns features and y.
# The features column consists of an array of floats per row. It shows the values of the independent variables
# The y column has a single float value per row and describes the dependent variable.
schema = StructType([       
    StructField('features', ArrayType(FloatType()), True),
    StructField('y', FloatType(), True)
])

# Create the data frame df using the schema defined at the beginning
dataDF=data.toDF(schema=schema)
# Display of the data frame
dataDF.show(truncate=False)

+--------------------------------------+----------+
|features                              |y         |
+--------------------------------------+----------+
|[1.2034985, 8.455216, -2.79036]       |-39.569023|
|[0.9264144, 8.393093, -1.9257588]     |-38.86851 |
|[1.3961579, 10.412863, -4.0280867]    |-48.751232|
|[-1.0093409, -8.578743, -0.31200716]  |39.386715 |
|[0.91097784, 9.545054, -0.86367697]   |-43.8306  |
|[0.14880404, 1.4307998, -0.19924782]  |-6.5944123|
|[0.030753141, 0.07893534, -1.1931145] |-0.5942618|
|[-1.4463581, -10.559787, 2.5892231]   |49.204918 |
|[0.034043737, -0.39865318, 0.08931492]|1.7698828 |
|[-1.0986181, -9.4023695, 2.0095234]   |43.575542 |
|[-0.9096731, -6.70736, 2.3814182]     |31.375153 |
|[0.7321585, 4.9095526, -1.5275366]    |-22.991976|
|[0.109175295, 0.95224166, 0.84459144] |-4.2262673|
|[-0.1518071, -2.6367342, 1.4115095]   |12.217611 |
|[0.28778645, 0.6972642, -0.7015823]   |-3.5296173|
|[0.9538665, 6.452496, -2.0569303]     |-30.21829 |
|[0.7995248,

# PySpark Linear Regression

In [12]:
# Start the timer
start_time = time.time()

# Create a LinearRegression instance to perform the linear regression
lr = LinearRegression(featuresCol="features", labelCol="y", predictionCol="pred_y")

# Calculate the linear model. To use the method correctly, column features is converted to a vector format
lr_model = lr.fit(dataDF.withColumn("features",array_to_vector('features')))

# Finally, the results of the OLS estimation, the true betas and the total execution time are output
print("PySpark OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t",betas.round(6))
print("predicted values:\t",lr_model.coefficients.round(6))

PySpark OLS: 8.726112604141235 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
predicted values:	 [ 6.677466 -6.681536  6.642338]


# QR

In [13]:
# Start the timer
start_time = time.time()

# Execute the QR decomposition on the dataMatrix using the tallSkinnyQR method by calculating both matrices Q and R (True).
QR=dataMatrix.tallSkinnyQR(True)

# Output of the Q-matrix, the number of rows and columns 
print(QR.Q, QR.Q.numRows(),"x",QR.Q.numCols())

# Print the R matrix.
print(QR.R)

<pyspark.mllib.linalg.distributed.RowMatrix object at 0x0000018F3194ECD0> 1000 x 3
DenseMatrix([[-32.91823191, 170.68436231, 296.17565899],
             [  0.        ,  30.89025415,  -0.96603129],
             [  0.        ,   0.        , -31.06256769]])


In [14]:
# Conversion of the R matrix into a Numpy matrix
R=np.asmatrix(QR.R.toArray())

# Calculation of the inverse of R
R_inv=np.linalg.inv(R)

# Output of the inverse
R_inv

matrix([[-0.0303783 ,  0.16785557, -0.29487155],
        [ 0.        ,  0.03237267, -0.00100677],
        [-0.        , -0.        , -0.03219309]])

In [15]:
# Creating a CoordinateMatrix (cm) with the Q matrix (QR.Q) via "MatrixEntry" objects
cm = CoordinateMatrix(
    QR.Q.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]
    )
)
# Build the transposed Q-matrix (Q_T) using the CoordinateMatrix (cm) and convert it to a RowMatrix (toRowMatrix())
Q_T=cm.transpose().toRowMatrix()

In [16]:
# The DenseMatrix (y) is built with the parameters n, 1 and the values from the one-dimensional vector.
# The one-dimensional vector is created from the "y" column of the data frame dataDF by converting it to a Numpy array.

y=DenseMatrix(n,1,dataDF.select("y").toPandas().to_numpy().ravel())

## Results

In [18]:
# Performing the matrix multiplications

# Q_T and y are multiplied with the multiply function
# With rows.collect() the calculations of the Spark driver nodes are returned to a local data structure, in this case a Python list
step1 = Q_T.multiply(y).rows.collect() 

# Convert step1 to a numpy array
step1 = np.array(step1)  

# Calculate the betas
step2 = np.matmul(R_inv, step1) 

# Finally, the results of the OLS estimation, the true betas and the total execution time are output
print("QR OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t", betas.round(6))
print("QR predicted values:\t", step2.ravel().round(6)[0])

QR OLS: 28.485340356826782 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
QR predicted values:	 [ 6.677466 -6.681537  6.642338]


# SVD

In [19]:
# Start the timer
start_time = time.time()

# Compute the singular value decomposition (SVD) of the data matrix dataMatrix with k singular values.
# With computeU = True the both matrices U and V are calculated.
svd=dataMatrix.computeSVD(k,computeU=True)

# Output of the singular values
print(svd.s)

# Output of the matrix V
print(svd.V)

# Output of the matrix U and the properties if it´s
print(svd.U, svd.U.numRows(),"x",svd.U.numCols())

[344.76853943586775,31.351581604986123,2.9221886574357034]
DenseMatrix([[-0.09510553, -0.00195771, -0.99546527],
             [ 0.49690419,  0.86641087, -0.04917753],
             [ 0.86257821, -0.49932792, -0.08142766]])
<pyspark.mllib.linalg.distributed.RowMatrix object at 0x0000018F33AD3D90> 1000 x 3


In [20]:
# Creating a CoordinateMatrix (cm) with the U-matrix (svd.U) via "MatrixEntry" objects
cm = CoordinateMatrix(
    svd.U.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]
    )
)
# Build the transposed U-matrix (U_T) using the CoordinateMatrix (cm) and convert it to a RowMatrix (toRowMatrix())
U_T=cm.transpose().toRowMatrix()

## Results

In [22]:
# U_T and y are multiplied with the multiply function
# With rows.collect() the calculations of the Spark driver nodes are returned to a local data structure, in this case a Python list
step1=U_T.multiply(y).rows.collect()

# Calculate the OLS estimator according to calculation rule
step2=(np.array(step1).ravel()/svd.s)
v=np.matrix(svd.V.toArray())
SVD_coeeffs=(v @ step2).ravel()

# Finally, the results of the OLS estimation, the true betas and the total execution time are output
print("SVD OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t",betas.round(6))
print("SVD predicted values:\t",SVD_coeeffs.round(6)[0])

SVD OLS: 24.881237030029297 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
SVD predicted values:	 [ 6.677466 -6.681537  6.642338]


# LU
als Data Paralleism Ansatz da keine LU Funktion in PySpark enthalten

In [13]:
# Start the timer
start_time = time.time()

In [14]:
def luSpark(part):
    # Die Größe der Partition ermitteln
    partition_n = len(part)
    
    # Array Features in nparray schreiben damit die struktur mit .T transponiert werden kann
    lufeatures = np.array([b for b in part["features"].to_numpy()])
    ypanda = part["y"].to_numpy()

    # Durchführen der LU-Composition
    LUtemp, pivtemp = lu_factor(lufeatures.T @ lufeatures)
    lubetas = lu_solve((LUtemp, pivtemp), lufeatures.T @ ypanda)
    
    # Eine DataFrame mit den geschätzten Betas und der Anzahl der Beobachtungen erstellen
    return pd.DataFrame({"betas": [lubetas], "sampleCounts": [partition_n]})

In [15]:
# Define the scheme
schemaUDF=StructType([
    StructField("betas",ArrayType(FloatType())),
    StructField("sampleCounts",IntegerType())
])

In [16]:
# Apply the "luSpark" function to the partition groups of the "dataDF" DataFrame
# Calculate and average the coefficients for each partition
LU_res = dataDF.groupBy(F.spark_partition_id()).applyInPandas(luSpark,schema=schemaUDF)
LU_res.show(truncate=False)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Users\Casimir\AppData\Local\Temp\ipykernel_13532\3263207875.py", line 11, in luSpark
  File "c:\Users\Casimir\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\linalg\_decomp_lu.py", line 140, in lu_solve
    raise ValueError("Shapes of lu {} and b {} are incompatible"
ValueError: Shapes of lu (3, 3) and b (1, 3) are incompatible


In [None]:
# calculate weighted average of estimated betas for all partitions
LU_betas = pd.DataFrame(LU_res.rdd.map(lambda x : [(x["sampleCounts"]/n) * xi for xi in x["betas"]]).collect()).sum().to_numpy()

In [None]:
# Finally, the results of the OLS estimation, the true betas and the total execution time are output
print("LU OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t",betas.round(6))
print("LU predicted values :\t",LU_betas.round(6))

LU OLS: 32.74242973327637 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
LU predicted values :	 [ 6.677938 -6.681465  6.642348]


In [None]:
# End Spark Session
spark.stop()