In [2]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.mllib.random import RandomRDDs
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix
from pyspark.ml.regression import LinearRegression
from pyspark.ml.functions import array_to_vector
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.mllib.linalg import DenseMatrix
import time
import numpy as np
import seaborn as sns
import pandas as pd
from scipy.linalg import lu, lu_factor, lu_solve # wird für LU-Decomposition genutzt 

# Initialize

In [4]:
n_jobs=1
spark = SparkSession.builder \
                    .master('local[4]') \
                    .appName('DBiBD') \
                    .getOrCreate()

In [5]:
n=1000
k=3

In [6]:
betas=(np.random.rand(k)-0.5)*20
betas

array([-0.97255868, -4.48318715,  0.17640443])

In [7]:
cov=(np.random.rand(k)-0.5)*20
cov

array([-1.41538752,  7.57379063, -1.66947667])

# generate dataset

In [8]:
data = RandomRDDs.normalVectorRDD(spark.sparkContext, n, k)

In [9]:
def createRow(noise):
    x=[]
    x.append(noise[0])
    for i in range(1,len(noise)):
        x.append((x[0]*cov[i])+noise[i])
    x= [float(a) for a in x]
    
    y=0
    for i in range(0,len(x)):
        y+=x[i]*betas[i]
    
    return x,float(y)
    

In [10]:
data=data.map(createRow)

## Covariance

In [11]:
dataMatrix=RowMatrix(data.map(lambda x : x[0]))
dataMatrix.computeCovariance().toArray()

array([[  0.97927137,   7.44326431,  -1.62914619],
       [  7.44326431,  57.58214759, -12.40458928],
       [ -1.62914619, -12.40458928,   3.68981865]])

In [12]:
schema = StructType([       
    StructField('features', ArrayType(FloatType()), True),
    StructField('y', FloatType(), True)
])


dataDF=data.toDF(schema=schema)
dataDF.show(truncate=False)

+--------------------------------------+----------+
|features                              |y         |
+--------------------------------------+----------+
|[1.2034985, 8.455216, -2.79036]       |-39.569023|
|[0.9264144, 8.393093, -1.9257588]     |-38.86851 |
|[1.3961579, 10.412863, -4.0280867]    |-48.751232|
|[-1.0093409, -8.578743, -0.31200716]  |39.386715 |
|[0.91097784, 9.545054, -0.86367697]   |-43.8306  |
|[0.14880404, 1.4307998, -0.19924782]  |-6.5944123|
|[0.030753141, 0.07893534, -1.1931145] |-0.5942618|
|[-1.4463581, -10.559787, 2.5892231]   |49.204918 |
|[0.034043737, -0.39865318, 0.08931492]|1.7698828 |
|[-1.0986181, -9.4023695, 2.0095234]   |43.575542 |
|[-0.9096731, -6.70736, 2.3814182]     |31.375153 |
|[0.7321585, 4.9095526, -1.5275366]    |-22.991976|
|[0.109175295, 0.95224166, 0.84459144] |-4.2262673|
|[-0.1518071, -2.6367342, 1.4115095]   |12.217611 |
|[0.28778645, 0.6972642, -0.7015823]   |-3.5296173|
|[0.9538665, 6.452496, -2.0569303]     |-30.21829 |
|[0.7995248,

# PySpark Linear Regression

In [12]:
start_time = time.time()
lr = LinearRegression(featuresCol="features", labelCol="y", predictionCol="pred_y")
lr_model = lr.fit(dataDF.withColumn("features",array_to_vector('features')))
print("PySpark OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t",betas.round(6))
print("predicted values:\t",lr_model.coefficients.round(6))

PySpark OLS: 8.726112604141235 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
predicted values:	 [ 6.677466 -6.681536  6.642338]


# QR

In [13]:
start_time = time.time()
QR=dataMatrix.tallSkinnyQR(True)
print(QR.Q, QR.Q.numRows(),"x",QR.Q.numCols())
print(QR.R)

<pyspark.mllib.linalg.distributed.RowMatrix object at 0x0000018F3194ECD0> 1000 x 3
DenseMatrix([[-32.91823191, 170.68436231, 296.17565899],
             [  0.        ,  30.89025415,  -0.96603129],
             [  0.        ,   0.        , -31.06256769]])


In [14]:
R=np.asmatrix(QR.R.toArray())
R_inv=np.linalg.inv(R)
R_inv

matrix([[-0.0303783 ,  0.16785557, -0.29487155],
        [ 0.        ,  0.03237267, -0.00100677],
        [-0.        , -0.        , -0.03219309]])

In [15]:
cm = CoordinateMatrix(
    QR.Q.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]
    )
)
Q_T=cm.transpose().toRowMatrix()

In [16]:
y=DenseMatrix(n,1,dataDF.select("y").toPandas().to_numpy().ravel())

In [17]:
# betas = R_inv Q_T y

## Results

In [18]:
step1=Q_T.multiply(y).rows.collect()
step1=np.array(step1)
step2= np.matmul(R_inv,step1)

print("QR OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t",betas.round(6))
print("QR predicted values:\t",step2.ravel().round(6)[0])

QR OLS: 28.485340356826782 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
QR predicted values:	 [ 6.677466 -6.681537  6.642338]


# SVD

In [19]:
start_time = time.time()
svd=dataMatrix.computeSVD(k,computeU=True)
print(svd.s)
print(svd.V)
print(svd.U, svd.U.numRows(),"x",svd.U.numCols())

[344.76853943586775,31.351581604986123,2.9221886574357034]
DenseMatrix([[-0.09510553, -0.00195771, -0.99546527],
             [ 0.49690419,  0.86641087, -0.04917753],
             [ 0.86257821, -0.49932792, -0.08142766]])
<pyspark.mllib.linalg.distributed.RowMatrix object at 0x0000018F33AD3D90> 1000 x 3


In [20]:
cm = CoordinateMatrix(
    svd.U.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]
    )
)
U_T=cm.transpose().toRowMatrix()

In [21]:
#betas = V * ((U^T * y) / s element wise)

## Results

In [22]:
step1=U_T.multiply(y).rows.collect()
step2=(np.array(step1).ravel()/svd.s)
v=np.matrix(svd.V.toArray())

SVD_coeeffs=(v @ step2).ravel()
print("SVD OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t",betas.round(6))
print("SVD predicted values:\t",SVD_coeeffs.round(6)[0])

SVD OLS: 24.881237030029297 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
SVD predicted values:	 [ 6.677466 -6.681537  6.642338]


# LU
als Data Paralleism Ansatz da keine LU Funktion in PySpark enthalten

In [13]:
start_time = time.time()

In [14]:
def luSpark(part):
    # Die Größe der Partition ermitteln
    partition_n = len(part)
    
    # Array Features in nparray schreiben damit die struktur mit .T transponiert werden kann
    lufeatures = np.array([b for b in part["features"].to_numpy()])
    ypanda = part["y"].to_numpy()

    # Durchführen der LU-Composition
    LUtemp, pivtemp = lu_factor(lufeatures.T @ lufeatures)
    lubetas = lu_solve((LUtemp, pivtemp), lufeatures.T @ ypanda)
    
    # Eine DataFrame mit den geschätzten Betas und der Anzahl der Beobachtungen erstellen
    return pd.DataFrame({"betas": [lubetas], "sampleCounts": [partition_n]})

In [15]:
schemaUDF=StructType([
    StructField("betas",ArrayType(FloatType())),
    StructField("sampleCounts",IntegerType())
])

für jede Partition die Koeffizienten berechnen und mitteln

In [16]:

LU_res = dataDF.groupBy(F.spark_partition_id()).applyInPandas(luSpark,schema=schemaUDF)

LU_res.show(truncate=False)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Users\Casimir\AppData\Local\Temp\ipykernel_13532\3263207875.py", line 11, in luSpark
  File "c:\Users\Casimir\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\linalg\_decomp_lu.py", line 140, in lu_solve
    raise ValueError("Shapes of lu {} and b {} are incompatible"
ValueError: Shapes of lu (3, 3) and b (1, 3) are incompatible


weighted average berechnen

In [None]:
LU_betas = pd.DataFrame(LU_res.rdd.map(lambda x : [(x["sampleCounts"]/n) * xi for xi in x["betas"]]).collect()).sum().to_numpy()

In [None]:
print("LU OLS: %s seconds" % (time.time() - start_time))
print("real values:\t\t",betas.round(6))
print("LU predicted values :\t",LU_betas.round(6))

LU OLS: 32.74242973327637 seconds
real values:		 [ 6.677466 -6.681536  6.642338]
LU predicted values :	 [ 6.677938 -6.681465  6.642348]


In [None]:
spark.stop()