In [1]:
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, FloatType

In [2]:
def norm_2_func(vectors):
    return float(np.linalg.norm(vectors, 2))

norm_2_udf = F.udf(lambda x: norm_2_func(x), FloatType())

In [23]:
def get_unit_vectors(vectors):
    return list(map(float, vectors/np.linalg.norm(vectors, 2)))

get_unit_vectors = F.udf(get_unit_vectors, ArrayType(FloatType()))

In [4]:
df = spark.read.text('data.txt')

In [5]:
df.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                 |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[-7456.4745, 62.698, -5430.6106, -5047.9419, -3115.541, -2617.9648, -4505.6079, 2840.1761, -432.0275, 7651.0525, 8569.9486, -6567.4452, 8257.3935, -6857.2304, -7036.5599, -5613.7268, 7545.5825, -8260.8731, -6922.2269, -3106.9112] |
|[9337.3161, 6880.0131, 7213.6268, -3985.7916, -4381.2732, -3682.903

In [6]:
from pyspark.sql.functions import split, expr, flatten, col
from pyspark.ml.linalg import VectorUDT
from pyspark.ml.linalg import Vectors

In [7]:
array_to_vector = F.udf(lambda a: Vectors.dense(a), VectorUDT())

In [8]:
df = df.withColumn("value", split(expr("rtrim(']', ltrim('[', value))"), ",")) \
       .withColumn("value", expr("""transform(value, x -> split(rtrim(']', ltrim('[', x)), ","))"""))\
       .withColumn("value", flatten(col("value")).cast("array<float>"))\
       .withColumn("value", array_to_vector(col("value")))

In [9]:
df.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                                                                                                                                              |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
df2 = df.withColumn('euclidean_norms', norm_2_udf(F.col('value')))

df2.head(1)

[Row(value=DenseVector([-7456.4746, 62.698, -5430.6108, -5047.9419, -3115.541, -2617.9648, -4505.6079, 2840.176, -432.0275, 7651.0527, 8569.9482, -6567.4453, 8257.3936, -6857.2305, -7036.5601, -5613.7266, 7545.5825, -8260.873, -6922.2271, -3106.9111]), euclidean_norms=26641.798828125)]

In [11]:
percentage = 10000/df2.count()
print(percentage)

0.01


In [12]:
from pyspark.sql.functions import percentile_approx

In [13]:
df2.summary().show()

+-------+-----------------+
|summary|  euclidean_norms|
+-------+-----------------+
|  count|          1000000|
|   mean|25685.35096683496|
| stddev|2609.796159577555|
|    min|        11651.874|
|    25%|        23963.074|
|    50%|        25753.686|
|    75%|         27481.87|
|    max|         37056.57|
+-------+-----------------+



In [14]:
df2.select(percentile_approx('euclidean_norms', [0.01, 0.5, 0.95], 1000000).alias('quantiles')).show(truncate = False)

+---------------------------------+
|quantiles                        |
+---------------------------------+
|[19312.668, 25754.148, 29858.316]|
+---------------------------------+



In [15]:
t1 = df2.select(percentile_approx('euclidean_norms', 0.01, 1000000).alias('threshold_quantile')).head()[0]

In [16]:
t1

19312.66796875

In [25]:
df3 = df2.withColumn('unit_vectors', get_unit_vectors(F.col('value')))

df3.head(1)

[Row(value=DenseVector([-7456.4746, 62.698, -5430.6108, -5047.9419, -3115.541, -2617.9648, -4505.6079, 2840.176, -432.0275, 7651.0527, 8569.9482, -6567.4453, 8257.3936, -6857.2305, -7036.5601, -5613.7266, 7545.5825, -8260.873, -6922.2271, -3106.9111]), euclidean_norms=26641.798828125, unit_vectors=[-0.27987879514694214, 0.0023533697240054607, -0.20383799076080322, -0.18947450816631317, -0.11694183945655823, -0.09826532006263733, -0.1691180020570755, 0.10660601407289505, -0.016216153278946877, 0.2871822714805603, 0.32167303562164307, -0.24650907516479492, 0.30994129180908203, -0.25738614797592163, -0.2641173005104065, -0.21071124076843262, 0.2832234501838684, -0.3100718855857849, -0.25982579588890076, -0.11661791801452637])]