# KNN

In [None]:
import numpy as np

from spark_rapids_ml.knn import NearestNeighbors
from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import col, lit
from pyspark.sql.types import IntegerType, StructType, StructField

### Create synthetic dataset

In [None]:
# Item vectors to search: eight 2-D points on the diagonal, [1.0, 1.0] through
# [8.0, 8.0], each paired with a one-letter string tag "a" through "h".
item = [
    ([float(n), float(n)], tag)
    for n, tag in enumerate("abcdefgh", start=1)
]

# Query vectors: five 2-D points on the same diagonal, each paired with a
# two-letter string tag.
query = [
    ([coord, coord], tag)
    for coord, tag in zip(
        (0.0, 1.0, 4.1, 8.0, 9.0),
        ("aa", "bb", "cc", "dd", "ee"),
    )
]

### Convert dataset to Spark DataFrame

In [None]:
# Both DataFrames share one schema string: a float-array column holding the
# vector and a string column holding its tag.
# NOTE(review): `spark` is not created in the visible cells; this assumes an
# active SparkSession is already bound to that name (e.g. by the notebook
# environment) - confirm.
schema = "features array<float>, metadata string"
item_df = spark.createDataFrame(item, schema)
query_df = spark.createDataFrame(query, schema)

## Spark RAPIDS ML (GPU)

In [None]:
# Configure the estimator to return 2 neighbors per query (k=2) and to read
# the vectors from the "features" column.
knn = NearestNeighbors(k=2)
knn.setInputCol("features")

In [None]:
# Note: `fit` just stores a reference to the item_df in the model;
# the returned model is what the queries below are run against.
knn_model = knn.fit(item_df)

Note: saving either the estimator or model is not supported, since their only state is the referenced dataset.
Just re-create and re-fit the estimator on the dataset when needed.

#### kneighbors

This API takes a DataFrame of query vectors and returns, for each query vector, its `k` nearest item vectors, identified by their unique ids and distances. Unique ids are generated automatically if not provided, so the input datasets are also returned with their unique ids attached.

In [None]:
# Returns three DataFrames: the items and the queries (each with unique ids
# attached) and the nearest neighbors found for each query.
item_id_df, query_id_df, neighbor_df = knn_model.kneighbors(query_df)

In [None]:
# Display the original item_df as returned by `kneighbors`, now carrying
# unique identifiers.
item_id_df.show()

In [None]:
# Display the original query_df as returned by `kneighbors`, now carrying
# unique identifiers.
query_id_df.show()

In [None]:
# neighbor_df lists the nearest item vectors for each query vector,
# identified by their unique ids together with their distances.
neighbor_df.show()

In [None]:
# Change the value of 'k' on the fitted model and query again. Only the
# neighbor DataFrame is needed this time, so the first two results are
# discarded.
knn_model.setK(3)
_, _, neighbor_df = knn_model.kneighbors(query_df)

In [None]:
# Display the neighbors found after setting k=3.
neighbor_df.show()

#### exactNearestNeighborsJoin

This API returns a join of the query vectors and their `k` closest item vectors.

In [None]:
# Join each query vector with its `k` closest item vectors.
result_df = knn_model.exactNearestNeighborsJoin(query_df)

In [None]:
# Display the join result ordered by its "query_df" and then "item_df" columns.
result_df.orderBy("query_df", "item_df").show()

## PySpark

Note: PySpark does not have an exact kNN implementation, but it does have an LSH-based approximate nearest neighbors implementation, shown here.

In [None]:
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.functions import array_to_vector
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

# Query point used by the approximate nearest-neighbor search further below.
key = Vectors.dense([1.0, 1.0])

# Convert the array-typed "features" column into an ML vector column. Only
# that column is selected, so the "metadata" column is not carried over.
item_vector_df = item_df.select(array_to_vector(col("features")).alias("features"))
query_vector_df = query_df.select(array_to_vector(col("features")).alias("features"))

In [None]:
# Display the item vectors after conversion from arrays to ML vectors.
item_vector_df.show()

In [None]:
# Display the query vectors after conversion from arrays to ML vectors.
query_vector_df.show()

In [None]:
# LSH estimator: reads vectors from "features" and writes their hashes to
# "hashes", using 3 hash tables and a bucket length of 2.0.
brp = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    bucketLength=2.0,
    numHashTables=3,
)

# Fit on the item vectors; the resulting model is used for the approximate
# join and nearest-neighbor search below.
model = brp.fit(item_vector_df)

In [None]:
# Feature transformation: `transform` returns the item vectors with their
# hashed values in the 'hashes' column (the model's outputCol).
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(item_vector_df).show()

In [None]:
# Approximate similarity join: the locality sensitive hashes of the input rows
# are computed first, then item/query pairs within the distance threshold (3.0)
# are joined. Hashing could be skipped by passing already-transformed datasets,
# e.g. `model.approxSimilarityJoin(transformedA, transformedB, 3.0)`.
print("Approximately joining items and queries on Euclidean distance smaller than 3.0:")
(
    model.approxSimilarityJoin(
        item_vector_df,
        query_vector_df,
        3.0,
        distCol="EuclideanDistance",
    )
    # datasetA is the first argument (items), datasetB the second (queries).
    .select(
        col("datasetA.features").alias("item"),
        col("datasetB.features").alias("query"),
        col("EuclideanDistance"),
    )
    .orderBy("query", "item")
    .show()
)

In [None]:
# Compute the locality sensitive hashes for the input rows, then perform an
# approximate nearest neighbor search for `key` among the item vectors.
# The last argument (2) is the number of nearest neighbors requested.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxNearestNeighbors(transformedA, key, 2)`
print("Approximately searching item vectors for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(item_vector_df, key, 2).show()

In [None]:
# Persist the fitted LSH model to /tmp/ann_model, overwriting any model
# already saved at that path.
model.write().overwrite().save("/tmp/ann_model")