In [None]:
from pyspark import SparkContext
from PIL import Image
import numpy as np

# initialize spark context
sc = SparkContext("local", "simhash_lsh")

In [58]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType, FloatType, ArrayType
from pyspark.ml.feature import HashingTF

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 创建示例数据
# data = [
#     ("document1", "The quick brown fox"),
#     ("document2", "The lazy black dog"),
#     ("document3", "The quick brown cat"),
#     ("document4", "The lazy black cat"),
# ]
data = [
    ("document1", "you could create a dataset of job descriptions and calculate the candidate overlap"),
    ("document2", "you could do a dataset of job descriptions or calculate the candidate overlap"),
    ("document3", "the document similarity is subjective and in the eyes of the client"),
    ("document4", "The lazy black cat"),
]
df = spark.createDataFrame(data, ["document_id", "document"])

# 定义 SimHash 函数
def simhash(document):
    # 将文档拆分为单词
    words = document.split(" ")

    # 计算每个单词的 SimHash 值
    hashes = []
    for word in words:
        word_hash = hash(word)
        # 使用 64 位 SimHash 值，将每个单词的哈希值转换为二进制表示，并填充到 64 位
        binary_hash = format(word_hash, "064b")
        # 将二进制表示的哈希值转换为 DenseVector，每个元素值为 -1 或 1
        vector = [1 if b == "1" else -1 for b in binary_hash]
        hashes.append(vector)
    # 将所有单词的 SimHash 值合并到一个数组中
    simhash_value = [0] * 64
    for v in hashes:
        for i in range(64):
            simhash_value[i] += v[i]
    simhash_value = [1 if x > 0 else 0 for x in simhash_value]
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, ArrayType(IntegerType()))

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["document"]))

# # 创建 Tokenizer 特征提取器
# tokenizer = Tokenizer(inputCol="document", outputCol="words")

# # 对文档进行分词
# df = tokenizer.transform(df)

# # 将 SimHash 值映射到桶(bucket)
# num_buckets = 4 # 桶(bucket)的数量
# hashingTF = HashingTF(inputCol="simhash", outputCol="hashed_features", numFeatures=num_buckets)
# df = hashingTF.transform(df)
# df.show()

# 定义 Jaccard 相似度计算函数
def jaccard_similarity(simhash1, simhash2):
    length = len(simhash1)
    cnt = sum([1 for i in range(length) if simhash1[i] == simhash2[i]])
    return cnt / length

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

hamming_distance_udf = udf(hamming_distance, IntegerType())

# 将 Jaccard 相似度计算函数注册为 UDF
jaccard_similarity_udf = udf(jaccard_similarity, FloatType())

# 计算文档之间的 Jaccard 相似度
similar_documents = df.alias("a").join(df.alias("b"), col("a.document_id") < col("b.document_id")) \
    .select(col("a.document_id").alias("document_id1"),
            col("b.document_id").alias("document_id2"),
            jaccard_similarity_udf(col("a.simhash"), col("b.simhash")).alias("similarity"),
            hamming_distance_udf(col("a.simhash"), col("b.simhash")).alias("distance"))

# 设置阈值过滤出相似的文档
threshold = 20
# similar_documents = similar_documents.filter(col("distance") < threshold)

# 打印相似文档
similar_documents.show(truncate=False)

+------------+------------+----------+--------+
|document_id1|document_id2|similarity|distance|
+------------+------------+----------+--------+
|document1   |document2   |0.8125    |12      |
|document1   |document3   |0.640625  |23      |
|document1   |document4   |0.546875  |29      |
|document2   |document3   |0.609375  |25      |
|document2   |document4   |0.453125  |35      |
|document3   |document4   |0.5625    |28      |
+------------+------------+----------+--------+



In [121]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf, col, split, concat_ws, substring, expr, when
from pyspark.sql.types import IntegerType, FloatType, ArrayType
from pyspark.ml.feature import HashingTF

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 创建示例数据
# data = [
#     ("document1", "The quick brown fox"),
#     ("document2", "The lazy black dog"),
#     ("document3", "The quick brown cat"),
#     ("document4", "The lazy black cat"),
# ]
data = [
    ("document1", "you could create a dataset of job descriptions and calculate the candidate overlap"),
    ("document2", "you could do a dataset of job descriptions or calculate the candidate overlap"),
    ("document3", "the document similarity is subjective and in the eyes of the client"),
    ("document4", "The lazy black cat"),
]
df = spark.createDataFrame(data, ["document_id", "document"])
print(df.rdd.getNumPartitions())

# 定义 SimHash 函数
def simhash(document):
    # 将文档拆分为单词
    words = document.split(" ")

    # 计算每个单词的 SimHash 值
    hashes = []
    for word in words:
        word_hash = hash(word)
        # 使用 64 位 SimHash 值，将每个单词的哈希值转换为二进制表示，并填充到 64 位
        binary_hash = format(word_hash, "064b")
        # 将二进制表示的哈希值转换为 DenseVector，每个元素值为 -1 或 1
        vector = [1 if b == "1" else -1 for b in binary_hash]
        hashes.append(vector)
    # 将所有单词的 SimHash 值合并到一个数组中
    simhash_value = [0] * 64
    for v in hashes:
        for i in range(64):
            simhash_value[i] += v[i]
    simhash_value = [1 if x > 0 else 0 for x in simhash_value]
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, ArrayType(IntegerType()))

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["document"]))

# 定义 Jaccard 相似度计算函数 (hamming distance / length)
def jaccard_similarity(simhash1, simhash2):
    length = len(simhash1)
    cnt = sum([1 for i in range(length) if simhash1[i] == simhash2[i]])
    return cnt / length

# 将 Jaccard 相似度计算函数注册为 UDF
jaccard_similarity_udf = udf(jaccard_similarity, FloatType())

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

# 将 Hamming Distance 计算函数注册为 UDF
hamming_distance_udf = udf(hamming_distance, IntegerType())

# 计算文档之间的 Jaccard 相似度
similar_documents = df.alias("a").join(df.alias("b"), col("a.document_id") < col("b.document_id")) \
    .select(col("a.document_id").alias("document_id1"),
            col("b.document_id").alias("document_id2"),
            jaccard_similarity_udf(col("a.simhash"), col("b.simhash")).alias("similarity"),
            hamming_distance_udf(col("a.simhash"), col("b.simhash")).alias("distance"))

# 设置阈值过滤出相似的文档
threshold = 0
# similar_documents = similar_documents.filter(col("similarity") > threshold)

# 打印相似文档
similar_documents.show(truncate=False)

# 将 SimHash 值映射到桶(bucket)
num_buckets = 16 # 桶(bucket)的数量

# 使用 split 函数将数组列拆分为多个列，并计算每列应包含的元素数量
array_len = len(df.first()['simhash'])
elements_per_bucket = int(array_len / num_buckets)

# 循环创建新的数组列，并使用 split 函数拆分源数组列中的元素
for i in range(num_buckets):
    # 计算拆分的起始和结束位置
    start = i * elements_per_bucket
    end = start + elements_per_bucket
    # 使用 split 函数拆分数组列，并为新列命名
    new_col_name = f"bucket_{i}"
    df = df.withColumn(new_col_name, expr(f"slice(simhash, {start + 1}, {end - start})"))
df.show()

# 定义要比较的列名列表
columns_to_compare = [f"bucket_{i}" for i in range(num_buckets)]

# 计算每两行之间有多少列相同
matching_columns_expr = [expr(f"CASE WHEN df1.{col_name} = df2.{col_name} THEN 1 ELSE 0 END as matching_buckets_{col_name}") for col_name in columns_to_compare]

# 使用 select 函数选择需要的列，并显示结果
df_result = df.alias("df1").join(df.alias("df2"), col("df1.document_id") < col("df2.document_id"))\
    .select("df1.document_id", "df2.document_id", *matching_columns_expr)
df_result.show()

# 动态生成列名的累加求和操作
matching_columns_sum_expr = "+".join([f"matching_buckets_{col_name}" for col_name in columns_to_compare])
# 使用 join 函数连接两个 DataFrame，并使用 groupBy 和 sum 函数进行累加求和操作
df_result = df.alias("df1").join(df.alias("df2"), col("df1.document_id") < col("df2.document_id"))\
    .select('*', *matching_columns_expr)\
    .groupBy("df1.document_id", "df2.document_id")\
    .agg(expr(f"sum({'+'.join([f'matching_buckets_{col_name}' for col_name in columns_to_compare])}) as matching_buckets_sum"))

# 显示结果
df_result.show()

10
+------------+------------+----------+--------+
|document_id1|document_id2|similarity|distance|
+------------+------------+----------+--------+
|document1   |document2   |0.8125    |12      |
|document1   |document3   |0.640625  |23      |
|document1   |document4   |0.546875  |29      |
|document2   |document3   |0.609375  |25      |
|document2   |document4   |0.453125  |35      |
|document3   |document4   |0.5625    |28      |
+------------+------------+----------+--------+

+-----------+--------------------+--------------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+
|document_id|            document|             simhash|    bucket_0|    bucket_1|    bucket_2|    bucket_3|    bucket_4|    bucket_5|    bucket_6|    bucket_7|    bucket_8|    bucket_9|   bucket_10|   bucket_11|   bucket_12|   bucket_13|   bucket_14|  



In [131]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf, col, split, concat_ws, substring, expr, when
from pyspark.sql.types import IntegerType, FloatType, ArrayType
from pyspark.ml.feature import HashingTF

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 创建示例数据
# data = [
#     ("document1", "The quick brown fox"),
#     ("document2", "The lazy black dog"),
#     ("document3", "The quick brown cat"),
#     ("document4", "The lazy black cat"),
# ]
data = [
    ("document1", "you could create a dataset of job descriptions and calculate the candidate overlap"),
    ("document2", "you could do a dataset of job descriptions or calculate the candidate overlap"),
    ("document3", "the document similarity is subjective and in the eyes of the client"),
    ("document4", "The lazy black cat"),
]
df = spark.createDataFrame(data, ["document_id", "document"])
print(df.rdd.getNumPartitions())

# 定义 SimHash 函数
def simhash(document):
    # 将文档拆分为单词
    words = document.split(" ")

    # 计算每个单词的 SimHash 值
    hashes = []
    for word in words:
        word_hash = hash(word)
        # 使用 64 位 SimHash 值，将每个单词的哈希值转换为二进制表示，并填充到 64 位
        binary_hash = format(word_hash, "064b")
        # 将二进制表示的哈希值转换为 DenseVector，每个元素值为 -1 或 1
        vector = [1 if b == "1" else -1 for b in binary_hash]
        hashes.append(vector)
    # 将所有单词的 SimHash 值合并到一个数组中
    simhash_value = [0] * 64
    for v in hashes:
        for i in range(64):
            simhash_value[i] += v[i]
    simhash_value = [1 if x > 0 else 0 for x in simhash_value]
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, ArrayType(IntegerType()))

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["document"]))

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

# 将 Hamming Distance 计算函数注册为 UDF
hamming_distance_udf = udf(hamming_distance, IntegerType())

# 计算文档之间的 Jaccard 相似度
similar_documents = df.alias("a").join(df.alias("b"), col("a.document_id") < col("b.document_id")) \
    .select(col("a.document_id").alias("document_id1"),
            col("b.document_id").alias("document_id2"),
            hamming_distance_udf(col("a.simhash"), col("b.simhash")).alias("distance"))

# 设置阈值过滤出相似的文档
threshold = 64
# similar_documents = similar_documents.filter(col("distance") <= threshold)

# 打印相似文档
similar_documents.show(truncate=False)

# 将 SimHash 值映射到桶(bucket)
num_buckets = 16 # 桶(bucket)的数量

# 使用 split 函数将数组列拆分为多个列，并计算每列应包含的元素数量
array_len = len(df.first()['simhash'])
elements_per_bucket = int(array_len / num_buckets)

# 循环创建新的数组列，并使用 split 函数拆分源数组列中的元素
for i in range(num_buckets):
    # 计算拆分的起始和结束位置
    start = i * elements_per_bucket
    end = start + elements_per_bucket
    # 使用 split 函数拆分数组列，并为新列命名
    new_col_name = f"bucket_{i}"
    df = df.withColumn(new_col_name, expr(f"slice(simhash, {start + 1}, {end - start})"))
# df.show()

# 要比较的buckets list
columns_to_compare = [f"bucket_{i}" for i in range(num_buckets)]

# 计算每两个documents之间有多少buckets相同
matching_columns_expr = [expr(f"CASE WHEN df1.{col_name} = df2.{col_name} THEN 1 ELSE 0 END as matching_buckets_{col_name}") for col_name in columns_to_compare]

# 使用 select 函数选择需要的列，并显示结果
# df_result = df.alias("df1").join(df.alias("df2"), col("df1.document_id") < col("df2.document_id"))\
#     .select("df1.document_id", "df2.document_id", *matching_columns_expr)
# df_result.show()

# 累加求和操作
matching_columns_sum_expr = "+".join([f"matching_buckets_{col_name}" for col_name in columns_to_compare])
# 使用 join 函数连接两个 DataFrame，并使用 groupBy 和 sum 函数进行累加求和操作
df = df.alias("df1").join(df.alias("df2"), col("df1.document_id") < col("df2.document_id"))\
    .select('*', *matching_columns_expr)\
    .groupBy("df1.document_id", "df2.document_id", hamming_distance_udf(col("df1.simhash"), col("df2.simhash")).alias("distance"))\
    .agg(expr(f"sum({'+'.join([f'matching_buckets_{col_name}' for col_name in columns_to_compare])}) as matching_buckets_sum"))

# 显示结果
df.show()

10
+------------+------------+--------+
|document_id1|document_id2|distance|
+------------+------------+--------+
|document1   |document2   |12      |
|document1   |document3   |23      |
|document1   |document4   |29      |
|document2   |document3   |25      |
|document2   |document4   |35      |
|document3   |document4   |28      |
+------------+------------+--------+

+-----------+--------------------+--------------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+
|document_id|            document|             simhash|    bucket_0|    bucket_1|    bucket_2|    bucket_3|    bucket_4|    bucket_5|    bucket_6|    bucket_7|    bucket_8|    bucket_9|   bucket_10|   bucket_11|   bucket_12|   bucket_13|   bucket_14|   bucket_15|
+-----------+--------------------+--------------------+------------+------------+------------+----



+-----------+-----------+--------+--------------------+
|document_id|document_id|distance|matching_buckets_sum|
+-----------+-----------+--------+--------------------+
|  document1|  document2|      12|                   6|
|  document1|  document3|      23|                   2|
|  document1|  document4|      29|                   0|
|  document2|  document3|      25|                   5|
|  document2|  document4|      35|                   1|
|  document3|  document4|      28|                   4|
+-----------+-----------+--------+--------------------+





In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, FloatType
from PIL import Image
import numpy as np

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 加载示例图像
image1 = Image.open("data/test1.png")
image2 = Image.open("data/test2.png")

# 将图像缩放为 32x32 并转换为灰度图像
image1 = image1.resize((32, 32)).convert("L")
image2 = image2.resize((32, 32)).convert("L")

# 将图像数据转换为 NumPy 数组
image1_array = np.array(image1)
image2_array = np.array(image2)

# 将图像数据展平为一维数组
image1_vector = image1_array.flatten().tolist()
image2_vector = image2_array.flatten().tolist()

print(len(image1_vector))

# 将图像数据转换为 DataFrame
df = spark.createDataFrame([(image1_vector,), (image2_vector,)], ["image"])

# 注册 DataFrame 为临时表
df.createOrReplaceTempView("image_data")

# 将图像数据转换为 SimHash 值
def simhash(image_vector):
    # 将像素值转换为 SimHash 值
    hashes = []
    for pixel in image_vector:
        pixel_hash = hash(pixel)
        # 使用 64 位 SimHash 值，将像素值的哈希值转换为二进制表示，并填充到 64 位
        binary_hash = format(pixel_hash, "064b")
        # 将二进制表示的哈希值转换为 DenseVector，每个元素值为 -1 或 1
        vector = [1 if b == "1" else -1 for b in binary_hash]
        hashes.append(vector)
    # 将所有单词的 SimHash 值合并到一个数组中
    simhash_value = [0] * 64
    for v in hashes:
        for i in range(64):
            simhash_value[i] += v[i]
    simhash_value = [1 if x > 0 else 0 for x in simhash_value]
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, ArrayType(IntegerType()))

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["image"]))

# 定义 Jaccard 相似度计算函数
def jaccard_similarity(simhash1, simhash2):
    length = len(simhash1)
    cnt = sum([1 for i in range(length) if simhash1[i] == simhash2[i]])
    return cnt / length

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

hamming_distance_udf = udf(hamming_distance, IntegerType())

# 计算 SimHash 值之间的 Jaccard 相似度
similar_documents = df.alias("a").join(df.alias("b"), col("a.image") < col("b.image")) \
    .select(col("a.image").alias("image1"),
            col("b.image").alias("image2"),
            jaccard_similarity_udf(col("a.simhash"), col("b.simhash")).alias("similarity"),
            hamming_distance_udf(col("a.simhash"), col("b.simhash")).alias("distance"))

df = df.alias("df1").crossJoin(df.alias("df2"))
df = df.filter(col("df1.image") < col("df2.image"))  # 排除与自身比较的情况
df = df.withColumn("distance", hamming_distance_udf(col("df1.simhash"), col("df2.simhash")))

similar_documents.show()

1024
+--------------------+--------------------+----------+--------+
|              image1|              image2|similarity|distance|
+--------------------+--------------------+----------+--------+
|[15, 19, 24, 18, ...|[107, 103, 103, 1...|  0.953125|       3|
+--------------------+--------------------+----------+--------+



In [5]:
# define hyperparameters for LSH
num_bands = 10
band_size = 8
threshold = 0.8

# define hash function for simhash
def hash_function(x, r):
    return (r * x + 1) % (2 ** 32)

# define simhash function
def simhash_feature_extraction(img_path, k=32):
    img = Image.open(img_path)
    img = img.resize((32, 32))  # resize to 32 x 32 pixels
    pixels = np.array(img).flatten()  # flatten to 1D array
    avg_pixel = pixels.mean()
    diff = pixels > avg_pixel
    simhash = np.zeros(k)
    for i in range(len(diff)):
        if diff[i]:
            simhash += hash_function(i, np.arange(k))
        else:
            simhash -= hash_function(i, np.arange(k))
    simhash[simhash >= 0] = 1
    simhash[simhash < 0] = -1
    return simhash

# define function for generating candidates
def generate_candidate_pairs(band_id, bands):
    candidates = set()
    for (img1, simhash1) in bands[band_id]:
        for (img2, simhash2) in bands[band_id]:
            print(simhash1.dot(simhash2))
            if img1 != img2 and simhash1.dot(simhash2) >= threshold:
                candidates.add((img1, img2))
    return candidates

def map_to_bands(x, band_size):
    result = []
    for i in range(0, len(x[1]), band_size):
        band = tuple(x[1][i:i+band_size])
        result.append(((i, band), x[0]))
    return result

In [6]:
# load CIFAR-10 dataset and extract simhash features
data_rdd = sc.parallelize(['data/test1.png', 'data/test2.png', 'data/test3.png'])
features_rdd = data_rdd.map(lambda x: (x, simhash_feature_extraction(x)))
bands_rdd = features_rdd.flatMap(lambda x: map_to_bands(x, band_size))
# bands_rdd = features_rdd.flatMap(lambda x: [((i, tuple(x[1][i:i+band_size])), x[0]) for i in range(0, len(x[1]), band_size)])
grouped_rdd = bands_rdd.groupByKey()
# generate LSH candidates
candidates_rdd = grouped_rdd.flatMap(lambda x: generate_candidate_pairs(x[0][0], list(x[1])))
print(features_rdd.collect())
print(bands_rdd.collect())
print(grouped_rdd.collect())
# print(candidates_rdd.collect())
# print candidate pairs
# candidates = candidates_rdd.collect()
# for candidate in candidates:
#     print(candidate)

# stop spark context
sc.stop()

                                                                                

[('data/test1.png', array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])), ('data/test2.png', array([ 1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1.])), ('data/test3.png', array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))]
[((0, (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), 'data/test1.png'), ((8, (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), 'data/test1.png'), ((16, (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), 'data/test1.png'), ((24, (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)), 'data/test1.png'), ((0, (1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0)), 'data/test2.png'), ((8, (-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0)), 'data/test2.png'), ((16, (-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0,

In [74]:
sc.stop()

In [2]:
print(sc.version)

3.3.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import HashingTF

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 创建示例数据
data = [
    ("document1", "The quick brown fox"),
    ("document2", "The lazy black dog"),
    ("document3", "The quick brown cat"),
    ("document4", "The lazy black cat"),
]
df = spark.createDataFrame(data, ["document_id", "document"])

# 定义 SimHash 函数
def simhash(document):
    # 将文档拆分为单词
    words = document.split(" ")

    # 计算每个单词的 SimHash 值
    hashes = []
    for word in words:
        word_hash = hash(word)
        # 使用 64 位 SimHash 值，将每个单词的哈希值转换为二进制表示，并填充到 64 位
        binary_hash = format(word_hash, "064b")
        # 将二进制表示的哈希值转换为 DenseVector，每个元素值为 -1 或 1
        vector = [1 if b == "1" else -1 for b in binary_hash]
        hashes.extend(vector)
    # 将所有单词的 SimHash 值合并到一个数组中
    simhash_value = DenseVector(hashes)
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, ArrayType(FloatType()))

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["document"]))

# 创建 Tokenizer 特征提取器
tokenizer = Tokenizer(inputCol="document", outputCol="words")

# 对文档进行分词
df = tokenizer.transform(df)

# 将 SimHash 值映射到桶(bucket)
num_buckets = 100 # 桶(bucket)的数量
hashingTF = HashingTF(inputCol="simhash", outputCol="hashed_features", numFeatures=num_buckets)
df = hashingTF.transform(df)

# 定义 Jaccard 相似度计算函数
def jaccard_similarity(set1, set2):
    set1 = set(set1)
    set2 = set(set2)
    intersection_size = len(set1.intersection(set2))
    union_size = len(set1) + len(set2) - intersection_size
    return intersection_size / union_size

# 将 Jaccard 相似度计算函数注册为 UDF
jaccard_similarity_udf = udf(jaccard_similarity, FloatType())

# 计算 Jaccard 相似度
df = df.alias("a").crossJoin(df.alias("b"))
df = df.filter("a.document_id < b.document_id")


In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml.linalg import DenseVector
from itertools import combinations

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSHExample").getOrCreate()

# 示例数据集
data = [("doc1", "The quick brown fox jumps over the lazy dog"),
        ("doc2", "The quick brown fox jumps over the quick dog"),
        ("doc3", "The slow brown fox jumps over the lazy dog"),
        ("doc4", "A brown fox jumps over a lazy dog")]

# 将数据集转换为 DataFrame
df = spark.createDataFrame(data, ["document_id", "document"])

# 定义 SimHash 函数
def simhash(document):
    # 将文档转换为哈希特征向量
    words = document.split(" ")
    words = [word for word in words if len(word) > 1]  # 过滤掉长度为 1 的单词
    words = words[:10]  # 只取前 10 个单词进行 SimHash 计算
    feature_vector = [0] * 64  # 初始化特征向量为全 0
    for word in words:
        # 计算单词的哈希值
        word_hash = hash(word)
        for i in range(64):
            # 统计每一位的哈希值的二进制位
            feature_vector[i] += 1 if (word_hash >> i) & 1 else -1
    # 将特征向量转换为 SimHash 值
    simhash_value = "".join(["1" if num >= 0 else "0" for num in feature_vector])
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, StringType())

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["document"]))

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

# 将 Hamming Distance 函数注册为 UDF
hamming_distance_udf = udf(hamming_distance, IntegerType())

# 注册 Hamming Distance UDF
spark.udf.register("hamming_distance_udf", hamming_distance_udf)

# 构建相似文档对的候选集
similar_docs = df.alias("d1").join(df.alias("d2"), "document_id") \
                .filter("d1.document_id < d2.document_id") \
                .selectExpr("d1.document_id as document_id1",
                            "d2.document_id as document_id2",
                            "hamming_distance_udf(d1.simhash, d2.simhash) as hamming_distance")

# 设置阈值，筛选出相似文档对
threshold = 3
similar_docs = similar_docs.filter(f"hamming_distance <= {threshold}")

# 显示相似的文档对
similar_docs.show()

AnalysisException: Column 'd2.document_id' does not exist. Did you mean one of the following? [d1.document_id, d2.document, d1.document, d2.simhash, d1.simhash]; line 1 pos 0;
'Project [document_id#270 AS document_id1#287, 'd2.document_id AS document_id2#288, hamming_distance_udf(simhash#275, simhash#281)#290 AS hamming_distance#289]
+- Project [document_id#270, document#271, simhash#275, document#280, simhash#281]
   +- Filter (document_id#270 < document_id#279)
      +- Project [document_id#270, document#271, simhash#275, document#280, simhash#281, document_id#279]
         +- Join Inner, (document_id#270 = document_id#279)
            :- SubqueryAlias d1
            :  +- Project [document_id#270, document#271, simhash(document#271)#274 AS simhash#275]
            :     +- LogicalRDD [document_id#270, document#271], false
            +- SubqueryAlias d2
               +- Project [document_id#279, document#280, simhash(document#280)#274 AS simhash#281]
                  +- LogicalRDD [document_id#279, document#280], false


In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType
from itertools import combinations

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSHExample").getOrCreate()

# 示例数据集
data = [("doc1", "The quick brown fox jumps over the lazy dog"),
        ("doc2", "The quick brown fox jumps over the quick dog"),
        ("doc3", "The slow brown fox jumps over the lazy dog"),
        ("doc4", "A brown fox jumps over a lazy dog")]

# 将数据集转换为 DataFrame
df = spark.createDataFrame(data, ["document_id", "document"])

# 定义 SimHash 函数
def simhash(document):
    # 将文档转换为哈希特征向量
    words = document.split(" ")
    words = [word for word in words if len(word) > 1]  # 过滤掉长度为 1 的单词
    words = words[:10]  # 只取前 10 个单词进行 SimHash 计算
    feature_vector = [0] * 64  # 初始化特征向量为全 0
    for word in words:
        # 计算单词的哈希值
        word_hash = hash(word)
        for i in range(64):
            # 统计每一位的哈希值的二进制位
            feature_vector[i] += 1 if (word_hash >> i) & 1 else -1
    # 将特征向量转换为 SimHash 值
    simhash_value = "".join(["1" if num >= 0 else "0" for num in feature_vector])
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, StringType())

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["document"]))

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

# 将 Hamming Distance 函数注册为 UDF
hamming_distance_udf = udf(hamming_distance, IntegerType())

# 注册 Hamming Distance UDF
spark.udf.register("hamming_distance_udf", hamming_distance_udf)

# 构建相似文档对的候选集
similar_docs = df.alias("d1").join(df.alias("d2")) \
                .filter("d1.document_id < d2.document_id")\
                .selectExpr("d1.document_id as document_id1",
                            "d2.document_id as document_id2",
                            "hamming_distance_udf(d1.simhash, d2.simhash) as hamming_distance")

# 设置阈值，筛选出相似文档对
threshold = 3
similar_docs = similar_docs.filter("hamming_distance <= {}".format(threshold))

similar_docs.show()

23/04/07 10:56:15 WARN SimpleFunctionRegistry: The function hamming_distance_udf replaced a previously registered function.
+------------+------------+----------------+
|document_id1|document_id2|hamming_distance|
+------------+------------+----------------+
+------------+------------+----------------+



