### 准备

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType, FloatType, ArrayType
from pyspark.ml.feature import HashingTF

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 创建示例数据
data = [
    ("document1", "you could create a dataset of job descriptions and calculate the candidate overlap"),
    ("document2", "you could do a dataset of job descriptions or calculate the candidate overlap"),
    ("document3", "the document similarity is subjective and in the eyes of the client"),
    ("document4", "The lazy black cat"),
]
df = spark.createDataFrame(data, ["document_id", "document"])
dfA = spark.read.csv("../dataset/SICK_train.csv", sep="	", header=True).select("pair_ID", "sentence_A")
dfB = spark.read.csv("../dataset/SICK_train.csv", sep="	", header=True).select("pair_ID", "sentence_B")

def preprocessDocument(document):
    return document.strip().lower().replace("[^\\w\\s]", "").replace("\\s+", " ")

### 不分桶的版本

In [9]:
# 定义 SimHash 函数
def simhash(document):
    document = preprocessDocument(document)
    # 将文档拆分为单词
    words = document.split(" ")

    # 计算每个单词的 SimHash 值
    hashes = []
    for word in words:
        word_hash = hash(word)
        # 使用 64 位 SimHash 值，将每个单词的哈希值转换为二进制表示，并填充到 64 位
        binary_hash = format(word_hash, "064b")
        # 将二进制表示的哈希值转换为 DenseVector，每个元素值为 -1 或 1
        vector = [1 if b == "1" else -1 for b in binary_hash]
        hashes.append(vector)
    # 将所有单词的 SimHash 值合并到一个数组中
    simhash_value = [0] * 64
    for v in hashes:
        for i in range(64):
            simhash_value[i] += v[i]
    simhash_value = [1 if x > 0 else 0 for x in simhash_value]
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, ArrayType(IntegerType()))

# 对文档进行 SimHash 计算
dfA = dfA.withColumn("simhash_A", simhash_udf(dfA["sentence_A"]))
dfB = dfB.withColumn("simhash_B", simhash_udf(dfB["sentence_B"]))

# 定义相似度计算函数
def similarity(simhash1, simhash2):
    length = len(simhash1)
    cnt = sum([1 for i in range(length) if simhash1[i] == simhash2[i]])
    return cnt / length

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

# 将 Hamming Distance 函数注册为 UDF
hamming_distance_udf = udf(hamming_distance, IntegerType())

# 将相似度计算函数注册为 UDF
similarity_udf = udf(similarity, FloatType())

# 计算文档之间的相似度
similar_documents = dfA.join(dfB, 'pair_ID') \
    .select('pair_ID', 'sentence_A', 'sentence_B', 
            similarity_udf(col("simhash_A"), col("simhash_B")).alias("similarity"),
            hamming_distance_udf(col("simhash_A"), col("simhash_B")).alias("distance"))

# 设置阈值过滤出相似的文档
# threshold = 20
# similar_documents = similar_documents.filter(col("distance") < threshold)

# 打印相似文档
similar_documents.show(truncate=False)

+-------+------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+----------+--------+
|pair_ID|sentence_A                                                                                |sentence_B                                                                         |similarity|distance|
+-------+------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+----------+--------+
|1      |A group of kids is playing in a yard and an old man is standing in the background         |A group of boys in a yard is playing and a man is standing in the background       |0.890625  |7       |
|2      |A group of children is playing in the house and there is no man standing in the background|A group of kids is playing in a yard and an old man is standing in the backgroun

### 分桶的版本
一共n个桶，假设规定海明距离小于k的文本为相似文本，则两个数据相似的条件为至少n-k个桶中的内容相同

In [14]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf, col, split, concat_ws, substring, expr, when
from pyspark.sql.types import IntegerType, FloatType, ArrayType
from pyspark.ml.feature import HashingTF

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 创建示例数据
data = [
    ("document1", "you could create a dataset of job descriptions and calculate the candidate overlap"),
    ("document2", "you could do a dataset of job descriptions or calculate the candidate overlap"),
    ("document3", "the document similarity is subjective and in the eyes of the client"),
    ("document4", "The lazy black cat"),
]
df = spark.createDataFrame(data, ["document_id", "document"])
print(df.rdd.getNumPartitions())

# 对文档进行 SimHash 计算
dfA = dfA.withColumn("simhash_A", simhash_udf(dfA["sentence_A"]))
dfB = dfB.withColumn("simhash_B", simhash_udf(dfB["sentence_B"]))

# 设置距离的阈值
thres_distance = 5

# 将 SimHash 值映射到桶(bucket)
num_buckets = 16 # 桶(bucket)的数量

# 使用 split 函数将数组列拆分为多个列，并计算每列应包含的元素数量
array_len = len(dfA.first()['simhash_A'])
elements_per_bucket = int(array_len / num_buckets)

# 循环创建新的数组列，并使用 split 函数拆分源数组列中的元素
for i in range(num_buckets):
    # 计算拆分的起始和结束位置
    start = i * elements_per_bucket
    end = start + elements_per_bucket
    # 使用 split 函数拆分数组列，并为新列命名
    new_col_name = f"bucket_{i}"
    dfA = dfA.withColumn(f"{new_col_name}_A", expr(f"slice(simhash_A, {start + 1}, {end - start})"))
    dfB = dfB.withColumn(f"{new_col_name}_B", expr(f"slice(simhash_B, {start + 1}, {end - start})"))
# df.show()

# 要比较的buckets list
columns_to_compare = [f"bucket_{i}" for i in range(num_buckets)]

# 计算每两个documents之间有多少buckets相同
matching_columns_expr = [expr(f"CASE WHEN {col_name}_A = {col_name}_B THEN 1 ELSE 0 END as matching_buckets_{col_name}") for col_name in columns_to_compare]

# 使用 select 函数选择需要的列，并显示结果
# df_result = df.alias("df1").join(df.alias("df2"), col("df1.document_id") < col("df2.document_id"))\
#     .select("df1.document_id", "df2.document_id", *matching_columns_expr)
# df_result.show()

# 累加求和操作
matching_columns_sum_expr = "+".join([f"matching_buckets_{col_name}" for col_name in columns_to_compare])
# 使用 join 函数连接两个 DataFrame，并使用 groupBy 和 sum 函数进行累加求和操作
df = dfA.join(dfB, 'pair_ID')\
    .select('*', *matching_columns_expr)\
    .groupBy("pair_ID", hamming_distance_udf(col("simhash_A"), col("simhash_B")).alias("distance"))\
    .agg(expr(f"sum({'+'.join([f'matching_buckets_{col_name}' for col_name in columns_to_compare])}) as matching_buckets_sum"))

# 设置阈值过滤出相似的文档
threshold = num_buckets - thres_distance
df = df.filter(col("matching_buckets_sum") >= threshold)

# 显示结果
df.show()

10
+-------+--------+--------------------+
|pair_ID|distance|matching_buckets_sum|
+-------+--------+--------------------+
|   2957|       6|                  11|
|   7222|       5|                  11|
|   7556|       3|                  13|
|   1501|       7|                  11|
|   4163|       6|                  12|
|   6104|       5|                  11|
|   7839|       5|                  11|
|   9006|       4|                  13|
|   2675|       2|                  14|
|   2175|       2|                  14|
|   1048|       5|                  11|
|   1946|       3|                  13|
|   3627|       6|                  11|
|   9394|       6|                  11|
|    857|       6|                  11|
|   2828|       3|                  13|
|   5248|       6|                  11|
|   7287|       7|                  11|
|   9497|       4|                  13|
|    772|       9|                  11|
+-------+--------+--------------------+
only showing top 20 rows



### 图像相似度对比（不用了）

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, FloatType
from PIL import Image
import numpy as np

# 创建 SparkSession
spark = SparkSession.builder.appName("SimHashLSH").getOrCreate()

# 加载示例图像
image1 = Image.open("data/test1.png")
image2 = Image.open("data/test2.png")

# 将图像缩放为 32x32 并转换为灰度图像
image1 = image1.resize((32, 32)).convert("L")
image2 = image2.resize((32, 32)).convert("L")

# 将图像数据转换为 NumPy 数组
image1_array = np.array(image1)
image2_array = np.array(image2)

# 将图像数据展平为一维数组
image1_vector = image1_array.flatten().tolist()
image2_vector = image2_array.flatten().tolist()

print(len(image1_vector))

# 将图像数据转换为 DataFrame
df = spark.createDataFrame([(image1_vector,), (image2_vector,)], ["image"])

# 注册 DataFrame 为临时表
df.createOrReplaceTempView("image_data")

# 将图像数据转换为 SimHash 值
def simhash(image_vector):
    # 将像素值转换为 SimHash 值
    hashes = []
    for pixel in image_vector:
        pixel_hash = hash(pixel)
        # 使用 64 位 SimHash 值，将像素值的哈希值转换为二进制表示，并填充到 64 位
        binary_hash = format(pixel_hash, "064b")
        # 将二进制表示的哈希值转换为 DenseVector，每个元素值为 -1 或 1
        vector = [1 if b == "1" else -1 for b in binary_hash]
        hashes.append(vector)
    # 将所有单词的 SimHash 值合并到一个数组中
    simhash_value = [0] * 64
    for v in hashes:
        for i in range(64):
            simhash_value[i] += v[i]
    simhash_value = [1 if x > 0 else 0 for x in simhash_value]
    return simhash_value

# 将 SimHash 函数注册为 UDF
simhash_udf = udf(simhash, ArrayType(IntegerType()))

# 对文档进行 SimHash 计算
df = df.withColumn("simhash", simhash_udf(df["image"]))

# 定义 Jaccard 相似度计算函数
def jaccard_similarity(simhash1, simhash2):
    length = len(simhash1)
    cnt = sum([1 for i in range(length) if simhash1[i] == simhash2[i]])
    return cnt / length

# 定义 Hamming Distance 函数
def hamming_distance(simhash1, simhash2):
    # 计算两个 SimHash 值的 Hamming 距离
    distance = sum([1 for i in range(64) if simhash1[i] != simhash2[i]])
    return distance

hamming_distance_udf = udf(hamming_distance, IntegerType())

# 计算 SimHash 值之间的 Jaccard 相似度
similar_documents = df.alias("a").join(df.alias("b"), col("a.image") < col("b.image")) \
    .select(col("a.image").alias("image1"),
            col("b.image").alias("image2"),
            jaccard_similarity_udf(col("a.simhash"), col("b.simhash")).alias("similarity"),
            hamming_distance_udf(col("a.simhash"), col("b.simhash")).alias("distance"))

df = df.alias("df1").crossJoin(df.alias("df2"))
df = df.filter(col("df1.image") < col("df2.image"))  # 排除与自身比较的情况
df = df.withColumn("distance", hamming_distance_udf(col("df1.simhash"), col("df2.simhash")))

similar_documents.show()

1024
+--------------------+--------------------+----------+--------+
|              image1|              image2|similarity|distance|
+--------------------+--------------------+----------+--------+
|[15, 19, 24, 18, ...|[107, 103, 103, 1...|  0.953125|       3|
+--------------------+--------------------+----------+--------+

