In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType, IntegerType
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col

spark = SparkSession.builder.master("local").appName("minLSH").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/19 11:31:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/19 11:31:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
import random
from pyspark.sql.types import ArrayType, FloatType, IntegerType,StructType
from pyspark.ml.feature import HashingTF
import mmh3

# MinHashLSH

In [6]:
class minLSH:
    def __init__(self, numHashTables=5, shingleSize=5, inputCol="features", outputCol="hashes"):
        self.numHashTables = numHashTables
        self.shingleSize = shingleSize
        self.inputCol = inputCol
        self.outputCol = outputCol
        self.seed = None
        if not self.seed:
            self.seed = random.randint(0, 10000)
        random.seed(self.seed)
        a_vals = [random.randint(1, numHashTables * 10) for _ in range(numHashTables)]
        b_vals = [random.randint(0, numHashTables * 10) for _ in range(numHashTables)]
        m = 1 << 32 - 1

        self.hash_funcs = [lambda x, a=a, b=b: mmh3.hash(str(x) + str(a) + str(b)) % m
                                for a, b in zip(a_vals, b_vals)]


    def _hash_func(self, shingle, hash_func):
        hash_val = hash_func(shingle)
        return hash_val
    
    def _shingle(self, vec):
        shingles = set()
        if len(vec) < self.shingleSize:
            return list(vec)
        for i in range(len(vec) - self.shingleSize + 1):
            shingles.add(Vectors.dense(vec[i:i + self.shingleSize]))
        return list(shingles)

    def _min_hash(self, vec):
        shingles = self._shingle(vec)
        hash_values = []
        for hf in self.hash_funcs:
            min_hash = float('inf')
            for shingle in shingles:
                hash_val = self._hash_func(shingle, hf)
                if hash_val < min_hash:
                    min_hash = hash_val
            hash_values.append(min_hash)
        return Vectors.dense(hash_values)
    
  
    def fit(self, dataset):
        return self

    def transform(self, dataset):
        min_hash_udf = udf(self._min_hash, VectorUDT())
        dataset = dataset.withColumn(
            self.outputCol, min_hash_udf(col(self.inputCol)))
        return dataset

    def approxSimilarityJoin(self, datasetA, datasetB, threshold):
        jaccard_similarity = udf(lambda x, y: len(
            set(x).intersection(set(y))) / len(set(x).union(set(y))), FloatType())

        for col_name in datasetA.columns:
            datasetA = datasetA.withColumnRenamed(col_name, col_name + "_1")
        for col_name in datasetB.columns:
            datasetB = datasetB.withColumnRenamed(col_name, col_name + "_2")

        joined = datasetA.crossJoin(datasetB)
        joined = joined.withColumn("jaccard_similarity", jaccard_similarity(
            joined[self.outputCol + "_1"], joined[self.outputCol + "_2"]))
        result = joined.filter(joined["jaccard_similarity"] >= threshold)
        return result



In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [54]:
data = [(0, Vectors.dense([1.0, 1.0])),
        (1, Vectors.dense([1.0, 2.0])),
        (2, Vectors.dense([3.0, 2.0])),
        (3, Vectors.dense([2.0, 2.0])),
        (4, Vectors.dense([2.0, 3.0])),
        (5, Vectors.dense([3.0, 2.0])),
        ]

df = spark.createDataFrame(data, ["id", "features"])
min_lsh = minLSH(numHashTables=5, inputCol="features", outputCol="hashes")
min_lsh_model = min_lsh.fit(df)
transformed_df = min_lsh_model.transform(df)
transformed_df.show()
datasetA = transformed_df.filter(df['id'] <= 2)
datasetB = transformed_df.filter(df['id'] >= 3)
print(transformed_df.select("hashes").take(1))
result = min_lsh_model.approxSimilarityJoin(datasetA, datasetB, threshold=0)
result.show()


+---+---------+--------------------+
| id| features|              hashes|
+---+---------+--------------------+
|  0|[1.0,1.0]|[1.666456792E9,5....|
|  1|[1.0,2.0]|[1.666456792E9,5....|
|  2|[3.0,2.0]|[3.6419452E8,8.85...|
|  3|[2.0,2.0]|[1.874009501E9,8....|
|  4|[2.0,3.0]|[3.6419452E8,8.85...|
|  5|[3.0,2.0]|[3.6419452E8,8.85...|
+---+---------+--------------------+

[Row(hashes=DenseVector([1666456792.0, 547096464.0, 492682732.0, 408193507.0, 339579175.0]))]
23/04/16 19:51:57 WARN ExtractPythonUDFFromJoinCondition: The join condition:(<lambda>(hashes_1#7981, hashes_2#7993)#8005 >= 0.0) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


[Stage 53:>                                                         (0 + 1) / 1]

+----+----------+--------------------+----+----------+--------------------+------------------+
|id_1|features_1|            hashes_1|id_2|features_2|            hashes_2|jaccard_similarity|
+----+----------+--------------------+----+----------+--------------------+------------------+
|   0| [1.0,1.0]|[1.666456792E9,5....|   3| [2.0,2.0]|[1.874009501E9,8....|               0.0|
|   0| [1.0,1.0]|[1.666456792E9,5....|   4| [2.0,3.0]|[3.6419452E8,8.85...|               0.0|
|   0| [1.0,1.0]|[1.666456792E9,5....|   5| [3.0,2.0]|[3.6419452E8,8.85...|               0.0|
|   1| [1.0,2.0]|[1.666456792E9,5....|   3| [2.0,2.0]|[1.874009501E9,8....|               0.0|
|   1| [1.0,2.0]|[1.666456792E9,5....|   4| [2.0,3.0]|[3.6419452E8,8.85...|               0.0|
|   1| [1.0,2.0]|[1.666456792E9,5....|   5| [3.0,2.0]|[3.6419452E8,8.85...|               0.0|
|   2| [3.0,2.0]|[3.6419452E8,8.85...|   3| [2.0,2.0]|[1.874009501E9,8....|        0.11111111|
|   2| [3.0,2.0]|[3.6419452E8,8.85...|   4| [2.0,3

                                                                                

In [36]:
train = spark.read.csv('./mnist/mnist_train.csv', header=True, inferSchema=True)
columns = train.columns[1:]
# 合并为 vector
assembler = VectorAssembler(inputCols=columns, outputCol='features')
train = assembler.transform(train)
train = train.drop(*columns)
# 改为 DenseVector
toDense = lambda v: Vectors.dense(v.toArray())
toDenseUdf = udf(toDense, VectorUDT())
train = train.withColumn('features', toDenseUdf('features'))


                                                                                

In [6]:
train.select('features').show(5, truncate=False)

23/04/16 17:27:12 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 2:>                                                          (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

# Train

In [43]:
image_df = spark.createDataFrame(train.take(10), ["id", "features"])
min_lsh = minLSH(numHashTables=300, shingleSize=20, inputCol="features", outputCol="hashes")
min_lsh_model = min_lsh.fit(image_df)
transformed_df = min_lsh_model.transform(image_df)
transformed_df.show()

[Stage 41:>                                                         (0 + 1) / 1]

+---+--------------------+--------------------+
| id|            features|              hashes|
+---+--------------------+--------------------+
|  5|[0.0,0.0,0.0,0.0,...|[6809216.0,703145...|
|  0|[0.0,0.0,0.0,0.0,...|[4740876.0,170893...|
|  4|[0.0,0.0,0.0,0.0,...|[1.3840089E7,1371...|
|  1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|
|  9|[0.0,0.0,0.0,0.0,...|[7251994.0,1.6084...|
|  2|[0.0,0.0,0.0,0.0,...|[2337324.0,940756...|
|  1|[0.0,0.0,0.0,0.0,...|[4841990.0,1.2461...|
|  3|[0.0,0.0,0.0,0.0,...|[1.9987955E7,2269...|
|  1|[0.0,0.0,0.0,0.0,...|[1.3771277E7,9414...|
|  4|[0.0,0.0,0.0,0.0,...|[9852210.0,100924...|
+---+--------------------+--------------------+



                                                                                

In [45]:
result = min_lsh_model.approxSimilarityJoin(transformed_df, transformed_df, threshold=0)
result.where("id_1=1").show()

23/04/16 19:38:32 WARN ExtractPythonUDFFromJoinCondition: The join condition:(<lambda>(hashes_1#7661, hashes_2#7673)#7685 >= 0.0) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.


[Stage 43:>                                                         (0 + 1) / 1]

+----+--------------------+--------------------+----+--------------------+--------------------+------------------+
|id_1|          features_1|            hashes_1|id_2|          features_2|            hashes_2|jaccard_similarity|
+----+--------------------+--------------------+----+--------------------+--------------------+------------------+
|   1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|   5|[0.0,0.0,0.0,0.0,...|[6809216.0,703145...|               0.0|
|   1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|   0|[0.0,0.0,0.0,0.0,...|[4740876.0,170893...|       0.001669449|
|   1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|   4|[0.0,0.0,0.0,0.0,...|[1.3840089E7,1371...|       0.001669449|
|   1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|   1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|               1.0|
|   1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|   9|[0.0,0.0,0.0,0.0,...|[7251994.0,1.6084...|               0.0|
|   1|[0.0,0.0,0.0,0.0,...|[2681899.0,612072...|   2|[0.0,0.0,0.0,0.0,...|[23373

                                                                                

In [5]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
data = spark.createDataFrame([
    ("1", "you could create a dataset of job descriptions and calculate the candidate overlap"),
    ("2", "you could do a dataset of job descriptions or calculate the candidate overlap"),
    ("3", "the document similarity is subjective and in the eyes of the client"),
    ("4", "The lazy black cat")
], ["id", "text"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(data)
wordsData.show()

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
featurizedData.show()

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show()

+---+--------------------+--------------------+
| id|                text|               words|
+---+--------------------+--------------------+
|  1|you could create ...|[you, could, crea...|
|  2|you could do a da...|[you, could, do, ...|
|  3|the document simi...|[the, document, s...|
|  4|  The lazy black cat|[the, lazy, black...|
+---+--------------------+--------------------+

+---+--------------------+--------------------+--------------------+
| id|                text|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|  1|you could create ...|[you, could, crea...|(262144,[3564,586...|
|  2|you could do a da...|[you, could, do, ...|(262144,[58672,72...|
|  3|the document simi...|[the, document, s...|(262144,[8449,439...|
|  4|  The lazy black cat|[the, lazy, black...|(262144,[51504,95...|
+---+--------------------+--------------------+--------------------+

23/04/19 11:33:29 WARN DAGScheduler: Broadcasting large task b

In [9]:
# 改为 DenseVector
toDense = lambda v: Vectors.dense(v.toArray())
toDenseUdf = udf(toDense, VectorUDT())
trainData = rescaledData.withColumn('features', toDenseUdf('features'))

text_df = trainData.select('id', 'features')
min_lsh = minLSH(numHashTables=10, shingleSize=10, inputCol="features", outputCol="hashes")
min_lsh_model = min_lsh.fit(text_df)
transformed_df = min_lsh_model.transform(text_df)
transformed_df.show()

23/04/19 11:41:58 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB


                                                                                

+---+--------------------+--------------------+
| id|            features|              hashes|
+---+--------------------+--------------------+
|  1|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|
|  2|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|
|  3|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|
|  4|[0.0,0.0,0.0,0.0,...|[1.47850749E8,1.3...|
+---+--------------------+--------------------+



In [11]:
result = min_lsh_model.approxSimilarityJoin(transformed_df, transformed_df, threshold=0)
result.show()

23/04/19 11:42:47 WARN ExtractPythonUDFFromJoinCondition: The join condition:(<lambda>(hashes_1#263, hashes_2#275)#290 >= 0.0) of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.
23/04/19 11:42:48 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


                                                                                

+----+--------------------+--------------------+----+--------------------+--------------------+------------------+
|id_1|          features_1|            hashes_1|id_2|          features_2|            hashes_2|jaccard_similarity|
+----+--------------------+--------------------+----+--------------------+--------------------+------------------+
|   1|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|   1|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|               1.0|
|   1|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|   2|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|               1.0|
|   1|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|   3|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|               1.0|
|   1|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|   4|[0.0,0.0,0.0,0.0,...|[1.47850749E8,1.3...|              0.25|
|   2|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|   1|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|               1.0|
|   2|[0.0,0.0,0.0,0.0,...|[1.8381546E7,3.14...|   2|[0.0,0.0,0.0,0.0,...|[1.838

23/04/19 16:24:11 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1054191 ms exceeds timeout 120000 ms
23/04/19 16:24:11 WARN SparkContext: Killing executors is not supported by current scheduler.
23/04/19 17:40:36 WARN TransportChannelHandler: Exception in connection from /192.168.0.86:53866
java.io.IOException: Operation timed out
	at sun.nio.ch.FileDispatcherImpl.read0(Native Method)
	at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39)
	at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223)
	at sun.nio.ch.IOUtil.read(IOUtil.java:192)
	at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:380)
	at io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:258)
	at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
	at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:350)
	at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
	at io.netty.channel