In [1]:
import time
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql.types import DoubleType

# minhash

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import MinHashLSH
from pyspark.sql.functions import udf,col
from pyspark.sql.types import LongType, FloatType

import random

config = SparkConf().setAppName("LSH").setMaster("local[8]")
sc = SparkContext.getOrCreate(config)
spark = SparkSession(sc)

In [3]:
import re

SHINGLE_LENGTH = 3
PRIME = 2147483647
SIMILARITY_THRESHOLD = 0.001
BAND_SIMILARITY_THRESHOLD = 0.001
NUMBER_OF_BANDS = 10
NUMBER_OF_ROWS = 5
NUMBER_OF_HASHFUNCTIONS = 20

class minLSH:
    def __init__(self, numHashTables=5, shingleSize=5, inputCol="features", outputCol="hashes"):
        self.numHashTables = numHashTables
        self.shingleSize = shingleSize
        self.inputCol = inputCol
        self.outputCol = outputCol
        self.hashFunctions = self.getHashFunctions(self.numHashTables)

    def getHashFunctions(self, number):
        result = []
        rand = random.Random()
        for _ in range(number):
            a = rand.randint(0, PRIME-1)
            b = rand.randint(0, PRIME-1)
            result.append(lambda x, a=a, b=b: ((a * x + b) % PRIME))
        return result
    
    def preprocessDocument(self, document):
        return document.strip().lower().replace("[^\\w\\s]", "").replace("\\s+", " ") if isinstance(document, str) else document

    def shingle(self, document):
        resultingList = []
        i = 0
        while i + self.shingleSize < len(document):
            resultingList.append(hash(document[i:i+self.shingleSize]))
            i += 1
        return resultingList

    def minHash(self, listOfShingles):
        result = []
        for ind in range(len(self.hashFunctions)):
            minVal = float("inf")
            for shingle in listOfShingles:
                hashResult = self.hashFunctions[ind](shingle)
                if hashResult < minVal:
                    minVal = hashResult
            result.append(minVal)
        return result
    
    def signatureToHashedBandsOfRows(self, signature, numberOfBands, numberOfRowsInBand):
        if len(signature) != numberOfBands * numberOfRowsInBand:
            raise Exception("Wrong arguments number of bands times number of rows should equal length of signature")
        i = 0
        bands = []
        while i + numberOfRowsInBand <= len(signature):
            bands.append(signature[i:i+numberOfRowsInBand])
            i += numberOfRowsInBand
        return [hash(tuple(band)) for band in bands]

    def fit(self, dataframe):
        return self

    def transform(self, dataframe):
        preprocessDocument_udf = udf(self.preprocessDocument, StringType())
        df = dataframe.withColumn("document", preprocessDocument_udf(self.inputCol))
        # 分词
        shingle_udf = udf(self.shingle, ArrayType(LongType()))
        df = df.withColumn("shingles", shingle_udf("document")).drop("document")
        # MinHash
        minHash_udf = udf(lambda x: self.minHash(x), ArrayType(LongType()))
        df = df.withColumn("minHash", minHash_udf("shingles")).drop("shingles")
        return df

    def approxSimilarityJoin(self, dataframeA, dataframeB, threshold, joinID="id"):
        # 相似度 udf
        jaccard_similarity = udf(lambda x, y: len(
                    set(x).intersection(set(y))) / len(set(x).union(set(y))), FloatType())

        # 相似度
        similarities = dataframeA.alias("left").join(dataframeB.alias("right"), on=joinID, how="inner") \
            .withColumn("similarity", jaccard_similarity(col("left.minHash"), col("right.minHash")))  \
            .filter(col("similarity") >= threshold)

        return similarities

In [4]:
manual_threshold = udf(lambda x: 1 if x>=3 else 0)

In [5]:
dfA = spark.read.csv("sick/train.csv", header=True).select("pair_ID", "sentence_A").withColumnRenamed("sentence_A", "sentence").withColumnRenamed("pair_ID", "id")
dfB = spark.read.csv("sick/train.csv", header=True).select("pair_ID", "sentence_B", "relatedness_score").withColumnRenamed("sentence_B", "sentence").withColumnRenamed("pair_ID", "id")

dfB = dfB.withColumn("relatedness_score", dfB["relatedness_score"].cast(DoubleType()))
dfB = dfB.withColumn("is_duplicate", manual_threshold("relatedness_score"))

dfA.show(5)
dfB.show(5)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  1|A group of kids i...|
|  2|A group of childr...|
|  3|The young boys ar...|
|  5|The kids are play...|
|  9|The young boys ar...|
+---+--------------------+
only showing top 5 rows

+---+--------------------+-----------------+------------+
| id|            sentence|relatedness_score|is_duplicate|
+---+--------------------+-----------------+------------+
|  1|A group of boys i...|              4.5|           1|
|  2|A group of kids i...|              3.2|           1|
|  3|The kids are play...|              4.7|           1|
|  5|A group of kids i...|              3.4|           1|
|  9|A group of kids i...|              3.7|           1|
+---+--------------------+-----------------+------------+
only showing top 5 rows



In [6]:
# num of partition 对时间的影响

for numPar in [5, 10, 20, 50]:
    A = dfA.repartition(numPar)
    B = dfB.repartition(numPar)

    model = minLSH(numHashTables=10, shingleSize=3, inputCol="sentence", outputCol="hashes")
    A = model.transform(A)
    B = model.transform(B)
    
    start_time = time.time()
    result = model.approxSimilarityJoin(A, B, 0, joinID="id")
    end_time = time.time()
    run_time = end_time - start_time
    
    print("Num of partition:", numPar, "; Time:", run_time)

Num of partition: 5 ; Time: 0.07942008972167969
Num of partition: 10 ; Time: 0.061505794525146484
Num of partition: 20 ; Time: 0.07738327980041504
Num of partition: 50 ; Time: 0.04362154006958008


In [7]:
# train set 的AUC

tmp = result.select("id", "is_duplicate", "similarity")
tmp = tmp.withColumn("is_duplicate", tmp["is_duplicate"].cast(DoubleType()))
metrics = BinaryClassificationMetrics(tmp.select("is_duplicate", "similarity").rdd.map(tuple))
metrics.areaUnderROC

                                                                                

0.6438155987559453