This notebook shows the implementation of different algorithms related to finding textually similar documents. Based on Jaccard similarity, we implement shingling, minhashing, and locality-sensitive hashing (LHS). 

Using a collection of NSF research award abstracts obtained from https://archive.ics.uci.edu/ml/datasets/NSF+Research+Award+Abstracts+1990-2003 we produce a collection of vectors containing hashed shingles based on k-grams of size 5 hashed using MurmurHash. We then use minhash with 100 different hashing algorithms by randomly generating coefficients. 

In [5]:
//input
val fileName1 = "dataset_abstracts/a9000038.txt"
val fileName2 = "dataset_abstracts/a9000040.txt"

val rows1 = sc.textFile(fileName1).map(line=>line.trim().replaceAll("(\\s)+", " ")).cache()
val text1 = rows1.reduce(_ + _)

val rows2 = sc.textFile(fileName2).map(line=>line.trim().replaceAll("(\\s)+", " ")).cache()
val text2 = rows2.reduce(_ + _)

fileName1 = dataset_abstracts/a9000038.txt
fileName2 = dataset_abstracts/a9000040.txt
rows1 = MapPartitionsRDD[2] at map at <console>:31
text1 = Title : Mathematical Sciences: Research on Optimal Stochastic Control and NonlinearEstimationType : AwardNSF Org : DMSLatestAmendmentDate : April 8, 1992File : a9000038Award Number: 9000038Award Instr.: Continuing grantPrgm Manager:DMS DIVISION OF MATHEMATICAL SCIENCESMPS DIRECT FOR MATHEMATICAL & PHYSICAL SCIENStart Date : July 1, 1990Expires : December 31, 1993 (Estimated)ExpectedTotal Amt. : $188574 (Estimated)Investigator: Wendell H. Fleming whf@cfm.brown.edu (Principal Investigator current)Sponsor : Brown University164 Angell StreetProvidence, RI 02912 401/863-2777NSF Program : 1266...


Title : Mathematical Sciences: Research on Optimal Stochastic Control and NonlinearEstimationType : AwardNSF Org : DMSLatestAmendmentDate : April 8, 1992File : a9000038Award Number: 9000038Award Instr.: Continuing grantPrgm Manager:DMS DIVISION OF MATHEMATICAL SCIENCESMPS DIRECT FOR MATHEMATICAL & PHYSICAL SCIENStart Date : July 1, 1990Expires : December 31, 1993 (Estimated)ExpectedTotal Amt. : $188574 (Estimated)Investigator: Wendell H. Fleming whf@cfm.brown.edu (Principal Investigator current)Sponsor : Brown University164 Angell StreetProvidence, RI 02912 401/863-2777NSF Program : 1266...

In [6]:
//k-gram
val k = 5
val k_gram1 = text1.split("").sliding(k).toList.map(x => (scala.util.hashing.MurmurHash3.arrayHash(x), ""))
val kgramRDD1 = sc.parallelize(k_gram1)
val filtered1 = kgramRDD1.distinct()

val k_gram2 = text2.split("").sliding(k).toList.map(x => (scala.util.hashing.MurmurHash3.arrayHash(x), ""))
val kgramRDD2 = sc.parallelize(k_gram2)
val filtered2 = kgramRDD2.distinct()


k = 5
k_gram1 = List((-1874370966,""), (1388959843,""), (-867864100,""), (-317192946,""), (-671188304,""), (1529582140,""), (2071593523,""), (-347942801,""), (-96038573,""), (1131329007,""), (2139931382,""), (-1615723425,""), (495118698,""), (-2011491347,""), (-1361060683,""), (834301842,""), (1335086780,""), (-954345614,""), (179541937,""), (1490961429,""), (-1847526272,""), (1142132048,""), (-938932799,""), (1873655578,""), (2035819669,""), (79787119,""), (949560031,""), (-147311775,""), (-1920259009,""), (338189782,""), (-1661430351,""), (-320725340,""), (1013651795,""), (-7031774,""), (-274139822,""), (-921542000,""), (-1675875172,""), (-2115819354,""), (857143427,""), (-1815694727,""), (1907031089,""), (1231827076,""), (-1156846470,""), (-1356510284,""), (...


List((-1874370966,""), (1388959843,""), (-867864100,""), (-317192946,""), (-671188304,""), (1529582140,""), (2071593523,""), (-347942801,""), (-96038573,""), (1131329007,""), (2139931382,""), (-1615723425,""), (495118698,""), (-2011491347,""), (-1361060683,""), (834301842,""), (1335086780,""), (-954345614,""), (179541937,""), (1490961429,""), (-1847526272,""), (1142132048,""), (-938932799,""), (1873655578,""), (2035819669,""), (79787119,""), (949560031,""), (-147311775,""), (-1920259009,""), (338189782,""), (-1661430351,""), (-320725340,""), (1013651795,""), (-7031774,""), (-274139822,""), (-921542000,""), (-1675875172,""), (-2115819354,""), (857143427,""), (-1815694727,""), (1907031089,""), (1231827076,""), (-1156846470,""), (-1356510284,""), (...

In [7]:

val join = filtered1.leftOuterJoin(filtered2)
val join_n = join.count().toFloat
val union = filtered1.union(filtered2).distinct()
val union_n = union.count().toFloat

val sim = join_n / union_n
val jacc_dist = 1-sim

println(sim)
println(jacc_dist)


0.6185781
0.38142192


join = MapPartitionsRDD[16] at leftOuterJoin at <console>:28
join_n = 1192.0
union = MapPartitionsRDD[20] at distinct at <console>:30
union_n = 1927.0
sim = 0.6185781
jacc_dist = 0.38142192


0.38142192

Now we work with Min-hashing

In [8]:
import java.io.File

val dir = new File("dataset_abstracts")
val files = dir.listFiles() //379
val maxSize = 10
val docSize = if(files.size < maxSize) files.size else maxSize

val docs = files.map(f => (f.getName(), sc.textFile(f.getPath() ).map(line=>line.replaceAll("(\\s)+", " ")).collect().reduce(_ + _)))
val docsRDD = sc.parallelize(docs).cache()
println(docsRDD.take(1)(0))

(a9000038.txt,Title : Mathematical Sciences: Research on Optimal Stochastic Control and Nonlinear EstimationType : AwardNSF Org : DMS LatestAmendmentDate : April 8, 1992 File : a9000038Award Number: 9000038Award Instr.: Continuing grant Prgm Manager:  DMS DIVISION OF MATHEMATICAL SCIENCES  MPS DIRECT FOR MATHEMATICAL & PHYSICAL SCIENStart Date : July 1, 1990 Expires : December 31, 1993 (Estimated)ExpectedTotal Amt. : $188574 (Estimated)Investigator: Wendell H. Fleming whf@cfm.brown.edu (Principal Investigator current)Sponsor : Brown University 164 Angell Street Providence, RI 02912 401/863-2777NSF Program : 1266 APPLIED MATHEMATICSFld Applictn: 0000099 Other Applications NEC  21 Mathematics Program Ref : Abstract : This research is part of an on-going program by the  principal investigator and associates. Topics in the following  areas are to be considered: (1) controlled Markov diffusions  and nonlinear PDEs; (2) asymptotic properties of nearly  deterministic Markov processes; (3) fin

dir = dataset_abstracts
files = Array(dataset_abstracts/a9000038.txt, dataset_abstracts/a9000040.txt, dataset_abstracts/a9000043.txt, dataset_abstracts/a9000045.txt, dataset_abstracts/a9000046.txt, dataset_abstracts/a9000048.txt, dataset_abstracts/a9000049.txt, dataset_abstracts/a9000050.txt, dataset_abstracts/a9000052.txt, dataset_abstracts/a9000053.txt, dataset_abstracts/a9000054.txt, dataset_abstracts/a9000057.txt, dataset_abstracts/a9000058.txt, dataset_abstracts/a9000060.txt, dataset_abstracts/a9000063.txt, dataset_abstracts/a9000075.txt, dataset_abstracts/a9000089.txt, dataset_abstracts/a9000091.txt, dataset_abstracts/a9000094.txt, dataset_abstracts/a9000099.txt, dataset_abstracts/a9000100.txt, dataset_abstracts/a9000102.txt, ...


Array(dataset_abstracts/a9000038.txt, dataset_abstracts/a9000040.txt, dataset_abstracts/a9000043.txt, dataset_abstracts/a9000045.txt, dataset_abstracts/a9000046.txt, dataset_abstracts/a9000048.txt, dataset_abstracts/a9000049.txt, dataset_abstracts/a9000050.txt, dataset_abstracts/a9000052.txt, dataset_abstracts/a9000053.txt, dataset_abstracts/a9000054.txt, dataset_abstracts/a9000057.txt, dataset_abstracts/a9000058.txt, dataset_abstracts/a9000060.txt, dataset_abstracts/a9000063.txt, dataset_abstracts/a9000075.txt, dataset_abstracts/a9000089.txt, dataset_abstracts/a9000091.txt, dataset_abstracts/a9000094.txt, dataset_abstracts/a9000099.txt, dataset_abstracts/a9000100.txt, dataset_abstracts/a9000102.txt, ...

In [9]:
val abstracts = docsRDD.map(doc => (doc._1,doc._2.split("Abstract :"))).filter(a => a._2.size > 1).map(a => (a._1, a._2(1)))

//TODO RDD[ RDD [String]] --> RDD[(docId, shingle)]
val k = 5
val k_gram = abstracts.map(a => (a._1, a._2.split("").sliding(k).toList.map(x => scala.util.hashing.MurmurHash3.arrayHash(x))))
println(k_gram)
val documents = k_gram.map(k => (k._1, k._2.distinct))
//val kgramRDD = sc.parallelize(kgram_flat)
//println(documents.take(1)(0))
//val filtered = kgram_flat.groupByKey().map(a => a._2.unique)
//println(filtered.take(1))

MapPartitionsRDD[1156] at map at <console>:34


abstracts = MapPartitionsRDD[1155] at map at <console>:30
k = 5
k_gram = MapPartitionsRDD[1156] at map at <console>:34
documents = MapPartitionsRDD[1157] at map at <console>:36


MapPartitionsRDD[1157] at map at <console>:36

In [68]:
//Min-hashing

val p = 1073676287
val m = p + 1
def hashThis(a:Long, b:Long, x:Long): Long = {
    ((a*x + b) % p ) % m
}
def generateRandomHashFunc(i:Int): (Long, Long) = {
    val r = new scala.util.Random(i)
    val a2 = r.nextInt(p-1)
    val a = if(a2%2==0) a2+1 else a2
    val b = r.nextInt(p-1)
    (a, b)
}
//var doc_signatures:List[List[(String, Long)]] = List.empty[List[(String, Long)]]

val n = 2
val hash_algos = (1 to n).map(i => generateRandomHashFunc(i))
val doc_signatures = hash_algos.map(h => {
    val min = documents.map(doc => (doc._1, doc._2.map(s => s.toLong).reduce((x,y) => hashThis(h._1, h._2, x) min hashThis(h._1, h._2, y) )))
    min
})
println(doc_signatures.take(1))


//compare signatures between different hashes

Vector(MapPartitionsRDD[1213] at map at <console>:55)


p = 1073676287
m = 1073676288
n = 2
hash_algos = Vector((495872699,215764588), (496449823,630021372))
doc_signatures = Vector(MapPartitionsRDD[1213] at map at <console>:55, MapPartitionsRDD[1214] at map at <console>:55)


hashThis: (a: Long, b: Long, x: Long)Long
generateRandomHashFunc: (i: Int)(Long, Long)


Vector(MapPartitionsRDD[1213] at map at <console>:55, MapPartitionsRDD[1214] at map at <console>:55)

In [None]:
for(i<-0 to 1){
    
    val r = new scala.util.Random(i)
    val a2 = r.nextInt(p-1)
    val a = if(a2%2==0) a2+1 else a2
    val b = r.nextInt(p-1)
    
    val min = documents.map(doc => (doc._1, doc._2.map(s => s.toLong).reduce((x,y) => hashThis(a, b, x) min hashThis(a, b, y) )))
    println(min.take(1)(0))
    //val min = filtered1.map(s => s._1.toLong).reduce((x,y) => hashThis(a, b, x) min hashThis(a, b, y) )
    
    //min.map(m => doc_signature :+ m)
    val collection = min.collect().toList
    doc_signatures = doc_signatures :+ collection
}

val docSignatures = hm.map { case (key,value) =>
  for (i <- 0 until value) {
    new MyObject(key, "a string", i)
  }}.toSeq
