<a href="https://colab.research.google.com/github/NateMophi/SCC-454/blob/main/LAB4/SCC454_Lab4_ipynbynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (this import is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install required packages (-q keeps pip output quiet).
# NOTE(review): only pyspark is version-pinned; sentence-transformers, numpy
# and pandas resolve to whatever is current at run time.
!pip install pyspark==3.5.0 -q
!pip install sentence-transformers -q
!pip install numpy pandas -q

# Install Java (Spark requires Java); all apt-get stdout is discarded.
!apt-get install openjdk-11-jdk-headless -qq > /dev/null

# Set Java environment variable for this process to the OpenJDK 11 directory.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Printed unconditionally: nothing above checks that the shell commands
# actually succeeded.
print("All packages installed successfully!")

All packages installed successfully!


In [3]:
import numpy as np
import hashlib
from typing import List, Set, Tuple

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import(
    col, udf, explode, array, lit, collect_list, size, lower, regexp_replace, split, monotonically_increasing_id, struct,
    when, coalesce, broadcast
)

from pyspark.sql.types import(
    ArrayType, StringType, IntegerType, FloatType, StructType, StructField,
    DoubleType
)

from pyspark.ml.feature import (
    HashingTF, CountVectorizer, MinHashLSH,
    Tokenizer, StopWordsRemover, NGram
)

from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT
from pyspark.ml import Pipeline



In [5]:
# Build (or reuse) the Spark session used for the LSH operations.
# The builder chain is parenthesised so no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .appName("SCC454-LocalitySensitiveHashing")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext.
sc = spark.sparkContext

print(f"Spark Version: {spark.version}")
print(f"App Name: {sc.appName}")
print("\nSpark Session Ready for LSH ops!")

Spark Version: 3.5.0
App Name: SCC454-LocalitySensitiveHashing

Spark Session Ready for LSH ops!


## SAMPLE DATASET

In [6]:
# Sample document corpus with varying degrees of similarity.
# Each entry is (id, text). Consecutive entries are grouped by topic so that
# near-duplicate pairs sit next to each other; the last document shares a
# topic with none of the others.
documents = [
    # Machine learning / AI (2 overlaps only loosely with 0 and 1)
    (0, "Machine learning is a subset of artificial intelligence that enables systems to learn from data."),
    (1, "Artificial intelligence and machine learning allow computers to learn from data automatically."),
    (2, "Deep learning is a type of machine learning using neural networks with many layers."),
    # Weather
    (3, "The weather today is sunny with a high of 25 degrees celsius."),
    (4, "Today's weather forecast shows sunny skies and temperatures around 25 degrees."),
    # Natural language processing
    (5, "Natural language processing helps computers understand human language."),
    (6, "NLP enables machines to process and understand natural human language."),
    # Python for data science
    (7, "Python is a popular programming language for data science and machine learning."),
    (8, "Data science often uses Python programming for machine learning applications."),
    # Cat on a mat
    (9, "The cat sat on the mat and watched the birds outside the window."),
    (10, "A small cat was sitting on a mat, watching birds through the window."),
    # Spark / big data
    (11, "Apache Spark provides distributed computing for big data processing."),
    (12, "Big data processing is made efficient through distributed computing with Spark."),
    # Locality sensitive hashing
    (13, "Locality sensitive hashing enables fast approximate nearest neighbor search."),
    (14, "LSH provides fast approximate nearest neighbor queries using hashing techniques."),
    # Unrelated to every other document
    (15, "The restaurant serves delicious Italian pasta and fresh salads daily."),
]

# Column names are supplied explicitly; column types are left for Spark to infer.
df_docs = spark.createDataFrame(documents, ["id", "text"])

print("Sample Document Corpus:")
# truncate=60 shortens long text cells in the printed table.
df_docs.show(truncate=60)

Sample Document Corpus:
+---+------------------------------------------------------------+
| id|                                                        text|
+---+------------------------------------------------------------+
|  0|Machine learning is a subset of artificial intelligence t...|
|  1|Artificial intelligence and machine learning allow comput...|
|  2|Deep learning is a type of machine learning using neural ...|
|  3|The weather today is sunny with a high of 25 degrees cels...|
|  4|Today's weather forecast shows sunny skies and temperatur...|
|  5|Natural language processing helps computers understand hu...|
|  6|NLP enables machines to process and understand natural hu...|
|  7|Python is a popular programming language for data science...|
|  8|Data science often uses Python programming for machine le...|
|  9|The cat sat on the mat and watched the birds outside the ...|
| 10|A small cat was sitting on a mat, watching birds through ...|
| 11|Apache Spark provides distributed

## **DOCUMENT SHINGLING**

---
# Part 2: Document Shingling
---

## 2.1 Understanding Shingling

**What is Shingling?**
Shingling converts documents into sets of contiguous subsequences (shingles). This allows us to measure document similarity by comparing these sets.

**Types of Shingles:**

| Type | Description | Example ("hello world") |
|------|-------------|-------------------------|
| Character k-shingles | Contiguous k characters | {"hel", "ell", "llo", "lo ", "o w", ...} for k=3 |
| Word n-grams | Contiguous n words | {"hello world"} for n=2 |

**Choosing Shingle Size:**
- Too small: High overlap even for dissimilar documents
- Too large: Low overlap even for similar documents
- Rule of thumb: k=5-9 for characters, n=2-4 for words

In [7]:
# CHARACTER SHINGLES
def get_char_shingles(text: str, k: int = 5) -> List[str]:
  """Generate character k-shingles from text.

  The text is lower-cased and every run of whitespace is collapsed to a
  single space (leading/trailing whitespace is dropped) before a window of
  ``k`` characters is slid across it one position at a time.

  Args:
    text: Input document.
    k: Shingle length in characters; must be at least 1.

  Returns:
    Shingles in order of appearance, duplicates included. The list is empty
    when the normalised text is shorter than ``k``.

  Raises:
    ValueError: If ``k`` is smaller than 1.
  """
  if k < 1:
    # k == 0 would produce nothing but empty strings and a negative k would
    # produce ragged slices; neither is a meaningful shingle set.
    raise ValueError(f"k must be a positive integer, got {k}")

  normalized = " ".join(text.lower().split())

  # One shingle per start offset that still leaves room for k characters.
  return [normalized[i:i + k] for i in range(len(normalized) - k + 1)]

# Example: shingle a short string and report total vs. distinct shingle counts.
sample_text = "Hello World"
char_shingles = get_char_shingles(sample_text, k=5)
print(
    f"Text: '{sample_text}'",
    f"5-character shingles: {char_shingles}",
    f"Number of shingles: {len(char_shingles)}",
    f"Unique shingles: {len(set(char_shingles))}",
    sep="\n",
)

Text: 'Hello World'
5-character shingles: ['hello', 'ello ', 'llo w', 'lo wo', 'o wor', ' worl', 'world']
Number of shingles: 7
Unique shingles: 7


In [8]:
# WORD SHINGLES
def get_word_shingles(text: str, n: int = 3) -> List[str]:
  """Generate word n-gram shingles from text.

  The text is lower-cased and split on whitespace; each shingle is ``n``
  consecutive words joined by a single space. Punctuation is left attached
  to the word it appears with.

  Args:
    text: Input document.
    n: Number of words per shingle; must be at least 1.

  Returns:
    Shingles in order of appearance, duplicates included. The list is empty
    when the text has fewer than ``n`` words.

  Raises:
    ValueError: If ``n`` is smaller than 1.
  """
  if n < 1:
    # n == 0 would produce nothing but empty strings and a negative n would
    # produce ragged word groups; neither is a meaningful shingle set.
    raise ValueError(f"n must be a positive integer, got {n}")

  words = text.lower().split()

  # One shingle per start index that still leaves room for n words.
  return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

# Example
sample_text = "Machine learning is a subset of artificial intelligence"
word_shingles = get_word_shingles(sample_text, n=3)

print(f"Text: '{sample_text}'")
print("\n3-word shingles:")
# Number the shingles from 1 for display.
for position, shingle in enumerate(word_shingles, start=1):
    print(f"  {position}. '{shingle}'")

Text: 'Machine learning is a subset of artificial intelligence'

3-word shingles:
  1. 'machine learning is'
  2. 'learning is a'
  3. 'is a subset'
  4. 'a subset of'
  5. 'subset of artificial'
  6. 'of artificial intelligence'
