In [1]:
# Question 1 (a)(b)

from itertools import combinations
from collections import Counter

# Exercise data: three sets, and three bags (multisets). Bags are stored as
# lists so that repeated elements are preserved.
sets = [{1, 2, 3, 4}, {2, 3, 5, 7}, {2, 4, 6}]

bags = [[1, 1, 1, 2], [1, 1, 2, 2, 3], [1, 2, 3, 4]]

# Function to compute Jaccard similarity between two sets
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |set1 ∩ set2| / |set1 ∪ set2|.

    Args:
        set1: A set (or frozenset) of hashable elements.
        set2: A set or any iterable of hashable elements.

    Returns:
        A float in [0, 1]. Two empty sets are treated as identical and give
        1.0 instead of raising ZeroDivisionError.
    """
    union = set1.union(set2)
    if not union:
        # Both inputs are empty, so the ratio would be 0/0.
        return 1.0
    return len(set1.intersection(set2)) / len(union)

# Function to compute Jaccard bag similarity between two bags
def jaccard_bag_similarity(bag1, bag2, union_mode="max"):
    """Return the Jaccard similarity of two bags (multisets).

    The intersection counts each element min(count1, count2) times.

    Args:
        bag1: Iterable of hashable elements; repeats are significant.
        bag2: Iterable of hashable elements; repeats are significant.
        union_mode: How the bag union counts an element.
            "max" (default, the original behaviour) counts it
            max(count1, count2) times, so identical bags score 1.0.
            "sum" counts it count1 + count2 times, so identical bags
            score 0.5.
            NOTE(review): some textbooks define the bag union as the sum of
            counts; confirm which definition the exercise intends.

    Returns:
        A float; 1.0 when both bags are empty (instead of dividing by zero).

    Raises:
        ValueError: If union_mode is neither "max" nor "sum".
    """
    counts1 = Counter(bag1)
    counts2 = Counter(bag2)
    # Counter "&" keeps the minimum count of each element.
    intersection_sum = sum((counts1 & counts2).values())
    if union_mode == "max":
        # Counter "|" keeps the maximum count of each element.
        union_sum = sum((counts1 | counts2).values())
    elif union_mode == "sum":
        union_sum = sum(counts1.values()) + sum(counts2.values())
    else:
        raise ValueError(f"union_mode must be 'max' or 'sum', got {union_mode!r}")
    if union_sum == 0:
        # Both bags are empty, so the ratio would be 0/0.
        return 1.0
    return intersection_sum / union_sum

# Compute Jaccard similarities for each pair of sets.
# Jaccard similarity is symmetric, so the two sets are passed as-is; ordering
# them by id() added nothing and depended on memory addresses.
jaccard_similarities = {
    f"Sets {i+1} and {j+1}": jaccard_similarity(sets[i], sets[j])
    for i, j in combinations(range(len(sets)), 2)
}

# Compute Jaccard bag similarities for each pair of bags
# (same i < j pair enumeration as for the sets).
jaccard_bag_similarities = {
    f"Bags {i+1} and {j+1}": jaccard_bag_similarity(bags[i], bags[j])
    for i, j in combinations(range(len(bags)), 2)
}

jaccard_similarities, jaccard_bag_similarities

({'Sets 1 and 2': 0.3333333333333333,
  'Sets 1 and 3': 0.4,
  'Sets 2 and 3': 0.16666666666666666},
 {'Bags 1 and 2': 0.5,
  'Bags 1 and 3': 0.3333333333333333,
  'Bags 2 and 3': 0.5})

# Question 1 (c)

1. **Intersection ($|S \cap T|$)**: Each element of $U$ belongs to $S$ with probability $m/n$ and, independently, to $T$ with probability $m/n$, so it lies in both with probability $(m/n)^2$ and the expected size of the intersection is $E[|S \cap T|] = n \cdot (m/n)^2 = m^2/n$. More precisely, because each subset contains exactly $m$ elements, $|S \cap T|$ follows a hypergeometric (not binomial) distribution: $P(|S \cap T| = k) = \binom{m}{k}\binom{n-m}{m-k} / \binom{n}{m}$.

2. **Union ($|S \cup T|$)**: The union of $S$ and $T$ can have at most $2m$ elements (if $S$ and $T$ are disjoint, which requires $n \geq 2m$) and at least $m$ elements (if $S = T$). Its size is fixed by the overlap between $S$ and $T$: $|S \cup T| = 2m - |S \cap T|$.

3. **Expected Jaccard Similarity**: Combining the above, when $|S \cap T| = k$ the Jaccard similarity equals $k/(2m-k)$, so the expected Jaccard similarity is a function of $n$ and $m$ obtained by averaging this ratio over the probability distribution of the intersection size $k$.

The exact calculation sums over every possible intersection size $k$, weighting the similarity $k/(2m-k)$ by the probability that two random $m$-element subsets of an $n$-element universe share exactly $k$ elements:

$$ E[J(S, T)] = \sum_{k=\max(0,\,2m-n)}^{m} \frac{k}{2m-k} \cdot \frac{\binom{m}{k}\binom{n-m}{m-k}}{\binom{n}{m}} $$

For large $n$ and $m$, approximations or simulations may be used to estimate this expected value.



In [2]:
# Question 2 (a)
# Sentence to be shingled (split across lines only for readability; adjacent
# string literals are concatenated into one string).
sentence = (
    "The most effective way to represent documents as sets, for the purpose "
    "of identifying lexically similar documents is to construct from the "
    "document the set of short strings that appear within it."
)

# Function to generate k-shingles from text
def generate_k_shingles(text, k):
    """Return the word-level k-shingles of *text*, in order of appearance.

    The text is lower-cased and every character that is neither alphanumeric
    nor whitespace is dropped before it is split into words.

    Args:
        text: Input string.
        k: Number of consecutive words per shingle; must be at least 1.

    Returns:
        A list of strings, each made of k consecutive words joined by single
        spaces. Duplicates are kept. The list is empty when the text has
        fewer than k words.

    Raises:
        ValueError: If k is less than 1 (previously this silently produced
            empty or truncated shingles).
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Remove punctuation and make lowercase
    cleaned = ''.join(ch.lower() for ch in text if ch.isalnum() or ch.isspace())
    words = cleaned.split()
    # One shingle per starting position that still has k words available
    return [' '.join(words[start:start + k]) for start in range(len(words) - k + 1)]

# Take only the first ten word-level 3-shingles of the sentence
first_ten_3_shingles = generate_k_shingles(sentence, k=3)[:10]
first_ten_3_shingles


['the most effective',
 'most effective way',
 'effective way to',
 'way to represent',
 'to represent documents',
 'represent documents as',
 'documents as sets',
 'as sets for',
 'sets for the',
 'for the purpose']

In [3]:
# Question 2 (a)
# NOTE(review): this cell builds stop-word-based shingles, which looks like
# part (b) rather than part (a) -- confirm the label.
def tokenize(sentence):
    """Split *sentence* on runs of whitespace and return the list of tokens.

    Punctuation is not stripped, so a token such as "sets," keeps its comma.
    """
    tokens = sentence.split()
    return tokens

def identify_stop_words(words, max_length=3):
    """Return the words whose length is at most *max_length*.

    Order and duplicates are preserved. Length is measured on the token as
    given, so attached punctuation counts towards it.
    """
    short_words = []
    for candidate in words:
        if len(candidate) <= max_length:
            short_words.append(candidate)
    return short_words

def generate_shingles(words, stop_words):
    """Generate shingles starting with a stop word followed by the next two words.

    Args:
        words: Sequence of tokens, in document order.
        stop_words: Collection of hashable tokens treated as stop words.

    Returns:
        A list with one space-joined three-word shingle for every position
        whose word is a stop word and that has at least two words after it.
    """
    # A set gives constant-time membership tests; the original scanned the
    # stop-word list once per word.
    stop_set = set(stop_words)
    return [
        " ".join(words[i:i + 3])
        # Stopping at len(words) - 2 guarantees two following words exist.
        for i in range(len(words) - 2)
        if words[i] in stop_set
    ]

# The sentence to analyze (adjacent string literals are concatenated)
sentence = (
    "The most effective way to represent documents as sets, for the purpose "
    "of identifying lexically similar documents is to construct from the "
    "document the set of short strings that appear within it."
)

# Lower-case first so that matching is case-insensitive, then tokenize
words = tokenize(sentence.lower())

# Stop words: tokens of at most three characters (the function's default)
stop_words = identify_stop_words(words)

# One shingle per stop word that has two words after it
shingles = generate_shingles(words, stop_words)

# Show each shingle on its own line
for shingle in shingles:
    print(shingle)

the most effective
way to represent
to represent documents
as sets, for
for the purpose
the purpose of
of identifying lexically
is to construct
to construct from
the document the
the set of
set of short
of short strings


In [4]:
# Question 2 (b)
# Define a function to generate stop-word-based shingles
def generate_stop_word_based_shingles(text, stop_word_length=3):
    """Return three-word shingles that begin with a stop word.

    A stop word is any whitespace-separated token of at most
    *stop_word_length* characters. Case and punctuation are left untouched,
    so "The" and "it." both count as stop words with the default length.

    Args:
        text: Input string.
        stop_word_length: Maximum token length that counts as a stop word.

    Returns:
        A list of shingles in document order. Each one is a stop word plus
        the next two tokens (of any kind), joined by single spaces. Stop
        words with fewer than two tokens after them produce no shingle.
    """
    words = text.split()
    return [
        ' '.join(words[i:i + 3])
        # Only positions with two following tokens give a full shingle.
        for i in range(len(words) - 2)
        if len(words[i]) <= stop_word_length
    ]

# Sentence under analysis (original capitalisation and punctuation kept)
sentence = (
    "The most effective way to represent documents as sets, for the purpose "
    "of identifying lexically similar documents is to construct from the "
    "document the set of short strings that appear within it."
)
# Shingles that start at each stop word (default: three characters or fewer)
stop_word_based_shingles = generate_stop_word_based_shingles(text=sentence)
stop_word_based_shingles


['The most effective',
 'way to represent',
 'to represent documents',
 'as sets, for',
 'for the purpose',
 'the purpose of',
 'of identifying lexically',
 'is to construct',
 'to construct from',
 'the document the',
 'the set of',
 'set of short',
 'of short strings']

# Question 2 (c)
The largest number of k-shingles a document of n bytes can have is when every consecutive substring of length k is unique.
This is achieved when each new byte after the first k bytes forms a new k-shingle.
Therefore, the formula for the largest number of k-shingles is n - k + 1, assuming n >= k.

Since we are not solving for a specific value but providing a general formula, we display the formula directly.

In [5]:
# Question 3
import numpy as np
from itertools import permutations

# Characteristic matrix: one row per element (a-e), one column per set (S1-S4).
matrix = np.array(
    [
        # S1 S2 S3 S4
        [1, 0, 0, 1],  # a
        [0, 0, 1, 0],  # b
        [0, 1, 0, 1],  # c
        [1, 0, 1, 1],  # d
        [0, 0, 1, 0],  # e
    ]
)

# Function to compute Jaccard similarity between two sets represented as binary vectors
def _jaccard_similarity(set1, set2):
    intersection = np.sum(np.logical_and(set1, set2))
    union = np.sum(np.logical_or(set1, set2))
    return intersection / union

# Exact Jaccard similarity for every unordered pair of columns, keyed by the
# zero-based column indices (i, j) with i < j.
# Note: this rebinds the name `jaccard_similarities` used in Question 1.
jaccard_similarities = {
    (i, j): _jaccard_similarity(matrix[:, i], matrix[:, j])
    for i in range(matrix.shape[1])
    for j in range(i + 1, matrix.shape[1])
}

# Minhash of one column under one row permutation
def hash_value(column, perm):
    """Return the position of the first 1 in *column* after reordering by *perm*.

    Args:
        column: NumPy array; entries equal to 1 mark membership.
        perm: Sequence of row indices; position k of the reordered column
            holds column[perm[k]].

    Returns:
        The zero-based position of the first entry equal to 1 in the
        reordered column, or -1 if there is none.
    """
    reordered = column[list(perm)]
    first_one = next(
        (position for position in range(len(reordered)) if reordered[position] == 1),
        -1,
    )
    return first_one

# Compute the fraction of permutations that hash to the same value for each pair of columns
num_rows = matrix.shape[0]
all_permutations = list(permutations(range(num_rows)))

# Hash every column once per permutation up front; the pairwise comparison
# below then only compares stored values instead of re-hashing both columns
# for every pair.
column_hashes = [
    [hash_value(matrix[:, col], perm) for perm in all_permutations]
    for col in range(matrix.shape[1])
]

matching_permutations = {
    (i, j): sum(1 for a, b in zip(column_hashes[i], column_hashes[j]) if a == b) / len(all_permutations)
    for i in range(matrix.shape[1])
    for j in range(matrix.shape[1]) if i < j
}

# NOTE: the "120" in the message is literal text; it equals
# len(all_permutations) only for a five-row matrix.
print(f"Jaccard similarity: \n {jaccard_similarities}, \nThe fraction of the 120 permutations: \n {matching_permutations}")


Jaccard similarity: 
 {(0, 1): 0.0, (0, 2): 0.25, (0, 3): 0.6666666666666666, (1, 2): 0.0, (1, 3): 0.3333333333333333, (2, 3): 0.2}, 
The fraction of the 120 permutations: 
 {(0, 1): 0.0, (0, 2): 0.25, (0, 3): 0.6666666666666666, (1, 2): 0.0, (1, 3): 0.3333333333333333, (2, 3): 0.2}


In [6]:
# Question 4
# Characteristic matrix for this question: rows are elements 0-5, columns are
# the four sets. Note: this rebinds `matrix` from Question 3.
matrix = np.array(
    [
        [0, 1, 0, 1],  # row 0
        [0, 1, 0, 0],  # row 1
        [1, 0, 0, 1],  # row 2
        [0, 0, 1, 0],  # row 3
        [0, 0, 1, 1],  # row 4
        [1, 0, 0, 0],  # row 5
    ]
)

# The three hash functions; each maps a row index into 0..5
def h1(x):
    """Return (2x + 1) mod 6."""
    scaled = 2 * x + 1
    return scaled % 6

def h2(x):
    """Return (3x + 2) mod 6."""
    scaled = 3 * x + 2
    return scaled % 6

def h3(x):
    """Return (5x + 2) mod 6."""
    scaled = 5 * x + 2
    return scaled % 6

# Function to compute the minhash signature of a column
def minhash(column, hash_functions):
    """Return the minhash signature of *column* under *hash_functions*.

    Args:
        column: Indexable sequence; entries equal to 1 mark the rows that
            belong to the set.
        hash_functions: Iterable of callables mapping a row index to a hash
            value.

    Returns:
        A list with one entry per hash function: the smallest hash value
        over the rows where the column is 1. A column with no 1s gets
        float("inf") in every position instead of raising ValueError from
        min() on an empty sequence.
    """
    # Find the member rows once; they are the same for every hash function.
    rows_with_one = [row for row in range(len(column)) if column[row] == 1]
    return [
        min((hash_function(row) for row in rows_with_one), default=float("inf"))
        for hash_function in hash_functions
    ]

# Signature of every column, keyed "Column 1" .. "Column N"
hash_functions = [h1, h2, h3]
minhash_signatures = {
    # Iterating over the transpose yields the columns of `matrix` in order.
    f"Column {number}": minhash(column, hash_functions)
    for number, column in enumerate(matrix.T, start=1)
}

# Check which hash functions are true permutations of the rows
def is_permutation(hash_function, num_rows):
    """Return True if *hash_function* permutes the row indices 0..num_rows-1.

    Args:
        hash_function: Callable applied to each row index.
        num_rows: Number of rows.

    Returns:
        True when the hashed values are exactly {0, ..., num_rows - 1}.
        Distinct values alone are not enough: x -> x + 10 never collides
        but does not map the rows back onto the row indices.
    """
    hashed_values = [hash_function(i) for i in range(num_rows)]
    # num_rows values covering num_rows distinct targets are necessarily
    # all different, so this also implies there are no collisions.
    return set(hashed_values) == set(range(num_rows))

# For each hash function, record whether it is a true permutation of the rows
permutations_check = {
    name: is_permutation(hash_function, matrix.shape[0])
    for name, hash_function in (("h1", h1), ("h2", h2), ("h3", h3))
}

# Part (c) compares estimates with the exact values, so compute the true
# Jaccard similarity of every column pair (i, j) with i < j first.
# Relies on _jaccard_similarity from the Question 3 cell.
true_jaccard_similarities = {
    (i, j): _jaccard_similarity(matrix[:, i], matrix[:, j])
    for i in range(matrix.shape[1])
    for j in range(i + 1, matrix.shape[1])
}

# Similarity estimate from two minhash signatures
def estimated_jaccard(signature1, signature2):
    """Return the fraction of positions at which the two signatures agree.

    Positions are taken from signature1, so signature2 must be at least as
    long; the result is divided by len(signature1).
    """
    agreements = 0
    for position in range(len(signature1)):
        if signature1[position] == signature2[position]:
            agreements += 1
    return agreements / len(signature1)

# Calculate the estimated Jaccard similarities for all pairs of columns using the minhash signatures
# Compute each column's signature once; the original recomputed both
# signatures for every pair of columns.
column_signatures = [minhash(matrix[:, i], hash_functions) for i in range(matrix.shape[1])]
estimated_jaccard_similarities = {
    (i, j): estimated_jaccard(column_signatures[i], column_signatures[j])
    for i in range(matrix.shape[1])
    for j in range(matrix.shape[1]) if i < j
}
print(f"(a) minhash_signatures: {minhash_signatures} \n(b)permutations_check: {permutations_check} \n(c)true_jaccard_similarities: {true_jaccard_similarities}, \nestimated_jaccard_similarities: {estimated_jaccard_similarities}")



(a) minhash_signatures: {'Column 1': [5, 2, 0], 'Column 2': [1, 2, 1], 'Column 3': [1, 2, 4], 'Column 4': [1, 2, 0]} 
(b)permutations_check: {'h1': False, 'h2': False, 'h3': True} 
(c)true_jaccard_similarities: {(0, 1): 0.0, (0, 2): 0.0, (0, 3): 0.25, (1, 2): 0.0, (1, 3): 0.25, (2, 3): 0.25}, 
estimated_jaccard_similarities: {(0, 1): 0.3333333333333333, (0, 2): 0.3333333333333333, (0, 3): 0.6666666666666666, (1, 2): 0.6666666666666666, (1, 3): 0.6666666666666666, (2, 3): 0.6666666666666666}


# Question 5
Minhashing is a technique that converts sets to a smaller representation or "signature", preserving similarity. A Minhash value for a set corresponds to the index of the first non-zero element in its characteristic bit vector when the elements have been permuted randomly. 

Given a bit vector `x` of length `n` with `m` zeros, the min-hash value `h(x)` is determined by the position of the first '1' following a permutation of the vector. 

Since there are `m` zeros, the first '1' can be at the first position if all zeros are after it, which is the best-case scenario, yielding `h(x) = 1`. In the worst case, all `m` zeros precede the first '1', making it at the position `m + 1`. Therefore, the min-hash value of this vector, `h(x)`, will always satisfy `1 ≤ h(x) ≤ m + 1`. 

This property of min-hash arises because we are looking for the first '1' in the permuted vector. No matter how we permute the entries, every position before the first '1' must hold a '0', and only `m` zeros are available, so at most `m` positions can precede the first '1'. Thus, the first '1' must be within the first `m + 1` entries of the permuted bit vector (assuming the vector contains at least one '1', i.e. `m < n`).


In [7]:
# QUESTION 6
import numpy as np
from scipy.optimize import brentq

# S-curve function
def s_curve(s, r, b):
    """Return 1 - (1 - s**r)**b.

    In the usual banding scheme this is the probability that a pair with
    similarity s agrees on all r rows of at least one of b bands.
    """
    all_rows_match = s ** r
    no_band_matches = (1 - all_rows_match) ** b
    return 1 - no_band_matches

# Similarities at which the curve is evaluated: 0.1, 0.2, ..., 0.9
s_values = np.arange(0.1, 1.0, 0.1)

# (rows per band, number of bands) configurations to compare
rb_pairs = [(3, 10), (6, 20), (5, 50)]

# For every configuration, the S-curve value at each similarity
s_curve_results = {
    (r, b): [s_curve(s, r, b) for s in s_values]
    for r, b in rb_pairs
}

# Threshold of an (r, b) configuration
def find_threshold(r, b):
    """Return the similarity s in [0, 1] at which s_curve(s, r, b) equals 0.5.

    The value is found numerically with Brent's method on the interval
    [0, 1].
    """
    def distance_from_half(s):
        # Zero exactly where the S-curve crosses one half.
        return s_curve(s, r, b) - 0.5

    return brentq(distance_from_half, 0.0, 1.0)

# Threshold (similarity where the S-curve reaches 0.5) per configuration
thresholds = {(r, b): find_threshold(r, b) for r, b in rb_pairs}

print("S-curve results:", s_curve_results)
print("Thresholds:", thresholds)


S-curve results: {(3, 10): [0.009955119790251765, 0.07718058804273675, 0.23944889319887064, 0.4838707317677322, 0.7369244238361716, 0.9122674753991765, 0.985015105295655, 0.9992340538808936, 0.9999978635491371], (6, 20): [1.9999810001669616e-05, 0.001279222058761964, 0.014479466504172311, 0.07880932311056232, 0.27018714400947597, 0.6154146360312677, 0.9181859965846744, 0.9977121251546806, 0.9999997398129465], (5, 50): [0.0004998775195954597, 0.01587519984502117, 0.11453988231042189, 0.4022839522088044, 0.7955506304323648, 0.9825338277068608, 0.9998989958361557, 0.9999999976077777, 1.0]}
Thresholds: {(3, 10): 0.4060881340677082, (6, 20): 0.5693533868256984, (5, 50): 0.42439448036873306}


# Question 7
To be a distance measure, a function must satisfy the following properties of a metric:

1. **Non-negativity**: $JaccardDistance(A, B) \geq 0$ for any sets $A$ and $B$.
2. **Identity of indiscernibles**: $JaccardDistance(A, B) = 0$ if and only if $A = B$.
3. **Symmetry**: $JaccardDistance(A, B) = JaccardDistance(B, A)$ for any sets $A$ and $B$.
4. **Triangle inequality**: $JaccardDistance(A, C) \leq JaccardDistance(A, B) + JaccardDistance(B, C)$ for any sets $A$, $B$, and $C$.

The Jaccard distance is defined as $JaccardDistance(A, B) = 1 - JaccardSimilarity(A, B)$, where the Jaccard similarity, $JaccardSimilarity(A, B)$, is $\frac{|A \cap B|}{|A \cup B|}$. 

Now, let's show that Jaccard distance satisfies these four properties:

1. **Non-negativity**: The intersection of two sets will never have more elements than their union, so $|A \cap B| \leq |A \cup B|$. Hence, the Jaccard similarity is always between 0 and 1, and the Jaccard distance, being $1 - JaccardSimilarity$, is also between 0 and 1. Therefore, $JaccardDistance(A, B) \geq 0$.

2. **Identity of indiscernibles**: If $A = B$, then $|A \cap B| = |A| = |B|$ and $|A \cup B| = |A| = |B|$, so $JaccardSimilarity(A, B) = 1$ and thus $JaccardDistance(A, B) = 0$. Conversely, if $JaccardDistance(A, B) = 0$, this implies that $JaccardSimilarity(A, B) = 1$, which means $|A \cap B| = |A \cup B|$, and therefore $A = B$.

3. **Symmetry**: Since the intersection and union of sets are commutative operations, $|A \cap B| = |B \cap A|$ and $|A \cup B| = |B \cup A|$, it follows that $JaccardSimilarity(A, B) = JaccardSimilarity(B, A)$. Thus, $JaccardDistance(A, B) = JaccardDistance(B, A)$.

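4. **Triangle inequality**: This is the most complex property to prove. We need to show that for any sets $A$, $B$, and $C$, the Jaccard distance between $A$ and $C$ is less than or equal to the sum of the Jaccard distances between $A$ and $B$, and $B$ and $C$. The simplest argument uses minhashing: for a random permutation, the probability that the minhash values of two sets agree equals their Jaccard similarity, so $JaccardDistance(X, Y)$ is the probability that $h(X) \neq h(Y)$. Whenever $h(A) \neq h(C)$, at least one of $h(A) \neq h(B)$ or $h(B) \neq h(C)$ must hold (otherwise $h(A) = h(B) = h(C)$). By the union bound, $P[h(A) \neq h(C)] \leq P[h(A) \neq h(B)] + P[h(B) \neq h(C)]$, which is exactly $JaccardDistance(A, C) \leq JaccardDistance(A, B) + JaccardDistance(B, C)$.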

Hence, the Jaccard distance satisfies all four properties of a metric space and is, therefore, a distance measure.



# Question 8
To prove that for any positive integers $i$ and $j$ where $i < j$, the $L_i$ norm between any two points is greater than the $L_j$ norm between those same two points, let's consider two points $P$ and $Q$ in a space with $n$ dimensions. The coordinates of $P$ are given by $p_1, p_2, \ldots, p_n$, and the coordinates of $Q$ are given by $q_1, q_2, \ldots, q_n$.

The $L_i$ norm (or $L_i$ distance) between points $P$ and $Q$, denoted as $||P - Q||_i$, is defined as:

$$ ||P - Q||_i = \left(\sum_{k=1}^{n} |p_k - q_k|^i\right)^{1/i} $$

Similarly, the $L_j$ norm between $P$ and $Q$ is:

$$ ||P - Q||_j = \left(\sum_{k=1}^{n} |p_k - q_k|^j\right)^{1/j} $$

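Given that $i < j$, we aim to show that:

$$ ||P - Q||_i \geq ||P - Q||_j $$

with strict inequality whenever $P$ and $Q$ differ in at least two coordinates. (If they differ in at most one coordinate, both norms equal that single absolute difference, so "greater than" can only hold in this non-strict sense.)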

This statement can be proven by understanding the effect of raising the absolute differences $|p_k - q_k|$ to higher powers. When $i < j$, the power $j$ amplifies larger differences more significantly than the power $i$, making the overall $L_j$ distance smaller than the $L_i$ distance due to the "smoothing" effect of raising differences to a higher power.

More formally, this can be understood by considering the properties of $p$-norms:

1. **Non-negativity and Definiteness**: Both $||P - Q||_i$ and $||P - Q||_j$ are non-negative and are zero if and only if $P = Q$.

2. **Symmetry**: $||P - Q||_i = ||Q - P||_i$ and $||P - Q||_j = ||Q - P||_j$.

3. **Triangle Inequality**: For any three points $P$, $Q$, and $R$, the triangle inequality holds for both $L_i$ and $L_j$ norms.

4. **Homogeneity**: $||\lambda(P - Q)||_i = |\lambda|||P - Q||_i$ and $||\lambda(P - Q)||_j = |\lambda|||P - Q||_j$.

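To prove that $||P - Q||_i \geq ||P - Q||_j$, write $a_k = |p_k - q_k|$ and $S = \sum_k a_k^i$. If $S = 0$ then $P = Q$ and both norms are zero, so assume $S > 0$. Each ratio $a_k^i / S$ lies in $[0, 1]$, and because $j/i > 1$, raising a number in $[0, 1]$ to the power $j/i$ cannot increase it: $(a_k^i / S)^{j/i} \leq a_k^i / S$. Summing over $k$ gives $\sum_k a_k^j / S^{j/i} \leq 1$, that is:

$$ \left(\sum |p_k - q_k|^i\right)^{j/i} \geq \sum |p_k - q_k|^j $$

Taking the $j$-th root of both sides leads us to conclude that $||P - Q||_i \geq ||P - Q||_j$. The inequality is strict whenever at least two of the differences $|p_k - q_k|$ are non-zero, because each corresponding ratio then lies strictly between $0$ and $1$.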



In [8]:
# Question 9
# Base sensitivity parameters
# NOTE(review): p1_base and p2_base are never read below; only n1_base and
# n2_base feed the constructions -- confirm against the exercise statement.
p1_base, p2_base, n1_base, n2_base = 0.25, 0.75, 0.75, 0.25

# Step 1: (4, 3) OR-AND construction
# OR over 4 functions, then AND over 3 of those results
p1_OR, p2_OR = (1 - (1 - prob) ** 4 for prob in (n1_base, n2_base))
n1_AND, n2_AND = p1_OR ** 3, p2_OR ** 3

# Step 2: (3, 4) AND-OR construction applied to the results of step 1
# AND over 3 step-1 results, then OR over 4 of those
n1_AND_2, n2_AND_2 = n1_AND ** 3, n2_AND ** 3
p1_OR_2, p2_OR_2 = (1 - (1 - prob) ** 4 for prob in (n1_AND_2, n2_AND_2))

# Total number of base hash functions needed: each construction combines
# 4 * 3 = 12 functions, and the second is applied on top of the first.
total_base_hash_functions = (4 * 3) * (3 * 4)

(p1_OR_2, p2_OR_2, n1_AND_2, n2_AND_2, total_base_hash_functions)


(0.9999985648333911,
 0.12415142128492862,
 0.9653880888384516,
 0.03259738413767983,
 144)

In [9]:
# Question 10
import numpy as np

# The four fixed "random" direction vectors (one per row) used for sketching
random_vectors = np.array(
    [
        [1, 1, 1, -1],
        [1, 1, -1, 1],
        [1, -1, 1, 1],
        [-1, 1, 1, 1],
    ]
)

# The vectors to sketch, keyed by name
vectors = {
    "a": np.array([2, 3, 4, 5]),
    "b": np.array([-2, 3, -4, 5]),
    "c": np.array([2, -3, 4, -5]),
}

# Sketch of one vector
def compute_sketch(vector, random_vectors):
    """Return the sketch of *vector* with respect to *random_vectors*.

    The sketch has one entry per random vector: 1 when the dot product with
    that vector is non-negative (zero counts as 1), otherwise 0.
    """
    sketch = []
    for rv in random_vectors:
        if np.dot(vector, rv) >= 0:
            sketch.append(1)
        else:
            sketch.append(0)
    return sketch

# Sketch of every named vector against the same set of random vectors
sketches = {
    name: compute_sketch(vector, random_vectors)
    for name, vector in vectors.items()
}

# Estimate angles between pairs of vectors based on sketches
def estimate_angle(sketch1, sketch2):
    """Estimate the angle, in degrees, between two vectors from their sketches.

    For a random hyperplane, two vectors at angle theta fall on opposite
    sides with probability theta / 180. The fraction of sketch positions
    that disagree therefore estimates theta / 180, and the estimate is
    180 times that fraction.

    The previous arccos(1 - 2 * fraction) formula gives the same answer only
    for fractions 0, 0.5 and 1; a fraction of 0.25, for example, produced
    60 degrees instead of 45.

    Args:
        sketch1: Sequence of sketch entries.
        sketch2: Sequence of sketch entries, the same length as sketch1.

    Returns:
        The estimated angle in degrees, between 0.0 and 180.0.

    Raises:
        ValueError: If the sketches are empty or differ in length.
    """
    if len(sketch1) == 0 or len(sketch1) != len(sketch2):
        raise ValueError("sketches must be non-empty and of equal length")
    hamming_distance = sum(x != y for x, y in zip(sketch1, sketch2))
    fraction_of_disagreement = hamming_distance / len(sketch1)
    return 180.0 * fraction_of_disagreement

# True angles between pairs of vectors
def true_angle(vec1, vec2):
    """Return the angle, in degrees, between two non-zero vectors.

    Args:
        vec1: 1-D array-like of numbers.
        vec2: 1-D array-like of numbers, the same length as vec1.

    Returns:
        The angle in degrees, in [0, 180]. A zero-length vector still gives
        NaN, because its cosine similarity is undefined.
    """
    cosine_similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    # Rounding can push the cosine of (anti-)parallel vectors just outside
    # [-1, 1], where arccos returns NaN; clip it back into the valid domain.
    cosine_similarity = np.clip(cosine_similarity, -1.0, 1.0)
    return np.degrees(np.arccos(cosine_similarity))

# For each pair of vectors, record the sketch-based estimate next to the true
# angle. A key such as "ab" names the pair (a, b).
pair_keys = ["ab", "ac", "bc"]
angles = {}
for pair in pair_keys:
    angles[pair] = {
        "estimated": estimate_angle(sketches[pair[0]], sketches[pair[1]]),
        "true": true_angle(vectors[pair[0]], vectors[pair[1]]),
    }

sketches, angles


({'a': [1, 1, 1, 1], 'b': [0, 1, 0, 1], 'c': [1, 0, 1, 0]},
 {'ab': {'estimated': 90.0, 'true': 74.97388624088055},
  'ac': {'estimated': 90.0, 'true': 105.02611375911947},
  'bc': {'estimated': 180.0, 'true': 180.0}})

# Question 11
Given the task of performing entity resolution among bibliographic references, we encounter a scenario where:
- All references include a year of publication, equally likely to be any of the ten most recent years.
- Pairs of references with a perfect score have an average difference in publication year of 0.1.
- Pairs of references with a certain score \(s\) have an average difference in publication dates of 2.

We aim to calculate the fraction of pairs with score \(s\) that truly represent the same publication.

### Step 1: Expected Average Difference in Publication Year Among Random Pairs

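Given a uniform distribution of publication years, two independently chosen years differ by $d$ (for $1 \leq d \leq 9$) in $2(10-d)$ of the $100$ equally likely ordered pairs of years, and coincide in the remaining $10$. The expected difference $E_{\text{random}}$ is therefore:

$$
E_{\text{random}} = \sum_{d=1}^{9} d \cdot \frac{2(10-d)}{100} = \frac{330}{100} = 3.3
$$

Pairs with equal years contribute a difference of $0$ but still count as pairs, so the average is taken over all $100$ pairs. The formula also reflects the fact that small differences are more common than large ones.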

### Step 2: Fraction of Pairs with Score \(s\) Representing the Same Publication

Let $E_{\text{perfect}} = 0.1$ represent the average year difference for pairs with a perfect score, and $E_s = 2$ the average for pairs with score $s$. We seek the fraction $f$ of score-$s$ pairs that are the same publication, balancing the equation:

$$
f \cdot E_{\text{perfect}} + (1 - f) \cdot E_{\text{random}} = E_s
$$

Solving for \(f\) gives:

$$
f = \frac{E_s - E_{\text{random}}}{E_{\text{perfect}} - E_{\text{random}}}
$$

### Calculations

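- First, calculate $E_{\text{random}}$: averaging $|y_1 - y_2|$ over all $100$ equally likely pairs of years gives $E_{\text{random}} = \sum_{d=1}^{9} d \cdot 2(10-d)/100 = 3.3$.
- Then, solve for $f$ using $E_{\text{perfect}} = 0.1$, $E_s = 2$, and $E_{\text{random}} = 3.3$: $f = (2 - 3.3)/(0.1 - 3.3) = 1.3/3.2 \approx 0.41$, so roughly $41\%$ of the pairs with score $s$ represent the same publication.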

This methodology allows us to estimate the fraction of bibliographic reference pairs with score \(s\) that likely represent identical publications based on the given data and statistical reasoning.


In [None]:
# Question 12
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse an existing one
spark = (
    SparkSession.builder
    .appName("Distance Measures")
    .getOrCreate()
)

# SparkContext of that session
sc = spark.sparkContext
