# NAME-KARTIKI ASHOK HINGE
# ROLL  NO-03

# Simhash Algorithm

# In [5]:
class SimHash:
    """SimHash fingerprint of a sequence of string tokens.

    Every token is hashed with MD5. For each of the low ``hashbits`` bit
    positions a counter is incremented when the token's hash has that bit
    set and decremented when it does not. Bit ``i`` of the fingerprint is 1
    when the counter for position ``i`` ends up strictly positive.

    Attributes:
        hashbits: Number of bit positions considered for the fingerprint.
        hash: The fingerprint, a non-negative integer below ``2 ** hashbits``.
    """

    def __init__(self, tokens, hashbits=8):
        """Compute and store the fingerprint of ``tokens``.

        Args:
            tokens: Iterable of strings to fingerprint.
            hashbits: Width of the fingerprint in bits (default 8).
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def _string_hash(self, v):
        """Return the MD5 digest of the string ``v`` as a 128-bit integer."""
        # Imported locally so the class does not depend on the surrounding
        # file having imported hashlib at module level.
        import hashlib

        digest = hashlib.md5(v.encode('utf-8')).digest()
        # Big-endian interpretation equals int(hexdigest, 16).
        return int.from_bytes(digest, 'big')

    def simhash(self, tokens):
        """Return the ``hashbits``-wide SimHash fingerprint of ``tokens``.

        Args:
            tokens: Iterable of strings.

        Returns:
            An int whose bit ``i`` is set when more tokens had bit ``i`` of
            their MD5 hash set than cleared. Ties and an empty ``tokens``
            leave the bit at 0.

        Note:
            MD5 provides only 128 bits, so bit positions at or above 128
            are never set regardless of ``hashbits``.
        """
        weights = [0] * self.hashbits  # one signed vote counter per bit

        for token in tokens:
            token_hash = self._string_hash(token)
            for i in range(self.hashbits):
                # +1 vote when bit i of the token hash is 1, -1 when it is 0
                weights[i] += 1 if (token_hash >> i) & 1 else -1

        # A bit is set only when its counter is strictly positive.
        fingerprint = 0
        for i, weight in enumerate(weights):
            if weight > 0:
                fingerprint |= 1 << i

        return fingerprint

    def hamming_distance(self, other):
        """Return the number of bit positions where the two hashes differ.

        Args:
            other: Another object exposing an integer ``hash`` attribute,
                normally a ``SimHash`` built with the same ``hashbits``.
        """
        # XOR leaves a 1 exactly where the fingerprints disagree.
        return bin(self.hash ^ other.hash).count('1')

# Turn a raw document string into a list of tokens
def tokenize_document(doc):
    """Return the lowercase, whitespace-separated words of ``doc``."""
    lowered = doc.lower()
    # split() with no argument breaks on any run of whitespace and
    # never yields empty strings
    return lowered.split()

# Demo: fingerprint two short descriptions of IoT and compare them
doc1 = """ Internet of Things (IoT) refers to a network of things. These
 things could be any object that we see in our daily life. These
 objects are not limited to electronic devices or some high end
 technology products but could include objects that we wont
 normally think of them as electronic like dustbins, chairs,
 clothes etc."""
doc2 = """IoT stands for Internet of Things, which is a network of physical objects that are connected to the internet and can share data with other devices and systems. These objects can be anything from household items to industrial tools."""

# Lowercased, whitespace-separated tokens of each document
tokens1, tokens2 = (tokenize_document(text) for text in (doc1, doc2))

# One fingerprint per document, using the default fingerprint width
hash1, hash2 = (SimHash(words) for words in (tokens1, tokens2))

# Show both fingerprints in binary, then how many bits they differ in
print(
    f"SimHash 1: {bin(hash1.hash)}",
    f"SimHash 2: {bin(hash2.hash)}",
    f"Hamming Distance: {hash1.hamming_distance(hash2)}",
    sep="\n",
)

# Output:
# SimHash 1: 0b1001011
# SimHash 2: 0b11111000
# Hamming Distance: 5
