<a href="https://colab.research.google.com/github/MapariPrajwal/DataScience/blob/main/Similarity_Measures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Euclidean Distance

In [1]:
import numpy as np

def euclidean_distance(x, y):
    return np.sqrt(np.sum((np.array(x) - np.array(y))**2))

# Example usage:
x = [1, 2, 3]
y = [4, 5, 6]
print("Euclidean Distance:", euclidean_distance(x, y))

Euclidean Distance: 5.196152422706632


### Mahhattan Distance

In [2]:
def manhattan_distance(x, y):
    return np.sum(np.abs(np.array(x) - np.array(y)))

# Example usage:
print("Manhattan Distance:", manhattan_distance(x, y))

Manhattan Distance: 9


### Minkowski Distance

In [3]:
def minkowski_distance(x, y, p):
    return np.power(np.sum(np.abs(np.array(x) - np.array(y))**p), 1/p)

# Example usage:
p_value = 3
print("Minkowski Distance:", minkowski_distance(x, y, p_value))

Minkowski Distance: 4.3267487109222245


### Supremum Distance

In [4]:
def supremum_distance(x, y):
    return np.max(np.abs(np.array(x) - np.array(y)))

# Example usage:
print("Supremum Distance:", supremum_distance(x, y))

Supremum Distance: 3


### **Coefficients**

### Simple Matching Coefficient

In [5]:
def simple_matching_coefficient(a, b):
    common_elements = sum(ai == bi for ai, bi in zip(a, b))
    return common_elements / len(a)

# Example usage:
binary_a = [0, 1, 1, 0]
binary_b = [1, 1, 1, 0]
print("Simple Matching Coefficient:", simple_matching_coefficient(binary_a, binary_b))

Simple Matching Coefficient: 0.75


### Jaccard Coefficient

In [6]:
def jaccard_coefficient(a, b):
    intersection = sum(ai and bi for ai, bi in zip(a, b))
    union = sum(ai or bi for ai, bi in zip(a, b))
    return intersection / union

# Example usage:
print("Jaccard Coefficient:", jaccard_coefficient(binary_a, binary_b))


Jaccard Coefficient: 0.6666666666666666


### **Similarity by Distances**


### Cosine Similarity

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_text(doc1, doc2):
    vectorizer = CountVectorizer().fit_transform([doc1, doc2])
    vectors = vectorizer.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0, 0]

# Example usage:
text1 = "This is a sample text."
text2 = "Sample text for cosine similarity."
print("Cosine Similarity:", cosine_similarity_text(text1, text2))

Cosine Similarity: 0.4472135954999579


### Jaro Distance

In [10]:
!pip install jellyfish

Collecting jellyfish
  Downloading jellyfish-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jellyfish
Successfully installed jellyfish-1.0.3


In [13]:
from jellyfish import jaro_distance
def calculate_jaro_distance(str1, str2):
    return jaro_distance(str1, str2)

# Example usage:
string1="Hello"
string2="Prajwal"
print("Jaro Distance:", calculate_jaro_distance(string1, string2))

ImportError: cannot import name 'jaro_distance' from 'jellyfish' (/usr/local/lib/python3.10/dist-packages/jellyfish/__init__.py)

### N Gram Distance

In [17]:
from nltk.util import ngrams

def ngram_distance(str1, str2, n):
    ngrams_str1 = set(ngrams(str1, n))
    ngrams_str2 = set(ngrams(str2, n))
    intersection = len(ngrams_str1.intersection(ngrams_str2))
    union = len(ngrams_str1.union(ngrams_str2))
    return 1 - intersection / union  # Convert to distance

# Example usage:
n_value = 2
string1="Hey"
string2="Sahil"
print("N-gram Distance:", ngram_distance(string1, string2, n_value))

N-gram Distance: 1.0
