## Euclidean Distance Vs. Cosine Similarity

In [1]:
import numpy as np
import pandas as pd

### Programming the distance/similarity metrics

In [2]:
#Euclidean Distance

def euclidean_distance(x, y):   
    return np.sqrt(np.sum((x - y) ** 2))

In [3]:
#Cosine Similarity

def cosine_similarity(x, y):
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

### Calculating Similarities

In [4]:
#Example array

x1 = np.array([1,1])
x2 = np.array([3,2])

In [5]:
#Euclidean distance between x1 and x2

euclidean_distance(x1, x2)

2.23606797749979

In [6]:
#Cosine Similarity between x1 and x2

cosine_similarity(x1, x2)

0.98058067569092

### An important example

In [7]:
x3 = x2*2

In [8]:
#Euclidean distance between x1 and x2

euclidean_distance(x1, x3)

5.830951894845301

In [9]:
#Cosine Similarity between x1 and x2

cosine_similarity(x1, x3)

0.98058067569092

### Effect of Normalization

In [14]:
#Normalize functions

def l1_normalize(v):
    norm = np.sum(np.abs(v))
    return v / norm

def l2_normalize(v):
    norm = np.sqrt(np.sum(np.square(v)))
    return v / norm

In [15]:
#Normalizing the vectors

x11 = l2_normalize(x1)
x21 = l2_normalize(x2)
x31 = l2_normalize(x3)

In [16]:
euclidean_distance(x31, np.array([0,0]))

1.0

In [17]:
#Euclidean distance between normalized x1 and x2

euclidean_distance(x11, x21)

0.19707523593328433

In [18]:
#Euclidean distance between normalized x1 and x3

euclidean_distance(x11, x31)

0.19707523593328433

In [19]:
print(x31, x21)

[0.83205029 0.5547002 ] [0.83205029 0.5547002 ]


In [20]:
#Cosine similarity between normalized x1 and x2

cosine_similarity(x11, x31)

0.9805806756909202

### Collecting some data from Wikipedia

In [21]:
#The required library
#!pip install wikipedia
import wikipedia

In [23]:
### Data of Pro Boxers
result = wikipedia.search("Pro Boxers", results = 20)
result


['List of deaths due to injuries sustained in boxing',
 'Boxer Rebellion',
 'Boxing',
 'Freddie Roach (boxing)',
 'Weight class (boxing)',
 'Vijender Singh',
 'New England Golden Gloves',
 'Knockout Kings (video game)',
 'Terry Norris',
 'Jeff Mayweather',
 'Pernell Whitaker',
 'Thammudu (film)',
 'Amateur boxing',
 'Jerry Quarry',
 'Nico Hernandez',
 'Heavyweight',
 'Katsu!',
 'Charles Williams (boxer)',
 'Leonard Gardner',
 'Leon Spinks']

In [None]:
cricket1 = wikipedia.page('Ian Bishop (cricketer)')
cricket2 = wikipedia.page('Cricket')
cricket3 = wikipedia.page('The cricketers')
cricket4 = wikipedia.page('Wisden Cricketers of the Year')
cricket5 = wikipedia.page('T. Natarajan')
footballer1 = wikipedia.page('Michael Stewart (footballer)')
footballer2 = wikipedia.page('Lee Johnson (footballer)')
footballer3 = wikipedia.page('College football')
footballer4 = wikipedia.page('Maryland Terrapins football')
footballer5 = wikipedia.page('Ronaldo')
physicist1 = wikipedia.page('Sean M. Carroll')
physicist2 = wikipedia.page('Alexander Sergeev (physicist)')
physicist3 = wikipedia.page('Theoretical physics')
physicist4 = wikipedia.page('J. J. Thomson')
physicist5 = wikipedia.page('Arindam Ghosh (physicist)')
statistician1 = wikipedia.page('Edward Jones (statistician)')
statistician2 = wikipedia.page('Robert Gentleman (statistician)')
statistician3 = wikipedia.page('Wayne Smith (statistician)')
statistician4 = wikipedia.page('Sherman–Morrison formula')
statistician5
statistician5 = wikipedia.page('National Statistician')


In [24]:
Proboxer1 = wikipedia.page('Leonard Gardner (boxer)')
Proboxer2 = wikipedia.page('Leon Spinks(boxer)')
Proboxer3 = wikipedia.page('Vijender Singh(boxer)')
Proboxer4 = wikipedia.page('Pernell Whitaker(boxer)')
Proboxer5 = wikipedia.page('Jerry Quarry(boxer)')

In [25]:
ProMMA1 = wikipedia.page('Al Iaquinta(Pro MMA)')
ProMMA2 = wikipedia.page('Bashir Ahmad(Pro MMA)')
ProMMA3 = wikipedia.page('Felicia Spencer(Pro MMA)')
ProMMA4 = wikipedia.page('Alexander Emelianenko(Pro MMA)')
ProMMA5 = wikipedia.page('Ronda Rousey(Pro MMA)')

In [26]:
### Data of Pro MMA
result = wikipedia.search("Pro MMA", results = 20) 
result

['List of professional MMA training camps',
 'Justin Wren',
 'Kevin Ferguson Jr.',
 'Mixed martial arts',
 'Alexander Emelianenko',
 'Fatalities in mixed martial arts contests',
 "Women's mixed martial arts",
 'Gerald Meerschaert',
 'Ronda Rousey',
 'Amanda Lucas (fighter)',
 'Fallon Fox',
 'Sergey Spivak',
 'Jason David Frank',
 'Felicia Spencer',
 'Alain Ngalani',
 'Sara McMann',
 'Greg Hardy',
 'Al Iaquinta',
 'Phil Baroni',
 'Bashir Ahmad (mixed martial artist)']

In [27]:
### Data of physicist
result = wikipedia.search("physicist", results = 20) 
result

['Physicist',
 'The Physicists',
 'Brian Cox (physicist)',
 'Theoretical physics',
 'Physicist (album)',
 'Albert Einstein',
 'Physicist and Christian',
 'List of physicists',
 'Nuclear physics',
 'Medical physicist',
 'Chartered Physicist',
 'Experimental physics',
 'Particle physics',
 'Arindam Ghosh (physicist)',
 'J. J. Thomson',
 'Sean M. Carroll',
 'Quantum mechanics',
 'William Thomson, 1st Baron Kelvin',
 'John Archibald Wheeler',
 'Scientist']

In [28]:
Statistician1 = wikipedia.page('Wayne Smith(Statistician)')
Statistician2 = wikipedia.page('James Beckett(Statistician)')
Statistician3 = wikipedia.page('Adrian Smith(Statistician)')
Statistician4 = wikipedia.page('Edward Jones(Statistician)')
Statistician5 = wikipedia.page('Ian Diamond(Statistician)')

In [29]:
### Data of Statistician
result = wikipedia.search("Statistician", results = 20) 
result


['Statistician',
 'James Beckett (statistician)',
 'Chartered Statistician',
 'Edward Jones (statistician)',
 'The American Statistician',
 'National Statistical Commission',
 'Chief Statistician of Canada',
 'National Statistician',
 'Wayne Smith (statistician)',
 'Law of the unconscious statistician',
 'Australian Bureau of Statistics',
 'Robert Gentleman (statistician)',
 'Government Statistical Service',
 'Mathematical statistics',
 'Adrian Smith (statistician)',
 'Ian Diamond',
 'David Cox (statistician)',
 'Sherman–Morrison formula',
 'First-class cricket',
 'Office for National Statistics']

In [30]:
### Data of Business Leaders
result = wikipedia.search("Business Leaders", results = 20) 
result

['Business acumen',
 'Leader of Government Business',
 'Asia-Pacific Economic Cooperation',
 'Business Leaders for Michigan',
 'Arab business leaders',
 'FBLA-PBL',
 'Thought leader',
 'Business Leaders for Sensible Priorities',
 'Business partnering',
 'Party leaders of the United States Senate',
 'Licious',
 'Business intelligence',
 'International Business Leaders Forum',
 'Business oligarch',
 'Douglas Pitt',
 'Queensland Business Leaders Hall of Fame',
 'World Economic Forum',
 'The Business Council',
 'Marc Benioff',
 'Spectacular Smith']

In [31]:
BusinessLeaders1 = wikipedia.page('Douglas Pitt(Business Leaders)')
BusinessLeaders2 = wikipedia.page('Marc Benioff(Business Leaders)')
BusinessLeaders3 = wikipedia.page('Spectacular Smith(Business Leaders)')
BusinessLeaders4 = wikipedia.page('FBLA-PBL(Business Leaders)')
BusinessLeaders5 = wikipedia.page('Business intelligence(Business Leaders)')

In [32]:
#Getting the wikipedia pages on machine learning, artificial intelligence, soccer and tennis

Box = wikipedia.page("Boxers")
BL = wikipedia.page("Business Leaders")
MMA = wikipedia.page("Pro MMA")
Phy = wikipedia.page("physicist")
Stat = wikipedia.page("Statistician")

In [33]:
#Printing the content of the pages
Box.content[:205]

'Boxing is a combat sport in which two people, usually wearing protective gloves, throw punches at each other for a predetermined amount of time in a boxing ring.\nAmateur boxing is both an Olympic and Commo'

In [34]:
BL.content[:205]

'Business acumen ("Business savvy" and "business sense" are often used as synonyms) is keenness and quickness in understanding and dealing with a "business situation" (risks and opportunities) in a manner t'

In [38]:
MMA.content[:205]

'This is a list of notable present professional training camps and gyms in Mixed Martial Arts (MMA).Most professional MMA fighters in the UFC, Bellator and other MMA promotions join a professional fight cam'

In [39]:
Phy.content[:205]

"Physics (from Ancient Greek: φυσική (ἐπιστήμη), romanized: physikḗ (epistḗmē), lit. 'knowledge of nature', from φύσις phýsis 'nature') is the natural science that studies matter, its motion and behavior th"

In [40]:
Stat.content[:205]

'A statistician is a person who works with theoretical or applied statistics. The profession exists in both the private and public sectors. It is common to combine statistical knowledge with expertise in ot'

In [42]:
#Number of words present in each articles

print("Box \t", len(Box.content.split()), "\n"
      "BL\t", len(BL.content.split()), "\n"
      "MMA \t", len(MMA.content.split()), "\n"
      "Phy \t", len(Phy.content.split()))

Box 	 10595 
BL	 1111 
MMA 	 252 
Phy 	 5675


      W1 W2 W3 W4 W5

ML = (2, 5, 0, 0, 10)
AI = (8, 22, 0, 0, 38)

In [43]:
Box = np.array([2, 5, 0, 0, 10])
MMA = np.array([8, 22, 0, 0, 38])

In [44]:
Phy = np.array([2, 5, 0, 0, 10])
Stat = np.array([8, 22, 0, 0, 38])

In [45]:
euclidean_distance(Box, MMA)

33.301651610693426

In [46]:
euclidean_distance(Phy,Stat)

33.301651610693426

In [47]:
cosine_similarity(Phy, Stat)

0.9981848973292723

In [48]:
# Creating count vectors
# Use count vectorizer to create a vector of word count

from sklearn.feature_extraction.text import CountVectorizer

In [49]:
cv = CountVectorizer()
X = np.array(cv.fit_transform([Box.content,BL.content,Stat.content, MMA.content,Phy.content]).toarray())

AttributeError: 'numpy.ndarray' object has no attribute 'content'

### Creating count vectors

In [92]:
#Use count vectorizer to create a vector of word count

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = np.array(cv.fit_transform([Box.content, MMA.content, Stat.content, Phy.content,BL.content]).toarray())

AttributeError: 'numpy.ndarray' object has no attribute 'content'

In [89]:
X

array(<bound method CountVectorizer.fit_transform of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)>, dtype=object)

### Calculating the distance among the articles

In [50]:
#Euclidien distances between ML and Rest

print("Euclidean distance between ML-AI: \t", euclidean_distance(X[0],X[1]))
print("Euclidean distance between ML-Soccer: \t", euclidean_distance(X[0],X[2]))
print("Euclidean distance between ML-Tennis: \t", euclidean_distance(X[0],X[3]))


NameError: name 'X' is not defined

In [44]:
print("Euclidean distance between AI-Tennis: \t", euclidean_distance(X[1],X[3]))

Euclidean distance between AI-Tennis: 	 644.1513797237417


In [48]:
#Euclidien distances between ML and Rest

print("Euclidean distance between AI-ML: \t", euclidean_distance(X[0],X[1]))
print("Euclidean distance between AI-Soccer: \t", euclidean_distance(X[1],X[2]))
print("Euclidean distance between AI-Tennis: \t", euclidean_distance(X[1],X[3]))


Euclidean distance between AI-ML: 	 456.7307740890688
Euclidean distance between AI-Soccer: 	 502.6927491022722
Euclidean distance between AI-Tennis: 	 644.1513797237417


In [45]:
#Cosine similarity between ML and Rest

print("Cosine Similarity")
print("ML - AI \t", cosine_similarity(X[0],X[1]), "\n",
     "ML - Soccer \t", cosine_similarity(X[0],X[2]), "\n",
     "ML - Tennis \t", cosine_similarity(X[0],X[3]), "\n")

Cosine Similarity
ML - AI 	 0.8830074014780043 
 ML - Soccer 	 0.7926233557280901 
 ML - Tennis 	 0.8073111478120881 



### Categorizing a Tweet

In [46]:
#Here's an ML tweet

ml_tweet = "New research release: overcoming many of Reinforcement Learning's limitations with Evolution Strategies."
x = np.array(cv.transform([ml_tweet]).toarray())[0]

In [47]:
#Euclindean Distances
print("Euclidean distance")

print("tweet - ML \t", euclidean_distance(x[0], X[0]), "\n"
      "tweet - AI \t", euclidean_distance(x[0], X[1]), "\n"
      "tweet - soccer \t", euclidean_distance(x[0], X[2]), "\n"
      "tweet - tennis \t", euclidean_distance(x[0], X[3]))

Euclidean distance
tweet - ML 	 611.3885834720828 
tweet - AI 	 895.1865727321875 
tweet - soccer 	 758.5004943966748 
tweet - tennis 	 1201.5073865773777


In [None]:
#Cosine Similarities

print("Cosine Similarity")

print("tweet - ML \t", cosine_similarity(x, X[0]), "\n"
      "tweet - AI \t", cosine_similarity(x, X[1]), "\n"
      "tweet - soccer \t", cosine_similarity(x, X[2]), "\n"
      "tweet - tennis \t", cosine_similarity(x, X[3]))

In [None]:
#A soccer tweet, by Manchester United

so_tweet = "#LegendsDownUnder The Reds are out for the warm up at the @nibStadium. Not long now until kick-off in Perth."
x2 = np.array(cv.transform([so_tweet]).toarray())[0]

In [None]:
#Euclidean Distance

print("tweet - ML \t", euclidean_distance(x2, X[0]), "\n"
      "tweet - AI \t", euclidean_distance(x2, X[1]), "\n"
      "tweet - soccer \t", euclidean_distance(x2, X[2]), "\n"
      "tweet - tennis \t", euclidean_distance(x2, X[3]))

In [None]:
#Cosine Similarity

print("tweet - ML \t", cosine_similarity(x2, X[0]), "\n"
      "tweet - AI \t", cosine_similarity(x2, X[1]), "\n"
      "tweet - soccer \t", cosine_similarity(x2, X[2]), "\n"
      "tweet - tennis \t", cosine_similarity(x2, X[3]))

*Reference:* https://cmry.github.io/notes/euclidean-v-cosine