### Importing Libraries

In [1]:
import math
import re
from collections import Counter


### Regular Expression
 - `\w` matches any alphanumeric character or the underscore
 - `+` makes the pattern match one or more repetitions of the preceding expression, so `\w+` matches a whole word

In [2]:
# Pre-compiled pattern for one token: a maximal run of word characters
# (letters, digits, underscore). Punctuation and whitespace act as separators.
WORD = re.compile(r'\w+')

### Text to Vector Conversion

In [3]:
def text2vec(text):
    """Convert a string into a bag-of-words term-count vector.

    Tokens are the non-overlapping matches of the module-level ``WORD``
    pattern. No case folding is applied here, so tokens that differ only
    in capitalisation are counted separately.

    Args:
        text: The string to tokenise.

    Returns:
        A ``Counter`` mapping each distinct token to its number of occurrences.
    """
    return Counter(WORD.findall(text))

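### Cosine Distance Calculation

Note: `cosDist` returns the cosine *similarity* of the two count vectors (1 when both texts have identical word proportions, 0 when they share no words); the corresponding cosine distance is `1 - cosDist(...)`.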

In [4]:
def cosDist(vector1, vector2):
    """Return the cosine of the angle between two sparse term-weight vectors.

    Despite the name, the value computed is the cosine *similarity*: the dot
    product of the two vectors divided by the product of their Euclidean
    norms. It is not ``1 - similarity``.

    Args:
        vector1: Mapping of term -> numeric weight (e.g. a ``Counter``).
        vector2: Mapping of term -> numeric weight (e.g. a ``Counter``).

    Returns:
        The similarity as a float, or ``0.0`` when the product of the norms
        is zero (for instance when either mapping is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vector1) & set(vector2)
    dot_product = sum(vector1[term] * vector2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vector1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vector2.values()))
    norm_product = norm1 * norm2

    # Guard against division by zero for zero-length vectors.
    if norm_product:
        return float(dot_product) / norm_product
    return 0.0

### Defining a function to read files

In [5]:
def readFile(filename):
    """Return the entire contents of a text file from the ``../data/`` directory.

    Args:
        filename: Name of the file, appended to the relative path ``../data/``
            (resolved against the current working directory).

    Returns:
        The file's contents as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector to clean up.
    with open("../data/" + filename, 'r') as handle:
        return handle.read()

In [6]:
# Load the two sample documents (in this order) from the data directory.
text1, text2 = map(readFile, ("nowIsTheTime.txt", "quickBrownFox.txt"))

# Echo both documents so the reader can see what is being compared.
print("Text1 is: ", text1)
print("Text2 is: ", text2)

Text1 is:  Now is the time for all good men to come to the aid of the party
Text2 is:  The quick brown fox jumps over the lazy dog


In [7]:
# Turn each document into a word-count vector.
vec1, vec2 = text2vec(text1), text2vec(text2)

# Score the pair. The printed value is what cosDist returns, rounded to
# three decimal places.
cosine = cosDist(vec1, vec2)
print("Cosine Distance:\t", round(cosine, 3))

Cosine Distance:	 0.204
