# Determining Similarity Between Text Files Using Cosine Similarity

![Steps](texts-similarity-steps.png)

In [None]:
import numpy as np
from collections import Counter
import string

In [None]:
# Basic English stopwords, grouped by word class for readability.
STOPWORDS = {
    # articles
    "a", "an", "the",
    # forms of "to be" / "to have"
    "is", "are", "was", "were", "have", "has",
    # prepositions
    "in", "on", "at", "by", "with", "for", "of", "to",
    # conjunctions
    "and", "or",
    # pronouns and demonstratives
    "this", "that", "it", "he", "she", "they", "we", "you",
}

### Implement each step

In [None]:
def read_file(filename: str, encoding: str = "utf-8") -> str:
    """Return the contents of *filename* as lower-cased text.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The whole file content, converted to lower case.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    with open(filename, encoding=encoding) as file:
        return file.read().lower()

In [None]:
def remove_punctuation(text: str) -> str:
    """Return *text* with every ASCII punctuation character deleted.

    The characters removed are exactly those listed in ``string.punctuation``;
    all other characters are kept in their original order.
    """
    kept_chars = [char for char in text if char not in string.punctuation]
    return "".join(kept_chars)

In [None]:
# Tokenize and remove stopwords
def tokenize(text: str) -> list[str]:
    """Split *text* into whitespace-separated words, minus stopwords.

    Punctuation is stripped with ``remove_punctuation`` before splitting, and
    any word found in ``STOPWORDS`` is discarded. Word order and duplicates
    are preserved.
    """
    words = remove_punctuation(text).split()
    return [word for word in words if word not in STOPWORDS]

In [None]:
# Create word frequency vector
def vectorize(tokens: list[str], vocabulary: set[str]) -> np.ndarray:
    """Build the term-frequency vector of *tokens* over *vocabulary*.

    Example: with token counts ``linear=2, machine=5, learning=10`` and a
    vocabulary that also contains ``cosine``, the entries for those words are
    ``2``, ``5``, ``10`` and ``0`` respectively.

    Args:
        tokens: Words of one document (duplicates are counted).
        vocabulary: Words that define the vector's dimensions. Component
            order follows the iteration order of this collection, so vectors
            that are to be compared must be built from the same vocabulary
            object.

    Returns:
        A 1-D float array with one count per vocabulary word.
    """
    word_counts = Counter(tokens)
    # Counter yields 0 for words it has never seen, so no default is needed.
    return np.array([word_counts[word] for word in vocabulary], dtype=float)

In [None]:
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between *vec1* and *vec2*.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a built-in ``float``, or
        ``0.0`` when either vector has zero norm (the cosine is undefined
        in that case).
    """
    dot_product = np.dot(vec1, vec2)
    # Compute the denominator once; it doubles as the zero-vector guard.
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return float(dot_product / denominator)

### Putting things together

In [None]:
# Input documents
file1_path, file2_path, file3_path = "file1.txt", "file2.txt", "file3.txt"

# Load the content of each document
text1, text2, text3 = (
    read_file(path) for path in (file1_path, file2_path, file3_path)
)

# Break each document into stopword-free tokens
tokens1, tokens2, tokens3 = (
    tokenize(text) for text in (text1, text2, text3)
)

# Shared vocabulary: the unique words across all three texts
# (after stopword removal)
vocabulary = set(tokens1 + tokens2 + tokens3)

# Turn each token list into a frequency vector over that vocabulary
vec1, vec2, vec3 = (
    vectorize(tokens, vocabulary) for tokens in (tokens1, tokens2, tokens3)
)

# Pairwise cosine similarities
similarity12 = cosine_similarity(vec1, vec2)
similarity13 = cosine_similarity(vec1, vec3)
similarity23 = cosine_similarity(vec2, vec3)

# Report the results to four decimal places
print(f"Cosine Similarity 1 & 2 : {similarity12:.4f}")
print(f"Cosine Similarity 1 & 3 : {similarity13:.4f}")
print(f"Cosine Similarity 2 & 3 : {similarity23:.4f}")