In [1]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the mount point is named so
# later cells can build file paths from a single place.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [10]:
import chardet
import string

def read_file(file_path, fallback_encoding='utf-8'):
    """
    Reads a text file and returns its contents normalised for word counting.

    The file is read once as raw bytes, its encoding is guessed with chardet,
    and the decoded text is returned with ASCII punctuation removed and every
    letter converted to lower case.

    Args:
        file_path: Path of the file to read.
        fallback_encoding: Encoding used when chardet cannot name one (for
            example for an empty file) or when decoding with its guess fails.

    Returns:
        The normalised file contents as a str.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as f:
        raw = f.read()

    # chardet reports an encoding of None when it has no guess; without an
    # explicit fallback the result would depend on the platform's locale.
    encoding = chardet.detect(raw).get('encoding') or fallback_encoding

    try:
        contents = raw.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # The guess was wrong, or names a codec Python does not provide.
        # Degrade to the fallback (undecodable bytes become U+FFFD) rather
        # than aborting the whole comparison.
        contents = raw.decode(fallback_encoding, errors='replace')

    # Normalise line endings to '\n', matching what text-mode reading yields.
    contents = contents.replace('\r\n', '\n').replace('\r', '\n')

    # Remove punctuation first, then lower-case.
    contents = contents.translate(str.maketrans('', '', string.punctuation))
    return contents.lower()
    
import math
from collections import Counter

def get_word_counts(text):
    """
    Tallies how often each whitespace-separated word occurs in ``text``.

    Returns a Counter mapping every word to its number of occurrences.
    """
    frequencies = Counter()
    frequencies.update(text.split())
    return frequencies

def cosine_similarity(file1, file2):
    """
    Calculates the cosine similarity between two text files using the
    bag-of-words model.

    Each file is loaded with read_file and turned into word frequencies with
    get_word_counts; the frequencies are treated as vectors over the combined
    vocabulary of both files.

    Args:
        file1: Path of the first text file.
        file2: Path of the second text file.

    Returns:
        The cosine of the angle between the two word-count vectors as a
        float. Returns 0.0 when either file contains no words, because the
        cosine is undefined for a zero vector.
    """
    word_counts1 = get_word_counts(read_file(file1))
    word_counts2 = get_word_counts(read_file(file2))

    # Words missing from either text contribute 0 to the dot product, so only
    # the shared vocabulary needs to be visited.
    dot_product = sum(
        count * word_counts2[word]
        for word, count in word_counts1.items()
        if word in word_counts2
    )
    magnitude1 = math.sqrt(sum(count * count for count in word_counts1.values()))
    magnitude2 = math.sqrt(sum(count * count for count in word_counts2.values()))

    # An empty (or punctuation-only) file yields a zero vector; guard the
    # division instead of raising ZeroDivisionError.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    return dot_product / (magnitude1 * magnitude2)



In [12]:
# NOTE(review): both paths are empty strings here — presumably the real file
# locations were removed before sharing; fill them in before running.
first_path = ''
second_path = ''
similarity_score = cosine_similarity(first_path, second_path)
print(f"Similarity score: {similarity_score}")


Similarity score: 0.9360545790329462
