# Cosine Similarity

There are several ways to calculate the similarity between two blocks of text in Python. A common method is to use cosine similarity, which measures the cosine of the angle between two vectors in a multidimensional space. The code below will print the cosine similarity score between the two input text blocks, which ranges from 0 to 1. A score closer to 1 indicates higher similarity.

In [1]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

# Download the stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Function to calculate cosine similarity between two texts
def calculate_similarity(text1, text2):
    # Preprocess the texts
    text1 = preprocess_text(text1)
    text2 = preprocess_text(text2)

    # Create the TfidfVectorizer object
    vectorizer = TfidfVectorizer()
    # Transform the texts to tf-idf vectors
    vectors = vectorizer.fit_transform([text1, text2])

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(vectors)
    similarity_score = similarity_matrix[0][1]

    return similarity_score

# Example texts
text1 = "Natural language processing makes it possible for computers to understand human language."
text2 = "Computers are able to comprehend human language through natural language processing."

# Calculate similarity
similarity = calculate_similarity(text1, text2)
print(f"Similarity score: {similarity}")

Similarity score: 0.6201272584968651


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Jaccard Similarity

In [2]:
import nltk
import string

# Download the stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

# Function to calculate Jaccard similarity between two texts
def calculate_jaccard_similarity(text1, text2):
    # Preprocess the texts
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)

    # Convert the lists of words to sets
    set1 = set(words1)
    set2 = set(words2)

    # Calculate intersection and union
    intersection = set1.intersection(set2)
    union = set1.union(set2)

    # Calculate Jaccard similarity
    jaccard_similarity = len(intersection) / len(union)

    return jaccard_similarity

# Example texts
text1 = "Natural language processing makes it possible for computers to understand human language."
text2 = "Computers are able to comprehend human language through natural language processing."

# Calculate similarity
similarity = calculate_jaccard_similarity(text1, text2)
print(f"Jaccard Similarity score: {similarity}")

Jaccard Similarity score: 0.5


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Document Embeddings

In [5]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
import string

# Download the stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

# Example texts
text1 = "Natural language processing makes it possible for computers to understand human language."
text2 = "Computers are able to comprehend human language through natural language processing."

# Preprocess the texts
documents = [preprocess_text(text1), preprocess_text(text2)]

# Create TaggedDocument objects for training
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

# Train a Doc2Vec model
model = Doc2Vec(tagged_documents, vector_size=50, window=2, min_count=1, workers=4)

# Infer vectors for the documents
vector1 = model.infer_vector(preprocess_text(text1))
vector2 = model.infer_vector(preprocess_text(text2))

# Calculate cosine similarity
#similarity = calculate_similarity(vector1, vector2)
# Create the TfidfVectorizer object
vectorizer = TfidfVectorizer()
# Transform the texts to tf-idf vectors
vectors = vectorizer.fit_transform([text1, text2])

# Calculate cosine similarity
similarity_matrix = cosine_similarity(vectors)
similarity = similarity_matrix[0][1]

print(f"Doc2Vec Similarity score: {similarity}")

Doc2Vec Similarity score: 0.5038711573210972


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Aside: Convert PDF to Text to Extract Resume Text

The below code block will be a useful starting point to convert resume PDFs to text.

In [6]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.4 PyMuPDFb-1.24.3


In [None]:
import fitz  # PyMuPDF

# Function to convert PDF to text
def pdf_to_text(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    text = ""

    # Iterate over each page and extract text
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()

    return text

# Example usage with my resume
pdf_path = '/content/Parikh_Ayush_2024_Resume.pdf'  # Path to your PDF file
text = pdf_to_text(pdf_path)
print(text)