# Cosine Similarity

There are several ways to calculate the similarity between two blocks of text in Python. A common method is to use cosine similarity, which measures the cosine of the angle between two vectors in a multidimensional space. The code below will print the cosine similarity score between the two input text blocks, which ranges from 0 to 1. A score closer to 1 indicates higher similarity.

In [20]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

# Download the stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Function to calculate cosine similarity between two texts
def calculate_similarity(text1, text2):
    # Preprocess the texts
    text1 = preprocess_text(text1)
    text2 = preprocess_text(text2)

    # Create the TfidfVectorizer object
    vectorizer = TfidfVectorizer()
    # Transform the texts to tf-idf vectors
    vectors = vectorizer.fit_transform([text1, text2])

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(vectors)
    similarity_score = similarity_matrix[0][1]

    return similarity_score

# Example texts
text1 = "Natural language processing makes it possible for computers to understand human language."
text2 = "Computers are able to comprehend human language through natural language processing."

# Calculate similarity
similarity = calculate_similarity(text1, text2)
print(f"Similarity score: {similarity}")

Similarity score: 0.6201272584968651


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Jaccard Similarity

In [2]:
import nltk
import string

# Download the stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to preprocess the text
def preprocess_text_jacc(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

# Function to calculate Jaccard similarity between two texts
def calculate_jaccard_similarity(text1, text2):
    # Preprocess the texts
    words1 = preprocess_text_jacc(text1)
    words2 = preprocess_text_jacc(text2)

    # Convert the lists of words to sets
    set1 = set(words1)
    set2 = set(words2)

    # Calculate intersection and union
    intersection = set1.intersection(set2)
    union = set1.union(set2)

    # Calculate Jaccard similarity
    jaccard_similarity = len(intersection) / len(union)

    return jaccard_similarity

# Example texts
text1 = "Natural language processing makes it possible for computers to understand human language."
text2 = "Computers are able to comprehend human language through natural language processing."

# Calculate similarity
similarity = calculate_jaccard_similarity(text1, text2)
print(f"Jaccard Similarity score: {similarity}")

Jaccard Similarity score: 0.5


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Document Embeddings

In [5]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
import string

# Download the stopwords from nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Function to preprocess the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

# Example texts
text1 = "Natural language processing makes it possible for computers to understand human language."
text2 = "Computers are able to comprehend human language through natural language processing."

# Preprocess the texts
documents = [preprocess_text(text1), preprocess_text(text2)]

# Create TaggedDocument objects for training
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

# Train a Doc2Vec model
model = Doc2Vec(tagged_documents, vector_size=50, window=2, min_count=1, workers=4)

# Infer vectors for the documents
vector1 = model.infer_vector(preprocess_text(text1))
vector2 = model.infer_vector(preprocess_text(text2))

# Calculate cosine similarity
#similarity = calculate_similarity(vector1, vector2)
# Create the TfidfVectorizer object
vectorizer = TfidfVectorizer()
# Transform the texts to tf-idf vectors
vectors = vectorizer.fit_transform([text1, text2])

# Calculate cosine similarity
similarity_matrix = cosine_similarity(vectors)
similarity = similarity_matrix[0][1]

print(f"Doc2Vec Similarity score: {similarity}")

Doc2Vec Similarity score: 0.5038711573210972


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocess Data

In [15]:

import os
import pandas as pd

mentees = pd.read_excel('./drive/MyDrive/Georgia Tech/CS 8903/mentees.xlsx')
mentors = pd.read_excel('./drive/MyDrive/Georgia Tech/CS 8903/mentors.xlsx')
#print(mentees.head())
mentors.head()[0:1]

Unnamed: 0,Id,Start time,Completion time,Email,Name,Full name,Georgia Tech email address (.edu),Please describe your past professional experience.,Please describe your past academic experience.,What are your goals from participating in this program?,Have you ever published an academic paper?,What are your plans after graduating from OMSCS?,Please upload your resume as a PDF.
0,1,2024-05-27 10:41:14,2024-05-27 10:44:05,jshimer3@gatech.edu,Jacob Shimer,Jacob Shimer,jshimer3@gatech.edu,4 years as a backend programmer for a R&D comp...,Achieved my Bachelors of Science in Computer S...,,No,"No plans currently, continuing working as a so...",https://gtvault-my.sharepoint.com/personal/apa...


In [14]:
mentees_responses = mentees[['Name',
                             'Please describe your past professional experience.',
                             'Please describe your past academic experience.',
                             'What are your goals from participating in this program?',
                             'Why are you interested in research?',
                             'What are your career goals?',
                             'What are your plans after graduating from OMSCS?']]
mentees_responses.head()[0:1]

Unnamed: 0,Name,Please describe your past professional experience.,Please describe your past academic experience.,What are your goals from participating in this program?,Why are you interested in research?,What are your career goals?,What are your plans after graduating from OMSCS?
0,Ahmed Mohamed,My past professional experience has been prima...,My past academic experience before starting OM...,My goals from participating in this program is...,I am definitely very new to the world of resea...,My career goals have been shifting over the pa...,After graduating from OMSCS I would like to ex...


In [17]:
mentors_responses = mentors[['Name',
                             'Please describe your past professional experience.',
                             'Please describe your past academic experience.',
                             'What are your goals from participating in this program?',
                             'What are your plans after graduating from OMSCS?']]
mentors_responses.head()[0:1]

Unnamed: 0,Name,Please describe your past professional experience.,Please describe your past academic experience.,What are your goals from participating in this program?,What are your plans after graduating from OMSCS?
0,Jacob Shimer,4 years as a backend programmer for a R&D comp...,Achieved my Bachelors of Science in Computer S...,,"No plans currently, continuing working as a so..."


In [21]:
def unify_text(df):
  """
  Unifies text across all columns left to right except the column called Name and creates a new column "Unified Text".
  """

  # Get all column names except "Name".
  cols = [col for col in df.columns if col != "Name"]

  # Concatenate the values from all columns except "Name" with a space.
  df["Unified Text"] = df[cols].apply(lambda row: " ".join(row.astype(str).values), axis=1)

  return df

mentors_responses = unify_text(mentors_responses)
mentees_responses = unify_text(mentees_responses)

mentors_responses['Unified Text'] = mentors_responses['Unified Text'].apply(lambda x : preprocess_text(x))
mentees_responses['Unified Text'] = mentees_responses['Unified Text'].apply(lambda x : preprocess_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Unified Text"] = df[cols].apply(lambda row: " ".join(row.astype(str).values), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mentors_responses['Unified Text'] = mentors_responses['Unified Text'].apply(lambda x : preprocess_text(x))


In [23]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(df_mentors, df_mentees, similarity_method):
  """
  Computes cosine similarity between "Unified Text" columns of every mentor and mentee.
  """

  # Create a list to store the results.
  results = []

  # Loop through each mentor.
  for i, mentor_row in df_mentors.iterrows():
    # Get the mentor's "Unified Text".
    mentor_text = mentor_row["Unified Text"]

    # Loop through each mentee.
    for j, mentee_row in df_mentees.iterrows():
      # Get the mentee's "Unified Text".
      mentee_text = mentee_row["Unified Text"]

      # Calculate the cosine similarity between the mentor and mentee texts.
      similarity = similarity_method(mentor_text, mentee_text)

      # Store the result in the list.
      results.append({
        "Mentor": mentor_row["Name"],
        "Mentee": mentee_row["Name"],
        "Cosine Similarity": similarity
      })

  # Convert the results to a DataFrame.
  df_results = pd.DataFrame(results)

  return df_results

df_results = compute_cosine_similarity(mentors_responses, mentees_responses, calculate_similarity)
print(df_results)

          Mentor                      Mentee  Cosine Similarity
0   Jacob Shimer               Ahmed Mohamed           0.153252
1   Jacob Shimer  Guru Raj Vaishnav Akuthota           0.140040
2   Jacob Shimer             Andrei Valasiuk           0.068111
3   Jacob Shimer              Han Qiao Zhang           0.031175
4   Jacob Shimer               Chenghao Wang           0.090290
..           ...                         ...                ...
95   Thomas Orth                  Ren Yi Tan           0.085307
96   Thomas Orth              Kelsi Blauvelt           0.120600
97   Thomas Orth                  Yallen Bai           0.054397
98   Thomas Orth          Bharat Raghunathan           0.064327
99   Thomas Orth               Hannah Ismail           0.052795

[100 rows x 3 columns]


In [25]:
def assign_mentees_by_similarity(df_results):
  """
  Assigns mentees to mentors based on cosine similarity, avoiding duplicates.
  """

  # Create a dictionary to store the assigned mentees.
  assigned_mentees = {}

  # Loop through each mentor.
  for mentor, group_df in df_results.groupby("Mentor"):
    # Sort the mentees by cosine similarity in descending order.
    sorted_mentees = group_df.sort_values(by="Cosine Similarity", ascending=False)

    # Assign the first available mentee.
    for mentee in sorted_mentees["Mentee"]:
      if mentee not in assigned_mentees.values():
        assigned_mentees[mentor] = mentee
        break

  return assigned_mentees

assigned_mentees = assign_mentees_by_similarity(df_results)
print(assigned_mentees)

{'Adam Thomas': 'Bharat Raghunathan', 'Aiden Campbell': 'Chenghao Wang', 'Anita Cheung': 'Ahmed Mohamed', 'Elan Grossman': 'Kelsi Blauvelt', 'Jacob Shimer': 'Hannah Ismail', 'Junsoo Park': 'Yallen Bai', 'Kailey Cozart': 'Ren Yi Tan', 'Nikhil Kapila': 'Guru Raj Vaishnav Akuthota', 'Thomas Deatherage': 'Andrei Valasiuk', 'Thomas Orth': 'Han Qiao Zhang'}


# Aside: Convert PDF to Text to Extract Resume Text

The below code block will be a useful starting point to convert resume PDFs to text.

In [6]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.4 PyMuPDFb-1.24.3


In [8]:
import fitz  # PyMuPDF

# Function to convert PDF to text
def pdf_to_text(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    text = ""

    # Iterate over each page and extract text
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        text += page.get_text()

    return text

# Example usage with my resume
pdf_path = '/content/Parikh_Ayush_2024_Resume.pdf'  # Path to your PDF file
text = pdf_to_text(pdf_path)
print(text)

 
AYUSH PARIKH 
ayushnparikh@gmail.com | (732) 762-8460 | Seattle, WA 
linkedin.com/in/a-parikh | github.com/ayushparikh-microsoft | github.com/ayusoccer 
 
EDUCATION 
Georgia Institute of Technology, School of Computing | Atlanta, GA   
 
                                                             May 2025 
Masters of Science in Computer Science                                                                                                            Current Cumulative GPA: 4.0/4.0 
Specialization: Machine Learning 
Coursework: Machine Learning for Trading, Deep Learning, Network Science, Machine Learning, Big Data for Health, Quantum 
Computing – substantial programming experience in Python through all of these classes; final projects such as RL trading bot 
 
University of Pennsylvania, School of Engineering and Applied Science | Philadelphia, PA 
 
 
             May 2021 
Bachelor of Science in Engineering 
Major: Computer Science | Minors: Engineering Entrepreneurship, Data Scie