In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# Optional: download nltk stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/school/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Display full width of text in each column
pd.set_option('display.max_colwidth', None)

# Optional: display more columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)  # Widen total printed width

# 📂 2. Load data
df = pd.read_csv("responsibilities.csv")  # Or wherever you saved your file
print(df.loc[0, 'responsibilities'])  # first row
example = df.loc[0, 'responsibilities']

# Drop rows where 'responsibilities' is NaN or an empty string after stripping whitespace
df = df[~df['responsibilities'].isna()]                 # removes NaN
df = df[df['responsibilities'].str.strip() != '']      # removes empty strings or strings with only whitespace

# we only care about these following titles
# Sample predefined keywords
keywords = ['Data Scientist', 'Analyst', 'Engineer', 'Data']
# Keep rows where 'title' contains any of the keywords (case-insensitive)
df = df[df['title'].str.contains('|'.join(keywords), case=False, na=False)]


Building a high-quality and scalable state-of-the-art data warehouse that powers all decision making | Leveraging machine learning and algorithms to help Wealthsimple build smarter financial products | Using decision science to understand the cause and effect of our business decisions | Are driven to ship frequently | Take ownership and pride in working on projects to successful completion | Enjoy tackling new, complex challenges on a regular basis | Are comfortable with self-directed learning | Believe that debate, inclusivity and transparency result in better products | Are eager to learn from your team and others. We value making one another successful! | Familiarity with Python, SQL | Basic understanding of statistics: either frequentist and Bayesian approaches | Basic understanding of fundamental machine-learning algorithms: regression and decision trees | First-hand experience working with popular Python libraries such as Pandas, scikt-learn, numpy and Jupyter | Excellent communi

In [4]:
# i notice there are words that I should remove

def parse_job_keywords(text):
    return [item.strip() for item in text.split('|')]
#text = "Python | SQL | machine learning | data analysis"
# ['Python', 'SQL', 'machine learning', 'data analysis']
# Example usage
input_text = """Government | Administrative Assistant | Social Worker | Teacher | Part time | Remote | Weekend | Summer | Temporary | Amazon | Air Canada | Care.com | Tim Hortons | Starbucks | Home Depot | Scotiabank | Walmart | Sobey's | BMO Harris Bank | Calgary | Edmonton | Winnipeg | Brampton | Ottawa | Saskatoon | Toronto | Windsor | London | Regina | Browse jobs | Submit a feed | About | Careers | Contact | Privacy | Terms & Conditions"""

lst_of_unimportant_words = parse_job_keywords(input_text)

In [81]:
example = df[4:5]["responsibilities"]
print(example)

6    AI Factory - Versatile Teams Operating in Cross Functional Pods:Utilizing digital and data resources to develop AI products, bringing data management, AI and product development skills to products, programs and projects to create an agile, fulfilling and meaningful work environment. | Leading Edge Tech Stack:Experience building products that will be deployed globally on a leading-edge tech stack. | World Class Mentorship and Training:Working with renowned leaders and academics in machine learning to further develop your skillsets | Propose and establish technical designs to meet business and technical requirements | Develop and maintain data engineering solutions based on requirements and design specifications using appropriate tools and technologies | Create data pipelines / ETL pipelines and optimize performance | Test and validate developed solution to ensure it meets requirements | Create design and development documentation based on standards for knowledge transfer, training,

# lemmatize, lower case, deduplicate repeating rows,Split on pipe symbol

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Splits text into sentences and Splits sentences into words resp
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer #Converts text into numerical vectors using TF-IDF
from sklearn.metrics.pairwise import cosine_similarity #Measures similarity between two text vectors

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize(sentence):
    words = word_tokenize(sentence.lower())
    filtered = [lemmatizer.lemmatize(w) for w in words if w.isalpha() and w not in stop_words]
    return " ".join(filtered) # Joins the cleaned words back into a single string.

def deduplicate_fuzzy(sentences, threshold=0.9):
    if len(sentences) <= 1:
        return sentences

    vectorizer = TfidfVectorizer().fit_transform(sentences) #Vectorizes each sentence using TF-IDF.
    sim_matrix = cosine_similarity(vectorizer) #Computes cosine similarity between all sentence pairs.

    keep = []
    seen = set()
    for i, row in enumerate(sim_matrix):
        if i in seen:
            continue
        keep.append(sentences[i])
        for j in range(i + 1, len(row)):
            if row[j] >= threshold:
                seen.add(j)
    return keep

def preprocess_responsibility(responsibility_str, lst_of_unimportant_words=None):
    #If the input is not a string (e.g., null), return an empty list.
    if not isinstance(responsibility_str, str):
        return []
    #Initialize a list of garbage or "ignore" sentences (optional filter).
    if lst_of_unimportant_words is None:
        lst_of_unimportant_words = []

    # Step 1: Break on pipe and segment sentences
    #Splits the string on the pipe | delimiter.
    #Tokenizes each part into sentences if it's longer than 10 characters.
    parts = responsibility_str.split("|")
    raw_sentences = []
    for part in parts:
        part = part.strip()
        if len(part) > 10:
            raw_sentences.extend(sent_tokenize(part))

    # Step 2: Clean and lemmatize
    cleaned = []
    for s in raw_sentences:
        s_clean = s.strip() #strip removes any leading, and trailing whitespaces.
        if len(s_clean) >= 20 and not s_clean.lower().startswith(
            ("about us", "who we are", "we offer", "our company", "location:", "job type:", "similar jobs")
        ) and s_clean not in lst_of_unimportant_words:
            lemmatized = clean_and_lemmatize(s_clean)
            if len(lemmatized.split()) >= 3: #Split is used to break a delimited string into substrings
                cleaned.append(lemmatized)

    # Step 3: Deduplicate using fuzzy cosine similarity
    deduped = deduplicate_fuzzy(cleaned, threshold=0.8)

    return deduped

## preprocess_responsibility("Analyze large datasets | Write SQL queries | About Us: We love data!", lst_of_gar=["We love data!"])
 ## returns something like ["analyze large dataset", "write sql query"]

df["cleaned_responsibility"] = df["responsibilities"].apply(lambda x: preprocess_responsibility(x, lst_of_unimportant_words))


[nltk_data] Downloading package punkt to /Users/school/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/school/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/school/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Now preprocessing is done, we dive into EDA

## If a skill like "python" appears multiple times in a single job post, it should count only once for that post.

In [11]:
from collections import Counter
# Step 2: Define known skills/tools
skill_keywords = {
    "python", "sql", "r", "excel", "tableau", "power bi", "pandas", "numpy", "matplotlib",
    "tensorflow", "keras", "pytorch", "scikit-learn", "docker", "git", "airflow",
    "gcp", "aws", "azure", "linux", "bash", "hadoop", "spark", "lookml", "looker",
    "jupyter", "notebook", "jira", "powerpoint", "presentation", "mlflow", "snowflake",
    "bigquery", "databricks", "dbt"
}

# Step 3: Extract and match words
matched_skills = [] # will store all skill keywords that are found in df
for resp_list in df["cleaned_responsibility"]: # iterates through each row in DF; df["cleaned_responsibility"] is a Series where each row is a list of cleaned sentences from a single job description
    skills_in_post = set() 
    
    for sentence in resp_list:
        words = sentence.split()
        for word in words:
            if word in skill_keywords:
                skills_in_post.add(word)
                
    matched_skills.extend(skills_in_post)

# Step 4: Count matched skills
skill_counts = Counter(matched_skills)

# Step 5: Display trending skills/tools
print("Trending Skills/Tools:\n")
for skill, freq in skill_counts.most_common(20):
    print(f"{skill}: {freq}")




Trending Skills/Tools:

python: 362
sql: 305
aws: 229
azure: 157
gcp: 122
spark: 112
r: 109
airflow: 84
tableau: 84
tensorflow: 83
docker: 76
snowflake: 74
pytorch: 73
databricks: 63
presentation: 63
excel: 59
hadoop: 56
git: 54
mlflow: 36
bigquery: 34


# can build streamlit to visualize and UI above


In [12]:
from nltk.util import ngrams
all_bigrams = []

for resp_list in df["cleaned_responsibility"]:
    for sentence in resp_list:
        tokens = sentence.split()
        bigrams = list(ngrams(tokens, 2))  # 2-word combinations
        all_bigrams.extend([" ".join(bg) for bg in bigrams])


bigram_counts = Counter(all_bigrams)

# Filter out bigrams that occur less than 5 times (adjust as needed)
filtered_bigrams = {k: v for k, v in bigram_counts.items() if v >= 5}

# Show top bigrams
print("Top skill-related bigrams:")
for phrase, freq in Counter(filtered_bigrams).most_common(20):
    print(f"{phrase}: {freq}")

Top skill-related bigrams:
job salary: 1671
salary range: 1295
number job: 1114
apply job: 1108
machine learning: 919
data science: 850
privacy notice: 564
similar job: 563
job stats: 556
stats job: 556
salary number: 556
range receive: 556
receive similar: 556
job email: 556
email create: 556
create alert: 556
alert creating: 556
creating alert: 556
alert agree: 556
agree c: 556


## To filter and display the most common bigrams (2-word phrases) that contain any word from a list of relevant keywords

In [13]:
relevant_keywords = {"data", "learning", "cloud", "project", "management", "model", "analysis", "engineering"}

skill_bigrams = {k: v for k, v in filtered_bigrams.items() if any(word in k for word in relevant_keywords)}

for phrase, freq in Counter(skill_bigrams).most_common(20):
    print(f"{phrase}: {freq}")


machine learning: 919
data science: 850
data scientist: 554
data pipeline: 386
experience data: 330
data engineering: 320
data engineer: 235
data analytics: 225
data platform: 206
data product: 189
engineering team: 177
data analysis: 176
big data: 167
data management: 158
data governance: 153
software engineering: 150
learning model: 141
data quality: 136
cloud platform: 134
data visualization: 124


In [28]:
import matplotlib
import os

# Locate the built-in DejaVuSans.ttf that is compatible
font_path = os.path.join(matplotlib.get_data_path(), "fonts/ttf/DejaVuSans.ttf")
print(font_path)

from PIL import ImageFont
ImageFont.truetype(font_path, 20)  # ✅ Should not raise an error


/Users/school/opt/miniconda3/lib/python3.8/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf


<PIL.ImageFont.FreeTypeFont at 0x7ff62dd59610>

In [30]:
import matplotlib
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os

font_path = os.path.join(matplotlib.get_data_path(), "fonts/ttf/DejaVuSans.ttf")

from PIL import ImageFont
ImageFont.truetype(font_path, 20)  # Confirm font is usable

text_blob = " ".join([" ".join(x) for x in df["cleaned_responsibility"]])

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    font_path=font_path
).generate(text_blob)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


ValueError: Only supported for TrueType fonts

In [94]:
df[0:20]['title']

0                           Intern, Data Science (Fall 2025)
1                                             Data Scientist
3                       Associate Director, Data Science FSP
5                                    Senior Backend engineer
6                                              Data Engineer
11                                     Senior MLOps Engineer
19                      Customer Experience Insights Analyst
21               Full Stack .NET Software Engineer/Developer
23                                 Staff Full Stack Engineer
24                                       Azure Data Engineer
29                 Lead Data Scientist - Feature Engineering
30    Lead Data Scientist - Articial Intelligence Specialist
31                                             Data Engineer
34                                   ETL DataStage Developer
37                            Software Engineer, Marketplace
38                                   Senior Backend Engineer
39               Senior 

In [None]:
# filter for stop words

# Use simple frequency counts or TF-IDF to find the most frequent terms and phrases (e.g., “Python”, “machine learning”, “communication skills”).

Lemmatization or stemming to group word forms.

# Filter stopwords like “the”, “and”, “with”.

# Visualize common words for quick thematic understanding.


Job title and location analysis
Group by job title or location to see how requirements vary by position or city.

In [None]:
# bag of words => change to numerical vector
# word vectors
# skip gram

# regex
# steming and lementization
# spell correction
# part of speech tagging
# 

Job Market Insights Dashboard
Problem it solves: Applicants don't know which skills are trending.

What you build:

Scrape thousands of jobs over time.

Use NLP to extract:

Most in-demand skills

Role-specific trends (e.g., ML roles vs DS roles)

Salary estimates (if available)

Visualize with a dashboard (Plotly/Dash or Streamlit).

Skills demonstrated: NLP, web scraping, data engineering, interactive dashboards, trend analysis.

Cover Letter Generator


Applicants don’t know how well they match a job.
Outputs a match score (e.g., 0-100), based on:

Skills match

Experience overlap

Role titles

Could be done with semantic similarity using sentence transformers or fine-tuned BERT.

Bonus: Explain the score ("you’re missing 3 of 5 core skills") with interpretability.