In [26]:
import streamlit as st
import re
from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from wordcloud import WordCloud

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Load NLTK's English stop-word list once; the text helpers defined later in
# this notebook filter tokens against this name.
stop_words = stopwords.words('english')
# Show the full list and then its length, one item per output line.
print(stop_words, len(stop_words), sep="\n")

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [41]:
def clean_text(text):
    """Normalize free text into a space-separated string of lowercase keywords.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, the result is tokenized with
    ``word_tokenize``, and tokens found in the notebook-level ``stop_words``
    collection are dropped.

    Args:
        text (str): Raw input text (e.g. a job description or a resume).

    Returns:
        str: The surviving tokens joined by single spaces; an empty string
        when nothing survives.
    """
    text = text.lower()
    # `\s` has to live INSIDE the negated class: "anything that is neither a
    # letter nor whitespace". With `\s` outside the class, only a non-letter
    # that happened to be followed by whitespace was removed, so digits and
    # punctuation embedded in a token or at the end of the text leaked through.
    # Replacing with a space (not "") keeps "cross-functional" as two words.
    text = re.sub(r"[^a-z\s]", " ", text)
    # Collapse newlines and whitespace runs left behind by the substitution.
    text = re.sub(r"\s+", " ", text).strip()
    # Build the lookup set once per call instead of scanning the stop-word
    # list for every token.
    excluded = set(stop_words)
    kept = [token for token in word_tokenize(text) if token not in excluded]
    return " ".join(kept)

def get_top_keywords(text, num=10):
    """Return the most frequent non-stop-word tokens in ``text``.

    Args:
        text (str): Text to tokenize with ``word_tokenize``.
        num (int): How many entries to return; passed straight to
            ``Counter.most_common``. Defaults to 10.

    Returns:
        list[tuple[str, int]]: ``(token, count)`` pairs, most frequent first.
    """
    # Count tokens lazily, skipping anything listed in the notebook-level
    # `stop_words` collection.
    tally = Counter(
        token for token in word_tokenize(text) if token not in stop_words
    )
    return tally.most_common(num)

def categorize_keywords(keywords, technical_skills, soft_skills):
    """Split keywords into technical, soft-skill, and uncategorized lists.

    Each output list keeps the order (and any duplicates) of ``keywords``.
    A keyword present in both skill collections appears in both the
    technical and the soft list.

    Args:
        keywords: Keywords to classify.
        technical_skills: Container of technical-skill terms (tested with ``in``).
        soft_skills: Container of soft-skill terms (tested with ``in``).

    Returns:
        tuple[list, list, list]: ``(tech, soft, other)``.
    """
    def is_technical(term):
        return term in technical_skills

    def is_soft(term):
        return term in soft_skills

    def is_uncategorized(term):
        return not (is_soft(term) or is_technical(term))

    # Three independent passes over `keywords`, one per category.
    tech_matches = list(filter(is_technical, keywords))
    soft_matches = list(filter(is_soft, keywords))
    leftovers = list(filter(is_uncategorized, keywords))
    return tech_matches, soft_matches, leftovers

def generate_wordcloud(text, title):
    """Render a word cloud of ``text`` in the Streamlit app.

    Args:
        text (str): Text handed to ``WordCloud.generate``.
        title (str): Heading drawn above the word-cloud image; skipped when
            empty or ``None``.

    Returns:
        None. The figure is sent to the page via ``st.pyplot``.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    # Use an explicit figure/axes pair rather than pyplot's implicit global
    # figure, so each call draws on its own canvas.
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.imshow(wordcloud, interpolation='bilinear')
    if title:
        # The title argument was previously accepted but never displayed.
        ax.set_title(title)
    ax.axis("off")
    st.pyplot(fig)
    # Release the figure once rendered so repeated calls do not accumulate
    # open matplotlib figures.
    plt.close(fig)

#### Testing

In [42]:
# Sample inputs for a quick end-to-end check of the helper functions.
job_description = """
Put your career in motion with a great opportunity. Work as a Smartsheet Data Analyst at GE.
Import and clean data from Excel and Smartsheet, create dashboards, collaborate with teams.
"""

resume_text = """
Experienced in Excel, Python, and data reporting. Collaborated with cross-functional teams and automated dashboards.
"""

# Reference vocabularies used to bucket the job-description keywords.
tech_keywords = {'excel', 'python', 'sql', 'smartsheet', 'data', 'automation', 'dashboards'}
soft_keywords = {'collaboration', 'communication', 'team', 'leadership'}

# Normalize both documents.
job_clean, resume_clean = clean_text(job_description), clean_text(resume_text)

# Keep only the keyword from each (keyword, count) pair.
job_kw = [pair[0] for pair in get_top_keywords(job_clean)]
resume_kw = [pair[0] for pair in get_top_keywords(resume_clean)]

# Set comparison: what the resume lacks and what both documents share.
job_set, resume_set = set(job_kw), set(resume_kw)

missing = job_set.difference(resume_set)
match = job_set.intersection(resume_set)

print("Matching Keywords:", match)
print("Missing from Resume:", missing)

# Bucket the job-description keywords by skill type.
tech, soft, other = categorize_keywords(job_kw, tech_keywords, soft_keywords)
print("\nTech Skills:", tech)
print("Soft Skills:", soft)
print("Other:", other)


Matching Keywords: {'data'}
Missing from Resume: {'work', 'opportunity', 'motion', 'put', 'ge', 'smartsheet', 'analyst', 'career', 'great'}

Tech Skills: ['smartsheet', 'data']
Soft Skills: []
Other: ['put', 'career', 'motion', 'great', 'opportunity', 'work', 'analyst', 'ge']
