In [18]:
from tarfile import version

import streamlit as st
import re
from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from wordcloud import WordCloud

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Set stop words to a variable
stop_words = stopwords.words('english')
print(stop_words)
print(len(stop_words))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [20]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z]\s", " ", text)
    text = re.sub(r"\n", " ", text)
    words = word_tokenize(text)
    word = [word for word in words if word not in stop_words]
    return " ".join(word)

def get_top_keywords(text, num=10):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return Counter(words).most_common(num)

def categorize_keywords(keywords, technical_skills, soft_skills):
    tech = [word for word in keywords if word in technical_skills]
    soft = [word for word in keywords if word in soft_skills]
    other = [word for word in keywords if word not in soft_skills and word not in technical_skills]
    return tech, soft, other

def generate_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.title(title)
    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    #st.pyplot(plt)

#### Testing

In [21]:
job_description = """
Put your career in motion with a great opportunity. Work as a Smartsheet Data Analyst at GE.
Import and clean data from Excel and Smartsheet, create dashboards, collaborate with teams.
"""

resume_text = """
Experienced in Excel, Python, and data reporting. Collaborated with cross-functional teams and automated dashboards.
"""

tech_keywords = {'excel', 'python', 'sql', 'smartsheet', 'data', 'automation', 'dashboards'}
soft_keywords = {'collaboration', 'communication', 'team', 'leadership'}

# Clean and process
job_clean = clean_text(job_description)
resume_clean = clean_text(resume_text)

# Get top keywords
job_kw = [word for word, _ in get_top_keywords(job_clean)]
resume_kw = [word for word, _ in get_top_keywords(resume_clean)]

# Compare
job_set = set(job_kw)
resume_set = set(resume_kw)

missing = job_set - resume_set
match = job_set & resume_set

print("Matching Keywords:", match)
print("Missing from Resume:", missing)

# Categorize
tech, soft, other = categorize_keywords(job_kw, tech_keywords, soft_keywords)
print("\nTech Skills:", tech)
print("Soft Skills:", soft)
print("Other:", other)


Matching Keywords: {'data'}
Missing from Resume: {'great', 'motion', 'ge', 'analyst', 'career', 'smartsheet', 'work', 'opportunity', 'put'}

Tech Skills: ['smartsheet', 'data']
Soft Skills: []
Other: ['put', 'career', 'motion', 'great', 'opportunity', 'work', 'analyst', 'ge']


In [22]:
generate_wordcloud(resume_text,title = 'test')

In [25]:
string1 = set(['i','love','you'])
string2 = set(['i'])

string1 - string2

{'love', 'you'}

In [34]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk import pos_tag
from nltk.util import ngrams
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Set up
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


# Define your helper functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\n", " ", text)
    return text

def tokenize_and_filter(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    tagged = pos_tag(tokens)
    filtered = [lemmatizer.lemmatize(word) for word,tag in tagged if tag.startswith('N') or tag.startswith('V')]

    bigrams = [" ".join(gram) for gram in ngrams(filtered, 2)]
    trigrams = [" ".join(gram) for gram in ngrams(filtered, 3)]

    return filtered + bigrams + trigrams
def get_top_keywords(tokens, num=50):
    return Counter(tokens).most_common(num)

# Example job description
job_description = """
We are looking for a data analyst with strong communication and leadership skills.
The candidate must have experience with Python, Excel, and data visualization tools.
"""

# Clean and process
cleaned = clean_text(job_description)
job_tokens = tokenize_and_filter(cleaned)
jop_top = get_top_keywords(job_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jahdovanterpool/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [35]:
print(jop_top)

[('data', 2), ('looking', 1), ('analyst', 1), ('communication', 1), ('leadership', 1), ('skill', 1), ('candidate', 1), ('experience', 1), ('excel', 1), ('visualization', 1), ('tool', 1), ('looking data', 1), ('data analyst', 1), ('analyst communication', 1), ('communication leadership', 1), ('leadership skill', 1), ('skill candidate', 1), ('candidate experience', 1), ('experience excel', 1), ('excel data', 1), ('data visualization', 1), ('visualization tool', 1), ('looking data analyst', 1), ('data analyst communication', 1), ('analyst communication leadership', 1), ('communication leadership skill', 1), ('leadership skill candidate', 1), ('skill candidate experience', 1), ('candidate experience excel', 1), ('experience excel data', 1), ('excel data visualization', 1), ('data visualization tool', 1)]
