In [None]:
# 1. Imports & Paths
import importlib.util
import os, sys, re, pandas as pd
# Put ../utils (relative to the working directory) on the module search path
sys.path.append(os.path.join(os.pardir, "utils"))

# Input CSV locations, all resolved under ../data
DATA_DIR = os.path.join(os.pardir, "data")
resume_path, jobs_path = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("resumes_cleaned.csv", "jobs_cleaned.csv")
)

In [None]:
# 2. Load Data
# Read both CSVs (resumes first, then job postings) into DataFrames
resume_df, job_posts_df = (
    pd.read_csv(csv_path) for csv_path in (resume_path, jobs_path)
)

In [None]:
%load_ext autoreload
%autoreload 2
from utils import *


In [None]:
# Character count of every document, via the pandas .str accessor
resume_lengths = resume_df['Resume_str'].str.len()
job_lengths = job_posts_df['JobDescription'].str.len()

# Side-by-side histograms: resumes on the left, job descriptions on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
length_panels = (
    (ax1, resume_lengths, 'Resume Text Length Distribution'),
    (ax2, job_lengths, 'Job Description Length Distribution'),
)
for panel_ax, char_counts, panel_title in length_panels:
    panel_ax.hist(char_counts, bins=50, alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Character Count')
plt.show()

In [None]:
# Get most common words in resumes and job postings
def get_top_words(text_series, n=45):
    """Return the ``n`` most frequent words across a collection of texts.

    Each text is lower-cased and split on whitespace. English stopwords and
    words of two characters or fewer are discarded before counting.
    Punctuation is not stripped, so ``"python,"`` and ``"python"`` are
    counted as different words.

    Args:
        text_series: Iterable of documents (e.g. a pandas Series). Entries
            that are not strings (NaN / None from missing CSV cells) are
            skipped instead of raising.
        n: Number of (word, count) pairs to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first. Empty when no
        usable words are found.
    """
    stop_words = set(stopwords.words('english'))
    word_counts = Counter()
    # Count document by document rather than joining the whole corpus into
    # one string: same tokens and same tie order, far less peak memory.
    for text in text_series:
        if not isinstance(text, str):
            continue
        word_counts.update(
            word for word in text.lower().split()
            if word not in stop_words and len(word) > 2
        )
    return word_counts.most_common(n)

top_resume_words = get_top_words(resume_df['Resume_str'])
top_job_words = get_top_words(job_posts_df['JobDescription'])
print(top_resume_words)
print(top_job_words)

# Plot word frequencies: one horizontal bar chart per corpus, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
word_panels = (
    (ax1, top_resume_words, 'Top Words in Resumes'),
    (ax2, top_job_words, 'Top Words in Job Descriptions'),
)
for panel_ax, word_freqs, panel_title in word_panels:
    # Split the (word, count) pairs into two parallel tuples for barh
    words, counts = zip(*word_freqs)
    panel_ax.barh(words, counts)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

In [None]:
# Common skills extraction (simplified)
# Add/update list with keywords we are interested in
skills_keywords = ['python', 'java', 'sql', 'machine learning', 'aws',
                   'docker', 'kubernetes', 'react', 'node.js', 'tensorflow']

def count_skills(text, skills_list):
    """Count how many of the given skills occur in ``text``.

    Matching is a case-insensitive substring test, so a skill is counted at
    most once per text and ``'java'`` is also found inside ``'javascript'``.

    Args:
        text: Document to search. Non-string values (e.g. NaN or None from a
            missing CSV cell) are treated as containing no skills.
        skills_list: Iterable of skill keywords.

    Returns:
        Number of distinct entries of ``skills_list`` found in ``text``.
    """
    if not isinstance(text, str):
        return 0
    text_lower = text.lower()
    # Lower-case the keyword too, otherwise a skill written as 'AWS' could
    # never match the lower-cased text.
    return sum(1 for skill in skills_list if skill.lower() in text_lower)

# Count skills in resumes and job postings
# Lower-case each text column once instead of once per keyword.
resume_text_lower = resume_df['Resume_str'].str.lower()
job_text_lower = job_posts_df['JobDescription'].str.lower()

for skill in skills_keywords:
    # regex=False: keywords are literal text (the '.' in 'node.js' must not
    # act as a regex wildcard). na=False: a missing text counts as "skill
    # absent" instead of leaving NaN in the flag column.
    resume_df[f'resume_has_{skill}'] = resume_text_lower.str.contains(skill, regex=False, na=False)
    job_posts_df[f'job_has_{skill}'] = job_text_lower.str.contains(skill, regex=False, na=False)

# Plot skills frequency
# Summing the boolean flags gives the number of documents mentioning each skill.
resume_skills_count = resume_df[[f'resume_has_{skill}' for skill in skills_keywords]].sum()
job_skills_count = job_posts_df[[f'job_has_{skill}' for skill in skills_keywords]].sum()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
resume_skills_count.plot(kind='barh', ax=ax1)
ax1.set_title('Skills Frequency in Resumes')
ax1.set_xlabel('Number of documents')
job_skills_count.plot(kind='barh', ax=ax2)
ax2.set_title('Skills Frequency in Job Postings')
ax2.set_xlabel('Number of documents')
plt.tight_layout()
plt.show()

In [None]:
# Bigram analysis
def plot_top_ngrams(text_series, n=2, top_k=45, title=None):
    """Plot the ``top_k`` most frequent word n-grams as a horizontal bar chart.

    Args:
        text_series: Iterable of documents (e.g. a pandas Series). Entries
            that are not strings (NaN / None) are dropped before vectorizing.
        n: N-gram size; only n-grams of exactly this length are counted.
        top_k: Maximum number of n-grams to show.
        title: Optional figure title. Defaults to ``'Top {n}-grams'``; pass a
            custom one to tell several corpora apart.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when no vocabulary can
            be built (e.g. every document is missing or only stop words).
    """
    documents = [doc for doc in text_series if isinstance(doc, str)]
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english',
                               max_features=top_k)
    X = vectorizer.fit_transform(documents)
    words = vectorizer.get_feature_names_out()
    counts = X.sum(axis=0).A1

    # Feature names are not ordered by frequency, so sort ascending by count:
    # barh draws the first item at the bottom, leaving the most frequent
    # n-gram at the top of the chart.
    order = counts.argsort()

    plt.figure(figsize=(10, 6))
    plt.barh(words[order], counts[order])
    plt.title(title if title is not None else f'Top {n}-grams')
    plt.show()

# Compare bigrams in resumes vs job postings (resumes are plotted first)
for corpus_text in (resume_df['Resume_str'], job_posts_df['JobDescription']):
    plot_top_ngrams(corpus_text, n=2)

In [None]:
def count_keywords_in_series(text_series, domain_keywords):
    """Count occurrences of every domain keyword across a Series of text.

    Each keyword is matched case-insensitively as a whole word or phrase:
    it must not be directly preceded or followed by a word character.
    Every occurrence is counted, so one document can contribute several hits.

    Args:
        text_series: pandas Series of documents. Missing entries (NaN / None)
            contribute zero matches.
        domain_keywords: Mapping of domain name -> iterable of keywords.

    Returns:
        Tuple ``(domain_counts, keyword_counts)``:
            domain_counts: dict of domain -> total matches over all of its
                keywords (0 when nothing matched).
            keyword_counts: Counter keyed by ``(domain, keyword)``; only
                keywords with at least one match are present.
    """
    domain_counts = {domain: 0 for domain in domain_keywords}
    keyword_counts = Counter()

    for domain, keywords in domain_keywords.items():
        for kw in keywords:
            # Lookarounds instead of \b: \b needs a word character on one side,
            # so keywords such as 'c++', 'c#' or '.net' could never match.
            # For plain alphanumeric keywords the two forms are equivalent.
            pattern = rf'(?<!\w){re.escape(kw)}(?!\w)'
            # Sum counts across all rows. NaN rows are skipped by sum(), and
            # int() normalises the numpy/float scalar to a plain Python int.
            count = int(text_series.str.count(pattern, flags=re.IGNORECASE).sum())
            if count > 0:
                domain_counts[domain] += count
                keyword_counts[(domain, kw)] += count

    return domain_counts, keyword_counts

# Keyword tallies for each corpus
# NOTE(review): job postings are read from 'job_text' here, while the other
# cells in this notebook read 'JobDescription' — confirm this column exists
# and is the intended one.
resume_domain_counts, resume_kw_counts = count_keywords_in_series(
    resume_df['Resume_str'], domain_keywords
)
job_domain_counts, job_kw_counts = count_keywords_in_series(
    job_posts_df['job_text'], domain_keywords
)

# One row per domain, ordered by how often its keywords appear in resumes
domain_names = list(domain_keywords.keys())
domain_summary = pd.DataFrame({
    'domain': domain_names,
    'resume_keyword_count': [resume_domain_counts[name] for name in domain_names],
    'job_keyword_count': [job_domain_counts[name] for name in domain_names],
})
domain_summary = domain_summary.sort_values(by='resume_keyword_count', ascending=False)

# Optional: detailed keyword-level breakdown, one row per (domain, keyword) pair
resume_kw_df = pd.DataFrame(
    list(resume_kw_counts.items()), columns=['(domain, keyword)', 'resume_count']
)
job_kw_df = pd.DataFrame(
    list(job_kw_counts.items()), columns=['(domain, keyword)', 'job_count']
)




In [None]:
# Show the 30 most frequent keywords for each corpus
print("=== Resume Keyword Counts (Top 30) ===")
top_resume_keywords = resume_kw_df.sort_values('resume_count', ascending=False).head(30)
display(top_resume_keywords)

print("=== Job Keyword Counts (Top 30) ===")
top_job_keywords = job_kw_df.sort_values('job_count', ascending=False).head(30)
display(top_job_keywords)



In [None]:

print("=== Domain-Level Keyword Summary ===")
print(domain_summary, "\n")

print("=== Sample Keyword-Level Breakdown (Top 10) ===")
# Limit the output to the 10 rows the heading promises instead of dumping
# the entire keyword table.
print(resume_kw_df.sort_values('resume_count', ascending=False).head(10))

In [None]:
# Outer join keeps keywords that appear in only one corpus; the count they
# lack on the other side is filled with 0.
kw_compare = pd.merge(resume_kw_df, job_kw_df, how='outer', on='(domain, keyword)')
kw_compare = kw_compare.fillna(0)
kw_compare = kw_compare.assign(
    difference=kw_compare['resume_count'] - kw_compare['job_count']
)

# Show top 20 overrepresented in resumes (largest resume-minus-job difference first)
ranked_by_difference = kw_compare.sort_values('difference', ascending=False)
ranked_by_difference.head(20)
