In [16]:
import nltk
import ssl

# Fix SSL issues
# NOTE(review): this swaps the process-wide default HTTPS context for an
# unverified one, so certificate checks are disabled for every later HTTPS
# request made through `ssl`'s default context, not only the NLTK downloads.
# Kept as a deliberate best-effort workaround; confirm it is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The private hook is unavailable: keep the default (verifying) context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download NLTK data
# Packages stored under `tokenizers/`; every other package is looked up under `corpora/`.
_NLTK_TOKENIZER_PACKAGES = ('punkt', 'punkt_tab')


def _nltk_package_installed(package):
    """Return True when `package` can be located in the NLTK data path.

    Both the unpacked directory and the `.zip` archive are checked: some
    packages (e.g. wordnet, omw-1.4) may be present only as a zip file, and a
    lookup of the bare directory name then raises LookupError even though the
    package is installed.
    """
    category = 'tokenizers' if package in _NLTK_TOKENIZER_PACKAGES else 'corpora'
    for resource in (f'{category}/{package}', f'{category}/{package}.zip'):
        try:
            nltk.data.find(resource)
        except LookupError:
            continue
        return True
    return False


for package in ['stopwords', 'wordnet', 'punkt', 'omw-1.4', 'punkt_tab']:
    if _nltk_package_installed(package):
        print(f"✓ {package} already downloaded")
    else:
        print(f"Downloading {package}...")
        nltk.download(package, quiet=False)

# Verify downloads
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
print(f"✓ Stopwords loaded: {len(stopwords.words('english'))} words")
# Run one real lemmatization so the message below is only printed when the
# WordNet data actually loads (a missing corpus raises LookupError here).
WordNetLemmatizer().lemmatize("resumes")
print("✓ Lemmatizer initialized")


✓ stopwords already downloaded
Downloading wordnet...
✓ punkt already downloaded
Downloading omw-1.4...
✓ punkt_tab already downloaded
✓ Stopwords loaded: 198 words
✓ Lemmatizer initialized


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [1]:
# 1. Imports & Paths
import os
import re
import sys

import pandas as pd

# Make the sibling `utils` directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed again.
UTILS_DIR = os.path.join(os.pardir, "utils")
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)
# NOTE(review): `utils` is project-local; presumably resolved via UTILS_DIR — confirm.
from utils import preprocess_text

# Cleaned outputs are written to DATA_DIR; raw inputs are read from OLD_DATA_DIR.
DATA_DIR = os.path.join(os.pardir, "data")
# Built from DATA_DIR so the separator is chosen by os.path.join instead of a
# hard-coded "/" inside the path component.
OLD_DATA_DIR = os.path.join(DATA_DIR, "original")
resume_path = os.path.join(OLD_DATA_DIR, "resumes.csv")
jobs_path   = os.path.join(OLD_DATA_DIR, "jobs.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 2. Load raw data
# Read both CSV inputs from the paths configured in the imports/paths cell.
resume_df, job_posts_df = pd.read_csv(resume_path), pd.read_csv(jobs_path)

# Report (rows, columns) for each frame.
print("Resume dataset shape:", resume_df.shape)
print("Job postings dataset shape:", job_posts_df.shape)

# Preview the first rows of each frame, resumes first.
display(resume_df.head(), job_posts_df.head())

Resume dataset shape: (2484, 4)
Job postings dataset shape: (19001, 24)


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


In [3]:
# 3. Pick text columns safely (fallbacks)
def pick_text_col(df, candidates):
    """Return the first entry of `candidates` that is a column of `df`.

    Candidates are tried in the order given.

    Raises:
        KeyError: if none of the candidates is present in `df.columns`.
    """
    not_found = object()  # sentinel, so any candidate value can be a valid match
    chosen = next((name for name in candidates if name in df.columns), not_found)
    if chosen is not_found:
        raise KeyError(f"None of the expected columns {candidates} found in df columns: {df.columns.tolist()}")
    return chosen

# Resolve the text column of each dataset; the first candidate that exists wins.
resume_text_col = pick_text_col(
    df=resume_df,
    candidates=["Resume_str", "Resume", "resume_text", "text"],
)
job_text_col = pick_text_col(
    df=job_posts_df,
    candidates=["job_text", "JobDescription", "job_desc", "description", "text"],
)


In [4]:
# 4. Inspect missing values (this cell only reports; nothing is dropped here)
# Boolean Series indexed by column name: True where the column has at least one null.
condition = job_posts_df.isnull().any()
columnlist = condition[condition].index.tolist()
print("Columns with missing values:", columnlist)
# dtypes of the affected columns
print(job_posts_df[columnlist].dtypes)

Columns with missing values: ['Title', 'Company', 'AnnouncementCode', 'Term', 'Eligibility', 'Audience', 'StartDate', 'Duration', 'Location', 'JobDescription', 'JobRequirment', 'RequiredQual', 'Salary', 'ApplicationP', 'OpeningDate', 'Deadline', 'Notes', 'AboutC', 'Attach']
Title               object
Company             object
AnnouncementCode    object
Term                object
Eligibility         object
Audience            object
StartDate           object
Duration            object
Location            object
JobDescription      object
JobRequirment       object
RequiredQual        object
Salary              object
ApplicationP        object
OpeningDate         object
Deadline            object
Notes               object
AboutC              object
Attach              object
dtype: object


In [5]:
# Columns removed from the job postings before further cleaning.
columns_to_drop = [
    'AnnouncementCode',
    'Term',
    'Eligibility',
    'Audience',
    'StartDate',
    'Duration',
    'OpeningDate',
    'Deadline',
    'Notes',
    'Attach',
]
# errors='ignore' skips names that are already absent, so re-running the cell is harmless.
job_posts_df = job_posts_df.drop(columns_to_drop, axis=1, errors='ignore')
print("Remaining columns after dropping:", list(job_posts_df.columns))


Remaining columns after dropping: ['jobpost', 'date', 'Title', 'Company', 'Location', 'JobDescription', 'JobRequirment', 'RequiredQual', 'Salary', 'ApplicationP', 'AboutC', 'Year', 'Month', 'IT']


In [6]:
# A job post is kept only when every one of these columns is non-null.
columns_to_check = [
    'Title',
    'JobDescription',
    'JobRequirment',
    'Company',
    'Location',
    'RequiredQual',
    'Salary',
    'AboutC',
]
job_posts_df = job_posts_df.dropna(axis=0, how='any', subset=columns_to_check)
print("Shape after dropping null rows:", job_posts_df.shape)

Shape after dropping null rows: (5459, 14)


In [7]:
# Remove duplicate rows from both datasets (the first occurrence of each row is kept)
job_posts_df, resume_df = (
    job_posts_df.drop_duplicates(keep="first"),
    resume_df.drop_duplicates(keep="first"),
)

# Sanity check: count the rows that are still flagged as duplicates.
print("Remaining duplicate job posts:", job_posts_df.duplicated(keep="first").sum())
print("Remaining duplicate resumes:", resume_df.duplicated(keep="first").sum())

Remaining duplicate job posts: 0
Remaining duplicate resumes: 0


In [8]:
def has_html(text):
    """Report whether `text` is a string containing a `<...>` span.

    Non-string values always yield False. The pattern's `.` does not cross
    line breaks, so the opening `<` and closing `>` must be on the same line.
    """
    if not isinstance(text, str):
        return False
    return re.search(r'<.*?>', text) is not None

# Names of the resume columns in which at least one cell looks like HTML.
columns_with_html = [
    name
    for name, values in resume_df.items()
    if values.apply(has_html).any()
]
print("Columns with HTML tags in resumes:", columns_with_html)

def clean_html(text):
    """Remove `<...>` tag spans from `text`.

    A tag is a `<`, followed by one or more characters other than `<`, up to
    the nearest `>`; it may span line breaks.

    Args:
        text: value to clean. Non-string values (e.g. None or NaN from a
            missing cell) are returned unchanged instead of raising
            TypeError, mirroring the type guard in `has_html`.

    Returns:
        The string with every tag span removed, or the original value when it
        is not a string.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'<[^<]+?>', '', text)

# Strip tag spans from the Resume_str column only; other columns are left untouched.
resume_df['Resume_str'] = resume_df['Resume_str'].map(clean_html)
print("HTML cleaned from Resume_str column.")


Columns with HTML tags in resumes: ['Resume_str', 'Resume_html']
HTML cleaned from Resume_str column.


In [9]:
# Renumber rows 0..n-1, discarding the old index.
job_posts_df = job_posts_df.reset_index(drop=True)

# Combine all relevant text fields into a single column for job descriptions
# Each pair is (label prefix, source column); the order here is the order in the text.
labelled_columns = [
    ("Description: ", "JobDescription"),
    ("Requirements: ", "JobRequirment"),
    ("Qualifications: ", "RequiredQual"),
    ("About Company: ", "AboutC"),
]
# One labelled Series per field; nulls become empty strings so the label is still emitted.
labelled_parts = [
    prefix + job_posts_df[column].fillna('')
    for prefix, column in labelled_columns
]
# Join the labelled fields row-wise with a single space between them.
job_posts_df["job_text"] = labelled_parts[0].str.cat(labelled_parts[1:], sep=" ")

print("Combined text column 'job_text' created successfully.")
display(job_posts_df[["Title", "job_text"]].head())

Combined text column 'job_text' created successfully.


Unnamed: 0,Title,job_text
0,Secretary / office helper,"Description: Secretary office helper, Typing t..."
1,Cheif Accountant,Description: Keeping complete accounting of th...
2,Sales representative,Description: Distribution of cosmetic and laun...
3,Research Analyst,Description: Japonica Intersectoral is seeking...
4,Procurement Specialist,Description: - Developing Procurement Plans (b...


In [17]:
# Apply preprocess_text (imported from the project-local `utils` module) to every
# resume and every combined job text, storing the results in new *_clean columns.
# NOTE(review): preprocess_text's behaviour is not visible in this notebook;
# presumably it relies on the NLTK resources set up in the NLTK download cell,
# so that cell should be run first — confirm.
resume_df["Resume_clean"] = resume_df["Resume_str"].apply(preprocess_text)
job_posts_df["job_text_clean"] = job_posts_df["job_text"].apply(preprocess_text)

In [18]:
# 7. Save cleaned outputs
# Both files are written into DATA_DIR.
out_resume, out_jobs = (
    os.path.join(DATA_DIR, filename)
    for filename in ("resumes_cleaned.csv", "jobs_cleaned.csv")
)
# index=False leaves the row index out of the CSV files.
resume_df.to_csv(path_or_buf=out_resume, index=False)
job_posts_df.to_csv(path_or_buf=out_jobs, index=False)

print("Saved:", out_resume, "and", out_jobs)

Saved: ..\data\resumes_cleaned.csv and ..\data\jobs_cleaned.csv
