In [None]:
# 1. Imports & Paths
import os, sys, re, pandas as pd
sys.path.append(os.path.join(os.pardir, "utils"))
from utils import preprocess_text

# Data locations, all relative to the notebook's working directory.
# DATA_DIR receives the cleaned outputs; OLD_DATA_DIR holds the raw inputs.
DATA_DIR = os.path.join(os.pardir, "data")
# Derive the raw-data folder from DATA_DIR (instead of a hard-coded "data/original"
# string) so the separator is always chosen by os.path.join and the two
# directories cannot drift apart.
OLD_DATA_DIR = os.path.join(DATA_DIR, "original")
resume_path = os.path.join(OLD_DATA_DIR, "resumes.csv")
jobs_path   = os.path.join(OLD_DATA_DIR, "jobs.csv")

In [None]:
# 2. Load raw data
# Read both raw CSV files, then report their sizes and preview the first rows.
resume_df = pd.read_csv(resume_path)
job_posts_df = pd.read_csv(jobs_path)

for shape_label, loaded_frame in (
    ("Resume dataset shape:", resume_df),
    ("Job postings dataset shape:", job_posts_df),
):
    print(shape_label, loaded_frame.shape)

for loaded_frame in (resume_df, job_posts_df):
    display(loaded_frame.head())

In [None]:
# 3. Pick text columns safely (fallbacks)
def pick_text_col(df, candidates):
    """Return the first name in ``candidates`` that is a column of ``df``.

    Args:
        df: Object exposing a ``columns`` attribute (a pandas DataFrame here).
        candidates: Column names to try, in priority order.

    Returns:
        The first candidate found in ``df.columns``.

    Raises:
        KeyError: If none of the candidates is a column of ``df``.
    """
    not_found = object()  # sentinel, so any candidate value can be a valid match
    chosen = next((name for name in candidates if name in df.columns), not_found)
    if chosen is not_found:
        raise KeyError(f"None of the expected columns {candidates} found in df columns: {df.columns.tolist()}")
    return chosen

# Resolve the text column of each frame from a prioritized list of known names.
# NOTE(review): "job_text" is only created by a later cell of this notebook, so
# unless the raw CSV already ships such a column the result of the job lookup
# depends on cell execution order — confirm this is intended.
resume_candidates = ["Resume_str", "Resume", "resume_text", "text"]
job_candidates = ["job_text", "JobDescription", "job_desc", "description", "text"]

resume_text_col = pick_text_col(resume_df, resume_candidates)
job_text_col = pick_text_col(job_posts_df, job_candidates)


In [None]:
# 4. Inspect missing values in the job postings
# This cell only reports which columns contain nulls (and their dtypes);
# nothing is removed from the frame here.
null_counts = job_posts_df.isnull().sum()
condition = null_counts != 0
columnlist = null_counts[condition].index.tolist()
print("Columns with missing values:", columnlist)
print(job_posts_df[columnlist].dtypes)

In [None]:
# Job-posting columns that are discarded before the text fields are combined.
columns_to_drop = [
    'AnnouncementCode',
    'Term',
    'Eligibility',
    'Audience',
    'StartDate',
    'Duration',
    'OpeningDate',
    'Deadline',
    'Notes',
    'Attach',
]
# errors='ignore' skips any listed column that is not present in the frame.
job_posts_df = job_posts_df.drop(columns=columns_to_drop, errors='ignore')
remaining_columns = job_posts_df.columns.tolist()
print("Remaining columns after dropping:", remaining_columns)


In [None]:
# Keep only job postings that have a value in every one of these columns.
columns_to_check = [
    'Title',
    'JobDescription',
    'JobRequirment',
    'Company',
    'Location',
    'RequiredQual',
    'Salary',
    'AboutC',
]
job_posts_df = job_posts_df.dropna(subset=columns_to_check)
shape_after_dropna = job_posts_df.shape
print("Shape after dropping null rows:", shape_after_dropna)

In [None]:
# De-duplicate both datasets (exact duplicate rows only), then report how many
# duplicates are still present as a sanity check.
resume_df = resume_df.drop_duplicates()
job_posts_df = job_posts_df.drop_duplicates()

remaining_job_duplicates = job_posts_df.duplicated().sum()
remaining_resume_duplicates = resume_df.duplicated().sum()

print("Remaining duplicate job posts:", remaining_job_duplicates)
print("Remaining duplicate resumes:", remaining_resume_duplicates)

In [None]:
def has_html(text):
    """Tell whether ``text`` contains something that looks like an HTML tag.

    Args:
        text: Any cell value; only ``str`` values are inspected.

    Returns:
        True if ``text`` is a string containing a ``<...>`` span on a single
        line, False otherwise (including for every non-string value).
    """
    if not isinstance(text, str):
        return False
    return re.search(r'<.*?>', text) is not None

# Collect every resume column in which at least one cell contains an HTML-like tag.
columns_with_html = []
for column_name in resume_df.columns:
    if resume_df[column_name].apply(has_html).any():
        columns_with_html.append(column_name)
print("Columns with HTML tags in resumes:", columns_with_html)

def clean_html(text):
    """Remove HTML-style tags (``<...>`` spans) from ``text``.

    Args:
        text: Any cell value.

    Returns:
        ``text`` with every ``<...>`` span removed when it is a string.
        Non-string values (e.g. NaN/None from empty CSV cells) are returned
        unchanged instead of raising ``TypeError``, mirroring the type guard
        in ``has_html``.
    """
    if not isinstance(text, str):
        return text
    return re.sub('<[^<]+?>', '', text)

# Strip HTML tags from the resume text, overwriting the column in place.
resume_text_without_tags = resume_df['Resume_str'].apply(clean_html)
resume_df['Resume_str'] = resume_text_without_tags
print("HTML cleaned from Resume_str column.")


In [None]:
# Discard the old index left over from the earlier row filtering.
job_posts_df = job_posts_df.reset_index(drop=True)

# Combine the relevant text fields into a single 'job_text' column. Each field is
# prefixed with its label, missing values become empty strings, and the labelled
# segments are joined with a single space.
labelled_fields = [
    ("Description: ", "JobDescription"),
    ("Requirements: ", "JobRequirment"),
    ("Qualifications: ", "RequiredQual"),
    ("About Company: ", "AboutC"),
]

combined_job_text = None
for field_label, field_name in labelled_fields:
    segment = field_label + job_posts_df[field_name].fillna('')
    if combined_job_text is None:
        combined_job_text = segment
    else:
        combined_job_text = combined_job_text + " " + segment

job_posts_df["job_text"] = combined_job_text

print("Combined text column 'job_text' created successfully.")
preview_columns = ["Title", "job_text"]
display(job_posts_df[preview_columns].head())

In [None]:
# Run the shared preprocess_text helper (imported from utils) over the raw text
# column of each dataset and store the result in a new "*_clean" column.
for target_frame, raw_column, clean_column in (
    (resume_df, "Resume_str", "Resume_clean"),
    (job_posts_df, "job_text", "job_text_clean"),
):
    target_frame[clean_column] = target_frame[raw_column].apply(preprocess_text)

In [None]:
# 7. Save cleaned outputs
# Both frames are written as CSV files into DATA_DIR, without the index column.
out_resume, out_jobs = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("resumes_cleaned.csv", "jobs_cleaned.csv")
)

for cleaned_frame, destination in ((resume_df, out_resume), (job_posts_df, out_jobs)):
    cleaned_frame.to_csv(destination, index=False)

print("Saved:", out_resume, "and", out_jobs)