In [5]:
from pathlib import Path
import pandas as pd

# Folder holding the raw Kaggle CSVs. Resolved relative to the notebook's
# working directory (the same `Path('..') / 'data'` layout the later cells
# use) instead of a machine-specific absolute path, so the notebook runs on
# any checkout of the project.
DATA_RAW = Path('..') / 'data' / 'kaggle_raw'

# Locations of the two input CSV files.
resumes_path = DATA_RAW / 'clean_resume_data.csv'
jobs_path = DATA_RAW / 'jobs_dataset_with_features.csv'

# Load both datasets; read_csv raises FileNotFoundError if a file is missing.
df_res = pd.read_csv(resumes_path, encoding='utf-8')
df_jobs = pd.read_csv(jobs_path, encoding='utf-8')

print("Resumes shape:", df_res.shape)
print("Jobs shape:", df_jobs.shape)

# Show the first few rows of each frame with rich (HTML) rendering.
display(df_res.head(3))
display(df_jobs.head(3))


Resumes shape: (2484, 3)
Jobs shape: (1615940, 2)


Unnamed: 0,ID,Category,Feature
0,16852973,HR,hr administrator marketing associate hr admini...
1,22323967,HR,hr specialist hr operations summary media prof...
2,33176873,HR,hr director summary years experience recruitin...


Unnamed: 0,Role,Features
0,Social Media Manager,5 to 15 Years Digital Marketing Specialist M.T...
1,Frontend Web Developer,"2 to 12 Years Web Developer BCA HTML, CSS, Jav..."
2,Quality Control Manager,0 to 12 Years Operations Manager PhD Quality c...


In [6]:
## Convert Resume CSV to .txt files

from pathlib import Path
import pandas as pd

# Output folder for the per-resume text files, relative to the notebook's
# working directory.
ROOT = Path("..")         # if notebook is in notebooks/
RAW = ROOT / 'data'
RAW_TEXT_DIR = RAW / 'raw_resumes_text'
RAW_TEXT_DIR.mkdir(parents=True, exist_ok=True)

# Resumes whose cleaned text is shorter than this many characters are skipped.
MIN_RESUME_CHARS = 10

count = 0
for rid, raw_text in zip(df_res['ID'], df_res['Feature']):
    # NaN is truthy, so `value or ""` would turn a missing resume into the
    # literal text "nan"; test for missing values explicitly instead.
    if pd.isna(raw_text):
        continue
    # Light clean: normalise CRLF line endings and trim surrounding whitespace.
    text = str(raw_text).replace('\r\n', '\n').strip()
    if len(text) < MIN_RESUME_CHARS:
        # Skip extremely short / empty entries.
        continue
    # One file per resume, named after its ID; a repeated ID would overwrite
    # the file written for the earlier row.
    fname = RAW_TEXT_DIR / f"resume_{rid}.txt"
    fname.write_text(text, encoding='utf-8')
    count += 1

print("Saved resumes:", count, "to", RAW_TEXT_DIR)

Saved resumes: 2483 to ..\data\raw_resumes_text


In [8]:
## Convert jobs CSV to .txt files (dedupe + sample)

from pathlib import Path
import pandas as pd

# Output root for job-description text files, relative to the notebook's
# working directory.
ROOT = Path('..')
RAW = ROOT / 'data'
JDS_DIR = RAW / 'raw_jds'
JDS_DIR.mkdir(parents=True, exist_ok=True)

# Ensure df_jobs is loaded; otherwise reload (comment/uncomment as needed)
# df_jobs = pd.read_csv(RAW/'kaggle_raw'/'jobs_dataset_with_features.csv', encoding='utf-8')

# Job descriptions whose cleaned text is shorter than this are skipped.
MIN_JD_CHARS = 10

# Spaces, path separators and the other characters Windows rejects in file
# names are all replaced by '_' when a role is embedded in a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in ' /\\:*?"<>|'})


def _write_job_texts(frame, role_col, out_dir, prefix, max_role_len):
    """Write one UTF-8 .txt file per row of `frame` and return how many were written.

    Args:
        frame: DataFrame with a 'Features' column and the column named by `role_col`.
        role_col: Name of the column whose value is embedded in the file name.
        out_dir: Existing directory (Path) that receives the files.
        prefix: File-name prefix, e.g. 'jd_role' or 'jd_sample'.
        max_role_len: The role is truncated to this many characters before
            being trimmed and sanitised.

    Returns:
        Number of files written (rows with fewer than MIN_JD_CHARS characters
        of cleaned text are skipped).
    """
    written = 0
    for idx, role, features in zip(frame.index, frame[role_col], frame['Features']):
        text = str(features).replace('\r\n', '\n').strip()
        if len(text) < MIN_JD_CHARS:
            continue
        role_slug = str(role)[:max_role_len].strip().translate(_UNSAFE_FILENAME_CHARS)
        # The row's index label keeps file names unique within one frame.
        target = out_dir / f"{prefix}_{idx}_{role_slug}.txt"
        target.write_text(text, encoding='utf-8', errors='ignore')
        written += 1
    return written


# 1) Deduplicate by Role (keep first Features per Role)
dedup_dir = JDS_DIR / 'dedup_by_role'
dedup_dir.mkdir(parents=True, exist_ok=True)

# Normalize Role strings. The column is still added to df_jobs itself, as
# before, in case other cells read df_jobs['Role_clean'].
df_jobs['Role_clean'] = df_jobs['Role'].astype(str).str.strip().str.lower()

# Rows that actually carry a description; shared by both exports below.
jobs_with_text = df_jobs.dropna(subset=['Features'])
dedup = jobs_with_text.drop_duplicates(subset=['Role_clean'], keep='first')

saved = _write_job_texts(dedup, 'Role_clean', dedup_dir, 'jd_role', 80)

print("Saved dedup_by_role count:", saved, "to", dedup_dir)

# 2) Save a random sample of N rows for exploratory work
N = 10000   # adjust if you want fewer
sample_dir = JDS_DIR / f'sample_{N}'
sample_dir.mkdir(parents=True, exist_ok=True)

# Cap the sample size by the number of rows that remain AFTER dropping missing
# descriptions: sample() raises ValueError when n exceeds the frame it draws
# from. random_state fixes the draw so reruns pick the same rows.
sampled = jobs_with_text.sample(n=min(N, len(jobs_with_text)), random_state=42)

saved2 = _write_job_texts(sampled, 'Role', sample_dir, 'jd_sample', 60)

print(f"Saved sample {saved2} job descriptions to", sample_dir)

Saved dedup_by_role count: 376 to ..\data\raw_jds\dedup_by_role
Saved sample 10000 job descriptions to ..\data\raw_jds\sample_10000


In [None]:
## Verify file counts and show one sample

from pathlib import Path
ROOT = Path('..')

# Glob each output folder once and sort the result, so the counts and the
# previewed file are the same on every run regardless of directory order.
resume_files = sorted((ROOT/'data'/'raw_resumes_text').glob('*.txt'))
dedup_files = sorted((ROOT/'data'/'raw_jds'/'dedup_by_role').glob('*.txt'))
sample_files = sorted((ROOT/'data'/'raw_jds'/'sample_10000').glob('*.txt'))

print("Resumes txt count:", len(resume_files))
print("Jobs dedup_by_role count:", len(dedup_files))
print("Jobs sample count:", len(sample_files))


def _show_preview(label, path, limit=800):
    """Print the name and the first `limit` characters of `path`.

    `path` may be None (no file available), in which case a short notice is
    printed instead of raising.
    """
    if path is None:
        print(f"\n--- {label} sample filename ---\n", "(no .txt files found)")
        return
    print(f"\n--- {label} sample filename ---\n", path.name)
    print(path.read_text(encoding='utf-8')[:limit])


# show small preview of one resume & one job
sample_resume = resume_files[0] if resume_files else None
_show_preview("Resume", sample_resume)

sample_jd = sample_files[0] if sample_files else None
_show_preview("JD", sample_jd)

Resumes txt count: 2483
Jobs dedup_by_role count: 376
Jobs sample count: 10000

--- Resume sample filename ---
 resume_10001727.txt
sous chef work experience sous chef jul company name city state assisted cooks preparation green salads fruit salads pasta salads worked saut fry stations plated distributed completed dishes waiters improved accuracy filled orders changing procedure sharing tickets took inventory placed orders assisted food beverage operations front desk agent company name city state assisted property coordinator daily tasks worked hotel computer programming systems worked hr department control staffing perform employee performance evaluations handled property functions daily basis ensure best performance persistent upgrading customer service employee proficiency performance marketing property ambience income handled room reservation adjusted auditing reports received send telephone messages facsimiles fro

--- JD sample filename ---
 jd_sample_1000091_Content_Strategist.t