In [None]:
import os
import pandas as pd
from pdfminer.high_level import extract_text
from docx import Document  
import win32com.client  

In [None]:
# Notebooks do not define __file__, so every path is anchored at the kernel's
# working directory. The previous expression,
# os.path.dirname(os.path.abspath("__file__")), resolved to this same
# directory because the quoted "__file__" is only a relative file name.
current_dir = os.getcwd()

# Root of the resume dataset, one level above the working directory.
dataset_dir = os.path.join(current_dir, "..", "Resume-Dataset")

resume_csv_path = os.path.join(dataset_dir, "Resume", "Resume.csv")
job_csv_path = os.path.join(dataset_dir, "training_data.csv")
resumes_dir = os.path.join(dataset_dir, "data", "data")

# os.walk() yields nothing for a missing directory, which would make the
# notebook "succeed" with zero resumes; fail early with a clear message.
if not os.path.isdir(resumes_dir):
    raise FileNotFoundError(f"Resume directory not found: {resumes_dir}")

# pd.read_csv raises FileNotFoundError itself when a CSV is missing.
resume_info = pd.read_csv(resume_csv_path)
job_descriptions = pd.read_csv(job_csv_path)

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path``.

    Delegates to ``pdfminer.high_level.extract_text``; any exception raised
    there propagates to the caller.
    """
    pdf_text = extract_text(pdf_path)
    return pdf_text

def extract_text_from_docx(docx_path):
    """Return the paragraph text of the .docx file at ``docx_path``.

    The document is opened with ``docx.Document`` and the ``text`` of each
    entry in ``doc.paragraphs`` is joined with newline characters.
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_text_from_doc(doc_path):
    """Extract the text of a legacy .doc file by automating Microsoft Word.

    A Word instance is started through COM, the document is opened, its
    ``Content.Text`` is read, and the document and Word are closed again.
    This needs Windows with Microsoft Word and pywin32 available.

    Args:
        doc_path: Path to the .doc file. It is made absolute before being
            handed to Word, since Word runs as a separate process and may
            not resolve relative paths against the notebook's directory.

    Returns:
        The document text as reported by Word's ``Content.Text``.

    Raises:
        Whatever COM error Word reports (for example when the file cannot
        be opened). Word is still shut down in that case.
    """
    word = win32com.client.Dispatch("Word.Application")
    doc = None
    try:
        # "Visible" is the documented property name; the exact casing also
        # works if an early-bound (generated) wrapper is in use.
        word.Visible = False
        doc = word.Documents.Open(os.path.abspath(doc_path))
        return doc.Content.Text
    finally:
        # Always release the document and the Word process, even when
        # opening or reading failed; otherwise WINWORD.EXE instances pile up.
        try:
            if doc is not None:
                # False == do not save changes, so no save prompt can appear.
                doc.Close(False)
        finally:
            word.Quit()

def find_resume_files(directory):
    """Recursively collect resume files below ``directory``.

    A file counts as a resume when its name ends in ``.pdf``, ``.doc`` or
    ``.docx``. The comparison ignores case, so ``CV.PDF`` is found as well.

    Args:
        directory: Root directory to search. A directory that does not
            exist simply produces an empty result (``os.walk`` yields
            nothing for it).

    Returns:
        A sorted list of paths, each built by joining the walked directory
        with the file name. Sorting makes the order reproducible between
        runs and machines.
    """
    supported_extensions = (".pdf", ".doc", ".docx")
    resume_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(directory)
        for file in files
        # str.endswith accepts a tuple of suffixes.
        if file.lower().endswith(supported_extensions)
    ]
    return sorted(resume_files)

In [None]:
resume_files = find_resume_files(resumes_dir)

# IDs are compared as strings because they are matched against file names.
resume_ids_in_csv = set(resume_info['ID'].astype(str))

# Maps a lower-case file extension to the function that extracts its text.
# Looking the extension up in lower case means "123.PDF" is handled the same
# way as "123.pdf".
extractors_by_extension = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".doc": extract_text_from_doc,
}

# resume ID (file name without extension) -> extracted text.
# Note: if two files share an ID (e.g. "1.pdf" and "1.docx"), the one
# processed later overwrites the earlier entry.
extracted_texts = {}
for resume_path in resume_files:
    resume_id, extension = os.path.splitext(os.path.basename(resume_path))

    if resume_id not in resume_ids_in_csv:
        print(f"Skipping file {resume_path} (ID not in Resume.csv)")
        continue

    extractor = extractors_by_extension.get(extension.lower())
    if extractor is None:
        print(f"Unsupported file format: {resume_path}")
        continue

    # Best effort: a single unreadable file is reported and skipped so the
    # remaining resumes are still processed.
    try:
        text = extractor(resume_path)
    except Exception as e:
        print(f"Error processing file {resume_path}: {e}")
        continue

    extracted_texts[resume_id] = text



In [None]:
# Write every extracted resume to <output_dir>/<resume ID>.txt as UTF-8,
# creating the output directory first if it does not exist yet.
output_dir = os.path.join(current_dir, "..", "outputs", "extracted_texts")
os.makedirs(output_dir, exist_ok=True)

for resume_id in extracted_texts:
    text = extracted_texts[resume_id]
    file_path = os.path.join(output_dir, f'{resume_id}.txt')
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)

print(f"Text extraction completed. {len(extracted_texts)} resumes processed and saved in '{output_dir}'.")

Text extraction completed. 2457 resumes processed and saved in 'c:\Users\karun\OneDrive\Documents\Brain Inspired AI Project\src\..\outputs\extracted_texts'.
