In [8]:
# Import necessary libraries
import os
import pickle
import pandas as pd
import docx2txt
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize spacy and lemmatizer
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()

# Set paths
resume_folder = 'resumes/'
job_description_path = 'resumes/job_description.txt'  # Assuming the correct path

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ''
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:  # Using direct iteration over pages
            text += page.extract_text()  # Updated method to extract text
        if text.strip() == '':  # If no text extracted, fall back to OCR
            try:
                images = convert_from_path(pdf_path)
                for image in images:
                    text += pytesseract.image_to_string(image)
            except Exception as e:
                print(f"Error during OCR processing of {pdf_path}: {e}")
    return text

# Function to extract text from Word documents
def extract_text_from_word(doc_path):
    try:
        return docx2txt.process(doc_path)
    except Exception as e:
        print(f"Error reading {doc_path}: {e}")
        return ''

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Read and preprocess job description
with open(job_description_path, 'r') as file:
    job_description = file.read()
preprocessed_job_description = preprocess_text(job_description)

# Process resumes
resume_data = []
for filename in os.listdir(resume_folder):
    filepath = os.path.join(resume_folder, filename)
    if filename.endswith('.pdf'):
        text = extract_text_from_pdf(filepath)
    elif filename.endswith('.docx') or filename.endswith('.doc'):
        text = extract_text_from_word(filepath)
    else:
        print(f"Unsupported file format: {filename}")
        continue
    preprocessed_text = preprocess_text(text)
    resume_data.append({
        'Candidate': filename,
        'Raw Text': text,
        'Preprocessed Text': preprocessed_text
    })

# Save preprocessed data for modeling
resume_df = pd.DataFrame(resume_data)
resume_df.to_pickle('env/preprocessed_resumes.pkl')
with open('env/preprocessed_job_description.pkl', 'wb') as f:
    pickle.dump(preprocessed_job_description, f)

print("Data loading and preprocessing completed successfully.")


[nltk_data] Downloading package punkt to /Users/ospc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ospc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ospc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unsupported file format: job_description.txt
Data loading and preprocessing completed successfully.
