In [69]:
import os
import re
import requests
import sys
import json
import google.generativeai as genai
import pandas as pd
from pdfminer.high_level import extract_text
from docx import Document

In [70]:
# Function to preprocess the extracted text
def preprocess_text(text):
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove any non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    return text

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    return extract_text(file_path)

# Function to extract text from a DOCX file
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

# Function to parse the extracted text
def parse_extracted_text(text):
    details = {}
    lines = text.strip().split('\n')
    for line in lines:
        if ':**' in line:
            key, value = line.split(':**', 1)
            key = key.replace('* **', '').strip()
            value = value.replace('**', '').strip()
            details[key] = value
    return details


In [71]:
# Directory containing the resumes
directory = "Resumes/"

# Initialize a list to hold all extracted details
all_extracted_details = []


In [72]:
# Configure Google Generative AI
genai.configure(api_key="YOUR_API_KEY")
model = genai.GenerativeModel(model_name='gemini-1.5-flash')

In [73]:
import os
import pandas as pd
import io

# Initialize an empty list to store all extracted details
all_extracted_details = []

# Loop through each file in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)

    # Check if the file is a PDF or DOCX
    if filename.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif filename.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    else:
        print(f"Unsupported file type for file: {filename}. Skipping.")
        continue

    # Preprocess the extracted text
    processed_text = preprocess_text(text)

    prompt = f"""
    Extract the following information from the resume: 
    - Name
    - Email
    - Phone
    - Date of Birth (DOB)
    - College
    - City
    - Stream of Work
    - Top Skills (as a single line string, separated by commas)
    - Experience

    If any field is unavailable, use 'NA'. Present the results in this uniform format:
    {{
        "Name": "<Name>",
        "Email": "<Email>",
        "Phone": "<Phone>",
        "DOB": "<DOB>",
        "College": "<College>",
        "City": "<City>",
        "Stream of Work": "<Stream of Work>",
        "Top Skills": "<Skill1>, <Skill2>, <Skill3>, <Skill4>, <Skill5>, <Skill6>, <Skill7>, <Skill8>, <Skill9>, <Skill10>",
        "Experience": "<Experience>"
    }}
    """

    # Create a StringIO object to capture the model's response
    captured_output = io.StringIO()
    
    # Generate response from the model
    response = model.generate_content([processed_text, prompt])
    
    # Capture the model response
    print(response.text.strip(), file=captured_output)  # Capture output in the StringIO object
    captured_response_text = captured_output.getvalue()  # Get the captured text
    print(f"Raw model response for {filename}:\n{captured_response_text}\n")  # Print the raw response
    
    # Initialize default extracted details
    extracted_details = {
        "Name": "NA",
        "Email": "NA",
        "Phone": "NA",
        "DOB": "NA",
        "College": "NA",
        "City": "NA",
        "Stream of Work": "NA",
        "Top Skills": "NA",  # Store as a single string
        "Experience": "NA"
    }
    
    # Only proceed if the captured response text is not empty
    if captured_response_text:
        # Split the captured text by lines and manually extract values
        lines = captured_response_text.strip().splitlines()
        for line in lines:
            if "Name" in line:
                extracted_details["Name"] = line.split(":")[-1].strip().strip('"')
            elif "Email" in line:
                extracted_details["Email"] = line.split(":")[-1].strip().strip('"')
            elif "Phone" in line:
                extracted_details["Phone"] = line.split(":")[-1].strip().strip('"')
            elif "DOB" in line:
                extracted_details["DOB"] = line.split(":")[-1].strip().strip('"')
            elif "College" in line:
                extracted_details["College"] = line.split(":")[-1].strip().strip('"')
            elif "City" in line:
                extracted_details["City"] = line.split(":")[-1].strip().strip('"')
            elif "Stream of Work" in line:
                extracted_details["Stream of Work"] = line.split(":")[-1].strip().strip('"')
            elif "Top Skills" in line:
                # Extract the skills as a single string
                extracted_details["Top Skills"] = line.split(":")[-1].strip().strip('"')
            elif "Experience" in line:
                extracted_details["Experience"] = line.split(":")[-1].strip().strip('"')
    
    # Append the extracted details to the list
    all_extracted_details.append(extracted_details)


Unsupported file type for file: .ipynb_checkpoints. Skipping.
Raw model response for Chitturi Prasad.pdf:
```json
{
    "Name": "Chitturi Prasad",
    "Email": "chprasad@kluniversity.in",
    "Phone": "9700674515",
    "DOB": "NA",
    "College": "KL University",
    "City": "NA",
    "Stream of Work": "Assistant Professor",
    "Top Skills": "Java, JSP, HTML, CSS, JavaScript, C, C++, SQL, Python, Programming",
    "Experience": "Working as Assistant Professor in KL University from July 2019.\nWorked as Assistant Professor in Gudlavalleru Engineering College, Gudlavalleru from August 2018 to June 2019.\nWorked as Assistant Professor in Priyadarshini Institute of Technology and Science for Women, Chintha- lapudi from July 2017 to July 2018."
}
```


Unsupported file type for file: extracted_resume_details.csv. Skipping.
Raw model response for JATINVARLYANI.pdf:
```json
{
    "Name": "Jatin Varlyani",
    "Email": "varlyanijatin88@gmail.com",
    "Phone": "+91 8806502484",
    "DOB": "NA

In [74]:
# Convert the list of extracted details into a DataFrame
df = pd.DataFrame(all_extracted_details)

# Display the DataFrame
print(df)

# Optionally, save the DataFrame to a CSV file
df.to_csv("Resumes/Sample/extracted_resume_details.csv", index=False)

                 Name                             Email             Phone  \
0   Chitturi Prasad",        chprasad@kluniversity.in",      9700674515",   
1    Jatin Varlyani",       varlyanijatin88@gmail.com",  +91 8806502484",   
2       Junhao Dong",         junhao.dong96@gmail.com",  (678) 343-1817",   
3      Kumar Saurav",  kumarsauravsmart2010@gmail.com",  +91-8420538839",   
4  Rachelle Beaudry",       hello@reallygreatsite.com",    123-456-7890",   
5    SAGAR BHANDARI",     sbhandari1@stcloudstate.edu",              NA",   

    DOB                                            College           City  \
0  NA",  Working as Assistant Professor in KL Universit...           NA",   
1  NA",         Veermata Jijabai Technological Institute",       Mumbai",   
2  NA",                          Northeastern University",   Boston, MA",   
3  NA",  Indian Institute of Engineering Science and Te...      Kolkata",   
4  NA",  University of Finance and Management, City Col...     Any City",  