In [31]:
# pip install google-generativeai

In [1]:
import os
import json
import google.generativeai as genai
import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload

In [33]:
# Load the Gemini API key from the environment so the secret never lives in the notebook.
# NOTE: the key that used to be written on this line has been exposed and should be revoked.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=_gemini_api_key)

In [34]:
# Create the Gemini model handle that process_resume() sends its requests through
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [35]:
# Path to the service account credentials file.
# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable to point at your own key file;
# the literal below is only the fallback that keeps the original setup working.
SERVICE_ACCOUNT_FILE = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "G:\\GAURAV\\python\\gen-lang-client-0832638864-811a793cddac.json",
)

# Authenticate with Google API using service account (read-only Drive scope)
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=["https://www.googleapis.com/auth/drive.readonly"]
)

# Build the Drive API client
drive_service = build('drive', 'v3', credentials=credentials)

# Folder ID of the folder containing your resumes.
# Set the DRIVE_FOLDER_ID environment variable to use a different folder without editing the notebook.
FOLDER_ID = os.environ.get(
    "DRIVE_FOLDER_ID",
    '1hmElAZW2syLblzejdkoxs2G7ODjK3gfU',  # Replace with your Google Drive folder ID
)

In [36]:
# Local working locations
resumes_folder_path = "resumes_folder"  # downloaded resumes are written here
output_excel = "output/resumes_batch.xlsx"  # Path to output Excel file

# Create the download folder first, then the folder that will hold the Excel file
for required_dir in (resumes_folder_path, os.path.dirname(output_excel)):
    os.makedirs(required_dir, exist_ok=True)

# Batch Processing

In [37]:
# Function to list all files in the folder
def list_files_in_folder(folder_id):
    """Return every non-trashed file whose parent is the given Drive folder.

    The Drive API returns results one page at a time, so this keeps requesting
    pages until no ``nextPageToken`` is returned. A single request would
    silently drop everything past the first page.

    Args:
        folder_id: ID of the Google Drive folder to list.

    Returns:
        A list of dicts, each holding the ``id`` and ``name`` of one file.
        The list is empty when the folder has no matching files.
    """
    query = f"'{folder_id}' in parents and trashed = false"
    files = []
    page_token = None
    while True:
        results = drive_service.files().list(
            q=query,
            # nextPageToken must be requested explicitly once `fields` is restricted
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        files.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return files

In [38]:
# Function to download a file using its file ID
def download_file(file_id, file_name, download_folder):
    """Download one Drive file into ``download_folder`` and print progress.

    The destination file is opened in a ``with`` block so the handle is closed
    even when a chunk request raises part-way through the download.

    Args:
        file_id: Drive ID of the file to fetch.
        file_name: Name to give the local copy inside ``download_folder``.
        download_folder: Existing local directory that receives the file.

    Raises:
        Whatever ``open`` or the chunked downloader raises; errors are not
        swallowed here.
    """
    file_path = os.path.join(download_folder, file_name)
    request = drive_service.files().get_media(fileId=file_id)

    with open(file_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)

        done = False
        while not done:
            status, done = downloader.next_chunk()
            print(f"Download {file_name}: {int(status.progress() * 100)}%.")

# Generate prompt for accuracy and efficiency

In [39]:
def _extract_json_text(response_text):
    """Return the JSON payload embedded in a model reply, or None if there is none.

    A ```json fenced block is preferred. When the reply has no such fence, the
    text between the first '{' and the last '}' is used, so a bare JSON object
    (with or without surrounding prose) is still picked up.
    """
    fence_open = '```json'
    start_index = response_text.find(fence_open)
    if start_index != -1:
        start_index += len(fence_open)
        end_index = response_text.find('```', start_index)
        if end_index == -1:
            print("Closing ``` not found in response.")
            return None
        return response_text[start_index:end_index].strip()

    # No fenced block: fall back to the outermost brace pair.
    first_brace = response_text.find('{')
    last_brace = response_text.rfind('}')
    if first_brace != -1 and last_brace > first_brace:
        return response_text[first_brace:last_brace + 1]

    print("```json not found in response.")
    return None


#  Function to process resume and extract information using generative AI
def process_resume(pdf_path):
    """Upload a resume PDF to the generative model and return the extracted fields.

    Args:
        pdf_path: Local path of the PDF to upload.

    Returns:
        The parsed JSON object as a dict, or None when the model returns no
        candidates, no content parts, no JSON payload, invalid JSON, or JSON
        that is not an object.
    """
    # Upload the resume to the generative model
    sample_pdf = genai.upload_file(pdf_path, mime_type="application/pdf")

    prompt = (
        "Extract the following details from the document: "
        "1. Name\n"
        "2. Contact details (as in the resume)\n"
        "3. University\n"
        "4. Year of Study\n"
        "5. Course\n"
        "6. Discipline\n"
        "7. CGPA/Percentage/GPA\n"
        "8. Key Skills\n"
        "9. Generative AI Experience Score (1-3 scale, where 1 = Exposed, 2 = Hands-on, 3 = Worked on advanced areas such as Agentic RAG, Evals, etc.)\n"
        "10. AI/ML Experience Score (1-3 scale, similar to above)\n"
        # The trailing newline keeps the final instruction from being glued onto item 11.
        "11. Supporting Information: Divide into categories such as Certifications, Internships, Projects.\n"
        "Return the details in a structured JSON format."
    )

    # Get AI model response for resume
    response = model.generate_content([prompt, sample_pdf])

    if not response.candidates:
        print("No candidates found in response.")
        return None

    parts = response.candidates[0].content.parts
    if not parts:
        # Guard against indexing into an empty parts list.
        print("No content parts found in response.")
        return None
    response_text = parts[0].text

    # Attempt to parse the extracted content into JSON
    json_string = _extract_json_text(response_text)
    if json_string is None:
        return None

    try:
        extracted_data = json.loads(json_string)
    except json.JSONDecodeError as json_err:
        print(f"JSON Decode Error: {json_err}. JSON string: {json_string}")
        return None

    # Downstream code calls .get() on the result, so only a JSON object is usable.
    if not isinstance(extracted_data, dict):
        print(f"Expected a JSON object but got {type(extracted_data).__name__}.")
        return None

    print(f"Parsed Extracted Data: {extracted_data}")
    return extracted_data

In [40]:
# Function to flatten and process supporting information (e.g., certifications, internships, projects)
def flatten_data(data):
    """Return a copy of ``data`` with the supporting categories as top-level keys.

    Each of "Certifications", "Internships" and "Projects" is taken from the
    top level of ``data`` when present. Otherwise it is taken from a nested
    "Supporting Information" dict, and defaults to "" when found in neither.
    All original keys are kept.

    NOTE(review): the nested key name mirrors the wording of the extraction
    prompt; the model may spell it differently, so confirm against real output.

    Args:
        data: Dict of fields extracted for one resume.

    Returns:
        A new dict; ``data`` itself is not modified.
    """
    nested = data.get("Supporting Information")
    if not isinstance(nested, dict):
        nested = {}

    supporting_info = {
        category: data.get(category, nested.get(category, ""))
        for category in ("Certifications", "Internships", "Projects")
    }
    return {**data, **supporting_info}

# Save to Excel

In [41]:
# Function to save extracted data to Excel
def save_all_to_excel(all_data, output_file):
    """Write every extracted resume record to one Excel sheet.

    Each record is passed through ``flatten_data`` and the results are written
    as rows with the openpyxl engine. When ``all_data`` is empty, nothing is
    written and a notice is printed instead.

    Args:
        all_data: Collection of per-resume dicts.
        output_file: Path of the Excel file to create.
    """
    # Nothing collected: report it and leave without touching the file system
    if not all_data:
        print("No data to save.")
        return

    rows = list(map(flatten_data, all_data))
    pd.DataFrame(rows).to_excel(output_file, index=False, engine="openpyxl")
    print(f"All data successfully saved to {output_file}")

In [42]:
# Main process to download, process and save all resumes in the folder
def process_all_resumes():
    """Download every file in the Drive folder, extract its details and save them.

    Files are handled one at a time. A failure while downloading or processing
    one file is reported and that file is skipped, so the records already
    extracted from other files are still written to Excel at the end.
    Failures while listing the folder or saving the Excel file still propagate.
    """
    # List all files in the Google Drive folder
    files_in_folder = list_files_in_folder(FOLDER_ID)

    all_extracted_data = []
    failed_files = []

    # Download each file and process it
    for file in files_in_folder:
        file_id = file['id']
        file_name = file['name']
        print(f"Downloading: {file_name}")

        try:
            # Download the resume file to local folder
            download_file(file_id, file_name, resumes_folder_path)

            # Process the downloaded resume to extract details
            resume_path = os.path.join(resumes_folder_path, file_name)
            extracted_data = process_resume(resume_path)
        except Exception as err:
            # Keep the batch going; the results are only saved after the loop.
            print(f"Failed to process {file_name}: {err}")
            failed_files.append(file_name)
            continue

        if extracted_data:
            all_extracted_data.append(extracted_data)

    if failed_files:
        print(f"Skipped {len(failed_files)} file(s) due to errors: {failed_files}")

    # Save all extracted data to Excel after processing all resumes
    save_all_to_excel(all_extracted_data, output_excel)

In [43]:
# Run the process: list the Drive folder, download and parse each file, then write the Excel file
process_all_resumes() 

Downloading: Andrey Kurenkov.pdf
Download Andrey Kurenkov.pdf: 100%.
Parsed Extracted Data: {'Name': 'Andrey Kurenkov', 'Contact Details': {'Phone': '678-900-4326', 'Email': 'andreyvkurenkov@gmail.com', 'Website': 'www.andreykurenkov.com'}, 'University': ['Stanford University', 'Georgia Institute of Technology'], 'Year of Study': {'Stanford University': 'September 2017 - Present', 'Georgia Institute of Technology': 'August 2011 - May 2015'}, 'Course': {'Stanford University': 'M.S. in Computer Science with focus in AI', 'Georgia Institute of Technology': 'Dual major: B.S. in Electrical Engineering, B.S. in Computer Science with Research Option'}, 'Discipline': 'Computer Science', 'CGPA/Percentage/GPA': {'Stanford University': '3.87', 'Georgia Institute of Technology': 'CS 4.0, Overall 3.88'}, 'Key Skills': ['Python', 'Java', 'C', 'C++', 'R', 'MATLAB/Octave', 'LaTeX', 'Numpy', 'Scikit-learn', 'Pandas', 'ROS', 'Tensorflow', 'Docker', 'Deep Learning', 'Robotics', 'Data Science', 'AI'], 'Ge