In [None]:
!pip install pdfplumber gspread gspread-dataframe


In [180]:
import pdfplumber
import pandas as pd
from google.colab import auth
from google.auth import default
import gspread
from gspread_dataframe import set_with_dataframe
import re


In [181]:
# 1. Authenticate to access Google Sheets API
# Run Colab's user-authentication step before asking for credentials.
auth.authenticate_user()
# google.auth.default() returns a pair; only the first element (the credentials) is kept.
creds, _ = default()
# Module-level gspread client; process_resumes_and_write_to_sheet uses it to create the spreadsheet.
client = gspread.authorize(creds)


In [185]:
# 2. Helper function to extract text and layout information
def extract_text_from_pdf(pdf_file):
    """Collect every word pdfplumber finds in a PDF, in page order.

    Args:
        pdf_file: The PDF to read; passed straight to ``pdfplumber.open``.

    Returns:
        list: The entries returned by ``page.extract_words()`` for each page,
        concatenated into one flat list. In the sample output each entry is a
        dict with keys such as ``'text'``, ``'x0'``, ``'x1'``, ``'top'`` and
        ``'bottom'``.
    """
    with pdfplumber.open(pdf_file) as document:
        # Flatten page -> words into a single list while the PDF is still open.
        return [
            token
            for page in document.pages
            for token in page.extract_words()
        ]


In [187]:
resume_template = "Resume1.pdf"
# Preview only the first few words: the full list is long and contains the
# candidate's personal details, which should not be left in saved cell output.
extract_text_from_pdf(resume_template)[:5]

[{'text': 'Samira',
  'x0': 234.632568,
  'x1': 301.767168,
  'top': 52.902000000000044,
  'doctop': 52.902000000000044,
  'bottom': 70.90200000000004,
  'upright': True,
  'height': 18.0,
  'width': 67.13460000000003,
  'direction': 'ltr'},
 {'text': 'Alipour',
  'x0': 306.33196799999996,
  'x1': 377.363568,
  'top': 52.902000000000044,
  'doctop': 52.902000000000044,
  'bottom': 70.90200000000004,
  'upright': True,
  'height': 18.0,
  'width': 71.03160000000003,
  'direction': 'ltr'},
 {'text': 'Phone',
  'x0': 142.264632,
  'x1': 174.108408,
  'top': 72.42048,
  'doctop': 72.42048,
  'bottom': 83.46047999999996,
  'upright': True,
  'height': 11.039999999999964,
  'width': 31.84377599999999,
  'direction': 'ltr'},
 {'text': '|',
  'x0': 177.146616,
  'x1': 180.01701599999998,
  'top': 72.42048,
  'doctop': 72.42048,
  'bottom': 83.46047999999996,
  'upright': True,
  'height': 11.039999999999964,
  'width': 2.8703999999999894,
  'direction': 'ltr'},
 {'text': 'samiraalipour1989@gma

In [194]:
# 3. Extend the resume reader to handle multiple PDFs
# PDFs processed by the final cell, in this order.
resume_templates = [
    "Resume1.pdf",
    "Resume2.pdf",
]

def extract_text_from_multiple_pdfs(pdf_files):
    """Extract the words of several PDFs, keyed by the file they came from.

    Delegates the per-file work to ``extract_text_from_pdf`` so the
    page/word extraction logic lives in one place.

    Args:
        pdf_files: Iterable of PDF files; each entry is passed to
            ``extract_text_from_pdf`` and also used as the dictionary key.

    Returns:
        dict: Maps each entry of ``pdf_files`` to the list of words extracted
        from it, in input order. If the same entry appears twice, the later
        extraction overwrites the earlier one.
    """
    return {pdf_file: extract_text_from_pdf(pdf_file) for pdf_file in pdf_files}


In [190]:
# 4. Keywords that mark the start of each resume section.
# extract_resume_sections switches section when a word contains one of these
# (case-sensitive substring match).
summary_keywords = [
    "About",
    "Summary",
    "Profile",
    "Objective",
    "Overview",
]
education_keywords = [
    "Education",
    "Degree",
    "University",
    "School",
]
profession_keywords = [
    "Experience",
    "Work",
    "Employment",
    "Job",
    "Position",
]
skills_keywords = [
    "Skills",
    "Tools",
]


In [191]:
# 5. Split each resume into sections
def extract_resume_sections(text_data):
    """Split the words of one resume into contact fields and free-text sections.

    The words are joined into a single string. The first email, phone,
    LinkedIn and GitHub matches are pulled out with regexes and removed from
    the text, as are any remaining ``http...`` / ``www....`` tokens. Of the
    words that are left, the first two are taken as the name and the third as
    the title (a purely positional heuristic). Every later word is appended to
    the section that is current at that point: parsing starts in "Summary" and
    switches whenever a word contains one of the module-level section
    keywords (the keyword word itself is kept in the new section).

    Args:
        text_data: Iterable of word dicts, each with a ``'text'`` key (the
            format returned by ``extract_text_from_pdf``).

    Returns:
        dict: Keys "Name", "Title", "Email", "Phone", "LinkedIn", "GitHub",
        "Summary", "Education", "Profession" and "Skills". Every value is a
        whitespace-stripped string, empty when nothing was found. Input with
        fewer than two usable words yields an empty or one-word "Name"
        instead of raising.
    """
    section_dict = {
        "Name": "",
        "Title": "",
        "Email": "",
        "Phone": "",
        "LinkedIn": "",
        "GitHub": "",
        "Summary": "",
        "Education": "",
        "Profession": "",
        "Skills": "",
    }

    # Combine all tokens into one content block for easier regex matching.
    content = " ".join(word['text'] for word in text_data)

    contact_patterns = {
        "Email": r'\S+@\S+',
        "Phone": r'\+?\d[\d\s-]{7,}',
        "LinkedIn": r'linkedin\.com/\S+',
        "GitHub": r'github\.com/\S+',
    }

    # Search every pattern against the untouched text first, so that removing
    # one contact detail cannot change what another pattern finds.
    contact_matches = {
        field: re.search(pattern, content)
        for field, pattern in contact_patterns.items()
    }
    for field, match in contact_matches.items():
        if match:
            section_dict[field] = match.group()
            # Drop the value from the text so it does not leak into a section.
            content = content.replace(match.group(), '')

    # Remove any remaining URLs so they do not pollute the summary.
    content = re.sub(r'http\S+', '', content)
    content = re.sub(r'www\.\S+', '', content)

    words = content.split()

    # Positional heuristic: first two words are the name, the third the title.
    # Slicing keeps this safe when the PDF yielded fewer than two words.
    section_dict["Name"] = " ".join(words[:2])
    if len(words) > 2:
        section_dict["Title"] = words[2]

    # Checked in this order; the first group with a matching keyword wins.
    keyword_groups = (
        ("Education", education_keywords),
        ("Profession", profession_keywords),
        ("Skills", skills_keywords),
        ("Summary", summary_keywords),
    )

    current_section = "Summary"
    for word in words[3:]:
        for section, keywords in keyword_groups:
            if any(keyword in word for keyword in keywords):
                current_section = section
                break

        # Keep stray phone/email/URL tokens out of the summary.
        if current_section == "Summary" and (
            re.match(r'\+?\d[\d\s-]{7,}', word) or '@' in word or 'http' in word
        ):
            continue

        section_dict[current_section] += word + " "

    # Clean up leading/trailing whitespace (e.g. the trailing space added above).
    for key in section_dict:
        section_dict[key] = section_dict[key].strip()

    return section_dict


In [192]:
# 6. Write the extracted sections to a Google Sheet
def _quote_sheet_string(value):
    """Render ``value`` as a double-quoted string literal for a Sheets formula."""
    # Inside a formula string literal, a double quote is written as two double
    # quotes; without this, a quote in PDF-extracted text would break the formula.
    return '"' + str(value).replace('"', '""') + '"'


def process_resumes_and_write_to_sheet(pdf_files):
    """Parse each resume PDF and write one row per resume to a new Google Sheet.

    For every file the words are extracted with
    ``extract_text_from_multiple_pdfs`` and split into sections with
    ``extract_resume_sections``. A new spreadsheet named
    'Resume Extracted Sections' is then created through the module-level
    ``client`` and the rows are written to its first worksheet with
    ``set_with_dataframe``.

    Columns written: Name, Email, Phone, LinkedIn, GitHub, Summary, Education,
    Profession, Skills. The "Title" field returned by
    ``extract_resume_sections`` is not written.

    Args:
        pdf_files: Iterable of PDF files to process.

    Returns:
        None. The result is the side effect of creating and filling the sheet.
    """
    headers = ["Name", "Email", "Phone", "LinkedIn", "GitHub", "Summary", "Education", "Profession", "Skills"]

    # Extract the words of every resume, keyed by file.
    text_data_all_resumes = extract_text_from_multiple_pdfs(pdf_files)

    rows = []
    for text_data in text_data_all_resumes.values():
        sections = extract_resume_sections(text_data)

        # Write the phone as ="<phone>" so the sheet treats it as text rather
        # than evaluating it as a number or expression.
        phone_number = sections.get("Phone", "")
        if phone_number:
            phone_number = f'={_quote_sheet_string(phone_number)}'

        # Turn profile URLs into labelled hyperlinks; leave the cell empty when absent.
        linkedin_url = sections.get("LinkedIn", "")
        linkedin_hyperlink = (
            f'=HYPERLINK({_quote_sheet_string(linkedin_url)}, "LinkedIn")' if linkedin_url else ""
        )
        github_url = sections.get("GitHub", "")
        github_hyperlink = (
            f'=HYPERLINK({_quote_sheet_string(github_url)}, "GitHub")' if github_url else ""
        )

        rows.append([
            sections.get("Name", ""),
            sections.get("Email", ""),
            phone_number,
            linkedin_hyperlink,
            github_hyperlink,
            sections.get("Summary", ""),
            sections.get("Education", ""),
            sections.get("Profession", ""),
            sections.get("Skills", ""),
        ])

    df = pd.DataFrame(rows, columns=headers)

    # Create a new Google Sheet and write the header row plus data rows to its first worksheet.
    sheet = client.create('Resume Extracted Sections')
    worksheet = sheet.get_worksheet(0)
    set_with_dataframe(worksheet, df)



In [196]:
# 7. Execute the process
# Runs the whole pipeline over `resume_templates`: extract the words of each PDF,
# split them into sections, and write one row per resume to a newly created Google Sheet.
process_resumes_and_write_to_sheet(resume_templates)
