# Шаг 1: Конвертация резюме в JSON

In [64]:
import os
import docx
import PyPDF2

In [65]:
def extract_text_from_pdf(path):
    """Return the concatenated text of every page of the PDF at *path*.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Page texts are joined in page order with no separator between pages.
    """
    # A context manager guarantees the file handle is closed, including when
    # parsing or text extraction raises.
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract inside the ``with`` block, while the stream is still open.
        return ''.join(page.extract_text() for page in reader.pages)

def extract_text_from_doc(path):
    """Return the text of a legacy ``.doc`` file by automating Microsoft Word.

    ``win32com`` is imported lazily inside the function, so the module can
    still be imported on machines where it is not installed.

    The document and the Word application are always closed, even when
    opening or reading the document fails.
    """
    import win32com.client

    word = win32com.client.Dispatch("Word.Application")
    try:
        # NOTE(review): Word appears to resolve relative paths against its own
        # working directory rather than this process's, so pass an absolute
        # path - confirm against the Documents.Open documentation.
        doc = word.Documents.Open(os.path.abspath(path))
        try:
            return doc.Range().Text
        finally:
            doc.Close()
    finally:
        # Without this, a failure above would leave a Word process running.
        word.Quit()

def extract_text_from_docx(path):
    """Return the text of a ``.docx`` file, one paragraph per line.

    Only the paragraphs exposed by ``Document.paragraphs`` are read; their
    texts are joined with a newline.
    """
    document = docx.Document(path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def extract_text(path):
    """Extract plain text from a resume file, choosing the reader by extension.

    Supported extensions are ``.pdf``, ``.docx`` and ``.doc``, matched
    case-insensitively.

    Returns:
        The extracted text, or the string ``'Неизвестный формат файла'``
        when the extension is not supported.
    """
    # Lower-case the extension so files such as 'CV.PDF' or 'cv.Docx' are
    # recognised instead of being reported as an unknown format.
    ext = os.path.splitext(path)[1].lower()
    if ext == '.pdf':
        return extract_text_from_pdf(path)
    if ext == '.docx':
        return extract_text_from_docx(path)
    if ext == '.doc':
        return extract_text_from_doc(path)
    return 'Неизвестный формат файла'

# Шаг 2: Извлечение сущностей с использованием SpaCy

In [75]:
import spacy
import re
import json
import uuid

In [76]:
# Load the 'ru_core_news_sm' spaCy pipeline once at module level;
# extract_entities() calls it on the resume text.
nlp = spacy.load('ru_core_news_sm')

In [77]:
def extract_entities(text):
    """Split resume text into sections and pull basic personal entities.

    Sections are located with case-insensitive header regexes (Russian and
    English headers); contacts are collected from the whole text; names and
    locations come from the named entities produced by the module-level
    ``nlp`` pipeline.

    Args:
        text: Plain text of the resume.

    Returns:
        A ``(sections, entities)`` tuple of dicts.

        ``sections`` maps "education", "experience", "skills", "contact",
        "summary" and "languages" to strings (empty when nothing was found).
        "contact" holds one contact per line.

        ``entities`` maps "first_name", "last_name", "middle_name",
        "birth_date", "country" and "city" to strings (empty when unknown).
    """
    doc = nlp(text)

    # Header alternatives for every known section.
    headers = {
        "education": "Образование|Education|EDUCATION",
        "experience": "Опыт работы|Experience|EXPERIENCE",
        "skills": "Навыки|Skills|SKILLS",
        "contact": "Контакты|Contact|CONTACT",
        "summary": "Резюме|Summary|SUMMARY|О себе|About|Objective|OBJECTIVE",
        "languages": "Языки|Languages|LANGUAGES",
    }
    # Headers that terminate a section body.
    # NOTE(review): the languages header is not a terminator, so a section
    # that precedes "Языки"/"Languages" also swallows the languages text.
    # Left as is because adding it would also cut sections at phrases such
    # as "Языки программирования" - decide deliberately.
    stop_keys = ("education", "experience", "skills", "contact", "summary")

    sections = {
        "education": "",
        "experience": "",
        "skills": "",
        "contact": "",
        "summary": "",
        "languages": ""
    }

    for key in ("education", "experience", "skills", "languages", "summary"):
        stops = "|".join(headers[other] for other in stop_keys if other != key)
        # group(1) = header, group(2) = lazily matched body up to the next
        # terminating header or the end of the text.
        pattern = "(" + headers[key] + r")([\s\S]*?)(?=(" + stops + "|$))"
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            sections[key] = match.group(2).strip()

    contact_pattern = re.compile(r"(\+?\d[\d\s()-]{7,}\s*|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s*|linkedin\.com/in/[^\s]+|github\.com/[^\s]+|e-mail:\s*[^\s]+)", re.IGNORECASE)
    # The phone and e-mail alternatives can match trailing or embedded
    # whitespace, including newlines. "contact" is a newline-delimited list,
    # so collapse whitespace inside each match to keep one contact per line.
    contacts = (" ".join(found.split()) for found in contact_pattern.findall(text))
    sections["contact"] = "\n".join(contact for contact in contacts if contact)

    entities = {
        "first_name": "",
        "last_name": "",
        "middle_name": "",
        "birth_date": "",
        "country": "",
        "city": ""
    }

    # NOTE(review): the stock ru_core_news_sm model appears to emit only
    # PER/LOC/ORG labels, so the "DATE" and "GPE" checks may never fire -
    # confirm against the model's label set.
    for ent in doc.ents:
        label = ent.label_
        if label == "PER":
            name_parts = ent.text.split()
            # Only the first multi-word person entity is used, and all three
            # name fields are set together. Otherwise later persons in the
            # text would overwrite the name, and a middle name from one
            # entity could be combined with another entity's first/last name.
            if len(name_parts) >= 2 and not entities["first_name"]:
                entities["first_name"], entities["last_name"] = name_parts[:2]
                entities["middle_name"] = ' '.join(name_parts[2:])
        elif label == "DATE":
            entities["birth_date"] = ent.text
        elif label in ("GPE", "LOC"):
            # The first location fills "country", the second fills "city".
            if not entities["country"]:
                entities["country"] = ent.text
            elif not entities["city"]:
                entities["city"] = ent.text

    return sections, entities

In [78]:
def _contact_type(value):
    """Classify a stripped contact string as "phone", "email" or "link"."""
    if re.match(r"\+?\d[\d\s()-]{7,}", value):
        return "phone"
    if "@" in value:
        return "email"
    return "link"


def create_json_structure(sections, entities):
    """Assemble the resume JSON structure from parsed sections and entities.

    Args:
        sections: Dict with the string values "summary", "skills",
            "contact" (one contact per line), "experience" and "languages"
            (one ``language[: level]`` entry per line).
        entities: Dict with the string values "first_name", "last_name",
            "middle_name", "birth_date", "country" and "city".

    Returns:
        A dict with a single "resume" key. Every identifier field is a fresh
        ``uuid4`` string. Fields that are not extracted are left as empty
        strings; "about" is ``None`` when the summary is empty.
    """
    # Skip blank lines: splitting an empty contact string yields [''], which
    # used to produce a bogus empty contact item typed "link".
    contact_values = [line.strip() for line in sections["contact"].split('\n')]
    contact_items = [
        {
            "resume_contact_item_id": str(uuid.uuid4()),
            "value": value,
            "comment": "",
            "contact_type": _contact_type(value)
        }
        for value in contact_values if value
    ]

    language_items = []
    for line in sections["languages"].split('\n'):
        if not line.strip():
            continue
        # "English: B2" -> language "English", level "B2"; no colon -> no level.
        parts = line.split(':')
        language_items.append({
            "resume_language_item_id": str(uuid.uuid4()),
            "language": parts[0].strip(),
            "language_level": parts[1].strip() if len(parts) > 1 else ""
        })

    skill_lines = (skill.strip() for skill in sections["skills"].split('\n'))
    key_skills = ' '.join(skill for skill in skill_lines if skill)

    resume_data = {
        "resume": {
            "resume_id": str(uuid.uuid4()),
            "first_name": entities["first_name"],
            "last_name": entities["last_name"],
            "middle_name": entities["middle_name"],
            "birth_date": entities["birth_date"],
            "birth_date_year_only": False,
            "country": entities["country"],
            "city": entities["city"],
            "about": sections["summary"] if sections["summary"] else None,
            "key_skills": key_skills,
            "salary_expectations_amount": "",
            "salary_expectations_currency": "",
            "photo_path": "",
            "gender": "",
            "resume_name": "",
            "source_link": "",
            "contactItems": contact_items,
            # Education is not parsed into fields; a single empty placeholder
            # item is emitted.
            "educationItems": [
                {
                    "resume_education_item_id": str(uuid.uuid4()),
                    "year": "",
                    "organization": "",
                    "faculty": "",
                    "specialty": "",
                    "result": "",
                    "education_type": "",
                    "education_level": ""
                }
            ],
            # The whole experience section goes into one item's description.
            "experienceItems": [
                {
                    "resume_experience_item_id": str(uuid.uuid4()),
                    "starts": "",
                    "ends": "",
                    "employer": "",
                    "city": "",
                    "url": "",
                    "position": "",
                    "description": sections["experience"],
                    "order": ""
                }
            ],
            "languageItems": language_items
        }
    }
    return resume_data

In [79]:
def convert_resume_to_json(path):
    """Run the full pipeline for one resume file and return its JSON dict.

    The file's text is extracted, split into sections and entities, and
    the result is assembled into the resume structure.
    """
    resume_text = extract_text(path)
    parsed = extract_entities(resume_text)
    # ``parsed`` is the (sections, entities) pair, passed on positionally.
    return create_json_structure(*parsed)

In [81]:
# Path to the resume file. A raw string keeps the Windows backslashes literal
# without relying on unrecognised escape sequences such as "\Р", which newer
# Python versions warn about. The resulting value is unchanged.
resume_path = r'Examples\Резюме для 1 кейса Хакатона\Алексей Тугаенко.docx'
resume_json = convert_resume_to_json(resume_path)

# Save the JSON to a file (UTF-8, Cyrillic text kept unescaped).
json_path = 'converted_resume.json'
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(resume_json, f, ensure_ascii=False, indent=4)

print("Конвертированное резюме сохранено в", json_path)

Конвертированное резюме сохранено в converted_resume.json
