In [1]:
!pip install transformers
!pip install torch
!pip install nltk
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install pandas
!pip install pymupdf

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import fitz  # PyMuPDF
import os

# Define el directorio de los CVs
cv_dir = "/content/drive/MyDrive/IA/CVs"

# Lista todos los archivos PDF en el directorio
cv_files = [os.path.join(cv_dir, file) for file in os.listdir(cv_dir) if file.endswith('.pdf')]

def extract_text_from_pdf(file_path):
    text = ""
    with fitz.open(file_path) as pdf:
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)
            text += page.get_text()
    return text

# Extrae texto de todos los archivos PDF
cv_texts = [(file, extract_text_from_pdf(file)) for file in cv_files]

In [5]:
import spacy

nlp_spacy = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp_spacy(text)
    return " ".join([token.text for token in doc if not token.is_stop])

In [3]:
cv_files

['/content/drive/MyDrive/IA/CVs/1724171279_CV5.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171599_CV1.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171234_CV3.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171168_CV2.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171333_CV6.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171258_CV4.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171378_CV8.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171354_CV7.pdf',
 '/content/drive/MyDrive/IA/CVs/1724171407_CV9.pdf']

In [4]:
# Valores de referencia para la salida
# Cada diccionario debe tener una clave 'file' que coincida con el nombre del archivo del CV
ground_truth = [
    {
        "file": cv_files[0],
        "name": "IMMANUEL ABRAHAM MAHARDHIKA",
        "contact": ["+62 8577 7124 773", "dhikayudano@gmail.com"],
        "experience": 7,
        "ai_education": "N"
    },
    {
        "file": cv_files[1],
        "name": "M O H P U J I J U N A E D I",
        "contact": ["+6282 3 3 147 2499", "m e@j u n a e . id"],
        "experience": 6,
        "ai_education": "S"
    },
    {
        "file": cv_files[2],
        "name": "Michael Smith",
        "contact": ["", "indeed.com/r/falicent/140749dace5dc26f"],
        "experience": 10,
        "ai_education": "S"
    },
    {
        "file": cv_files[3],
        "name": "POWELL FINWOOD",
        "contact": ["+123-456-7890", "linkedin.com/in/name"],
        "experience": 3,
        "ai_education": "N"
    },
    {
        "file": cv_files[4],
        "name": "Alice Clark",
        "contact": None,
        "experience": 20,
        "ai_education": "S"
    },
    {
        "file": cv_files[5],
        "name": "Dyah Hediyati S.Kom",
        "contact": ["+62 85287404232", "dyahhediyati@gmail.com"],
        "experience": 10,
        "ai_education": "N"
    },
    {
        "file": cv_files[6],
        "name": "DR.SANTOSH KAKADE",
        "contact": ["09850671175", "drsantoshkakade@gmail.com"],
        "experience": 20,
        "ai_education": "N"
    },
    {
        "file": cv_files[7],
        "name": "Ringgi Cahyo Dwiputra",
        "contact": ["085157115062", "ringgicahyo@gmail.com"],
        "experience": 4,
        "ai_education": "N"
    },
    {
        "file": cv_files[8],
        "name": "Loren Shevitz",
        "contact": ["773-665-1234", "loren@shevitz.org"],
        "experience": 21,
        "ai_education": "N"
    }
]

In [6]:
import re
import json
from transformers import pipeline
import spacy

# Carga modelos NLP
nlp_ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english")
nlp_spacy = spacy.load("en_core_web_sm")


def find_ground_truth(file_name):
    for gt in ground_truth:
        if gt["file"] == file_name:
            return gt
    return {}

def calculate_score(predicted, actual):
    if not isinstance(actual, list):
      actual = [actual]
    elif actual is None:
      return 0.0

    if predicted in actual:
        return 1.0

    return 0.0

def extract_name(text):
    entities = nlp_ner(text)
    names = [entity['word'] for entity in entities if entity['entity'] == 'B-PER']
    return " ".join(names) if names else None

def extract_contact(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    phone_pattern = r'\b\d{10,15}\b'
    email = re.search(email_pattern, text)
    phone = re.search(phone_pattern, text)
    return email.group() if email else phone.group() if phone else None

def extract_experience(text):
    experience_keywords = ["years of experience", "experience", "years"]
    experience = None
    for keyword in experience_keywords:
        match = re.search(r'\d+', text)
        if match:
            experience = int(match.group())
            break
    return experience

def extract_ai_education(text):
    ai_keywords = ["artificial intelligence", "machine learning", "deep learning", "AI"]
    text_lower = text.lower()
    return "S" if any(keyword.lower() in text_lower for keyword in ai_keywords) else "N"

def create_result_json(file, text):
    gt = find_ground_truth(file)
    name = extract_name(text)
    contact = extract_contact(text)
    experience = extract_experience(text)
    ai_education = extract_ai_education(text)

    result = {
        "name": name,
        "contact": contact,
        "experience": experience,
        "ai_education": ai_education,
        "name_score": calculate_score(name, gt.get('name')),
        "contact_score": calculate_score(contact, gt.get('contact')),
        "experience_score": calculate_score(experience, gt.get('experience')),
        "ai_education_score": calculate_score(ai_education, gt.get('ai_education'))
    }
    return json.dumps(result, indent=4)

# Aplicar las funciones a los textos extraídos de los CVs
results = [(file, create_result_json(file, text)) for file, text in cv_texts]

# Mostrar resultados (opcional)
for result in results:
    print("########################################")
    print(result[0] + ":")
    print(result[1])
    print()
    print("########################################")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


########################################
/content/drive/MyDrive/IA/CVs/1724171279_CV5.pdf:
{
    "name": null,
    "contact": "dhikayudano@gmail.com",
    "experience": 11,
    "ai_education": "S",
    "name_score": 0.0,
    "contact_score": 1.0,
    "experience_score": 0.0,
    "ai_education_score": 0.0
}

########################################
########################################
/content/drive/MyDrive/IA/CVs/1724171599_CV1.pdf:
{
    "name": null,
    "contact": "6282331472499",
    "experience": 6,
    "ai_education": "S",
    "name_score": 0.0,
    "contact_score": 0.0,
    "experience_score": 1.0,
    "ai_education_score": 1.0
}

########################################
########################################
/content/drive/MyDrive/IA/CVs/1724171234_CV3.pdf:
{
    "name": null,
    "contact": null,
    "experience": 140749,
    "ai_education": "S",
    "name_score": 0.0,
    "contact_score": 0.0,
    "experience_score": 0.0,
    "ai_education_score": 1.0
}

###############