# Extract Required Experience using Regex

## Load Data

In [1]:
import pandas as pd

# Base name (without extension) of the workbook; a later cell reuses it
# to write the annotated sheet back under the same name.
fileName = 'Euraxess_GNSS_08-01-2024'
excel_file = f'../Results/{fileName}.xlsx'

# Read the workbook into a DataFrame for the cells below.
df = pd.read_excel(excel_file)

### Prepare Data

In [2]:
# Free-text columns that may mention required experience. Not every export
# contains all of them, so only the ones actually present are used.
columns = ['Title', 'OfferDescription', 'Requirements', 'Responsibilities', 'AdditionalInformation']
existing_columns = [col for col in columns if col in df.columns]

# Give 'Requirements' the same tolerance as the other columns: if it is
# missing, every row falls back to the concatenated description below
# instead of the cell failing with a KeyError.
if 'Requirements' in df.columns:
    requirements = df['Requirements'].tolist()
else:
    requirements = [None] * len(df)

# One string per row: all available text columns joined with spaces,
# skipping empty (NaN) cells.
descriptions = (
    df[existing_columns]
    .apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
    .tolist()
)

# Prefer the dedicated requirements text; when it is absent (non-string,
# e.g. NaN) or just the placeholder '-', use the combined description.
# NOTE: the name shadows the builtin `input`, but the evaluation cell reads
# this exact name, so it is kept.
input = [
    req if isinstance(req, str) and req != '-' else desc
    for req, desc in zip(requirements, descriptions)
]

## Evaluate

In [3]:
import re

# Number words recognised in addition to plain digits: 0-20 and the tens.
_WORDS_TO_NUMBERS = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7,
    'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19, 'twenty': 20,
    'thirty': 30, 'forty': 40, 'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
}


def _build_experience_regex():
    """Compile one case-insensitive regex covering every supported phrasing."""
    # A quantity is either a run of digits or one of the known number words.
    # It is the only capturing group, so match.groups() yields quantities only.
    number = r'\b(\d+|' + '|'.join(_WORDS_TO_NUMBERS) + r')\b'
    years = r'years?'

    phrases = [
        fr'{number}\s*{years}\s+of\s+experience\b',
        fr'at\s+least\s+{number}\s*{years}\s+of\s+(?:relevant\s+)?work\s+experience\b',
        fr'over\s+{number}\s*{years}\s+of\s+experience\b',
        fr'{number}\s*{years}\s+experience\b',
        fr'{number}\s*{years}\s+of\s+relevant\s+experience\b',
        fr'minimum\s+of\s+{number}\s*{years}\s+experience\b',
        fr'{number}\s*{years}\s+of\s+professional\s+experience\b',
        # Range written as "3-5" or "3 to 5". This must be an alternation:
        # a character class such as [-to] would match a single '-', 't' or 'o'.
        fr'{number}\s*(?:-|to)\s*{number}\s*{years}\s+of\s+experience\b',
        fr'from\s+{number}\s+to\s+{number}\s*{years}\s+of\s+experience\b',
        fr'between\s+{number}\s+and\s+{number}\s*{years}\s+of\s+experience\b',
        fr'{number}\s*-\s*{number}\s*{years}\s+of\s+experience\b',
        fr'{number}\s*plus\s*{years}\s+of\s+experience\b',
        fr'at\s+least\s+{number}\s*{years}\s+of\s+professional\s+experience\b',
        fr'up\s+to\s+{number}\s*{years}\s+of\s+experience\b',
        fr'{number}\s*\+\s*{years}\s+of\s+experience\b',
        fr'a\s+minimum\s+of\s+{number}\s*{years}\s+of\s+experience\b',
        fr'from\s+{number}\s*(?:-\s*)?to\s+{number}\s*{years}\s+experience\b',
        fr'from\s+{number}\s+to\s+{number}\s*{years}\s+of\s+relevant\s+experience\b',
        fr'between\s+{number}\s+and\s+{number}\s*{years}\s+of\s+relevant\s+experience\b',
        fr'{number}\s*-\s*{number}\s*{years}\s+of\s+relevant\s+experience\b',
    ]
    return re.compile('|'.join(f'(?:{phrase})' for phrase in phrases), re.IGNORECASE)


# Built once at definition time rather than on every call.
_EXPERIENCE_REGEX = _build_experience_regex()


def extract_experience(text):
    """Return the smallest number of years of experience mentioned in *text*.

    The text is scanned for phrases such as "5 years of experience",
    "at least three years of professional experience" or
    "between 2 and 4 years of experience". Quantities may be digits or one
    of the number words in ``_WORDS_TO_NUMBERS``. For ranges both bounds are
    collected, so the lower bound wins.

    Args:
        text: The text to scan. Anything that is not a ``str`` (for example
            ``None`` or a NaN float from an empty spreadsheet cell) is
            treated as containing no experience statement.

    Returns:
        The minimum quantity found across all matched phrases as an ``int``,
        or ``-1`` when no phrase matches.
    """
    if not isinstance(text, str):
        return -1

    found = []
    for match in _EXPERIENCE_REGEX.finditer(text):
        for token in match.groups():
            # Groups belonging to alternatives that did not take part are None.
            if not token:
                continue
            value = int(token) if token.isdigit() else _WORDS_TO_NUMBERS.get(token.lower())
            if value is not None:
                found.append(value)

    return min(found, default=-1)

In [4]:
import datetime

# Run the extractor once per prepared text; it returns -1 when it finds
# no experience statement.
extracted = [extract_experience(text) for text in input]

# Column values: the number of years, or the marker 'Undefined'.
requiredYears = [years if years != -1 else 'Undefined' for years in extracted]
classified = sum(years != -1 for years in extracted)

# Write the annotated sheet back to the Results folder under the same name.
df['RequiredExperience'] = requiredYears
df.to_excel(f'../Results/{fileName}.xlsx', index=False, engine='openpyxl')

# With no rows the ratio is undefined; report 0% instead of raising
# ZeroDivisionError.
percentage = (classified / len(requiredYears)) * 100 if requiredYears else 0.0
print(f"Classified: {percentage:.2f}%")
print(f"Undefined: {(100 - percentage):.2f}%")

print("\n ----------------------------- \n")
print("Document Annotated")

Classified: 2.35%
Undefined: 97.65%

 ----------------------------- 

Document Annotated
