### <span style="color:orange">Task 1</span>

Create a descriptive CSV file from the plain-text samples with the following columns:

- POSITION_TITLE: Job title.
- resume number: The ID under which the resume was saved.
- QUALIFICATIONS: The qualifications that each applicant had for the job.
- EDUCATION_TYPE: Type of education that the applicant had (master, bachelor, diploma, …).
- SCHOOL_TYPE: The type of school the applicant studied in (university, college, high school).
- EDUCATION_MAJOR: The major the applicant studied at this school.
- EXPERIENCE_LENGTH: The number of years of experience needed to apply for this job.
- AWARDS: Whether the applicant had an award or not.
- COMMUNITY SERVICE: Whether the applicant had done community service work or not.
- VOLUNTEERING: Whether the applicant had done any volunteering or not.
- DRIVERS_LICENSE_AVAILABILITY: Whether a driver's license is available for this applicant or not.
- SKILLS_COUNT: Number of skills that the applicant had.
- SKILLS: The skills that the applicant had.
- Languages: The number of languages for each applicant.

In [2]:
import pandas as pd
import re

In [2]:
def extract_position_title(resume_text):
    """
    Extracts the position title from the given resume text.

    The title is taken to be the first pair of non-whitespace tokens that
    are separated by exactly one whitespace character.

    Args:
        resume_text (str): The resume text to extract the position title from.

    Returns:
        str: The extracted position title or 'No info' if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return 'No info'
    match = re.search(r'(\S+\s\S+)', resume_text)
    return match.group(1) if match else 'No info'

In [3]:
def extract_qualifications(resume_text):
    """
    Extracts the qualifications or highlights section from the given resume text.

    A section starts at a "Qualification(s)" or "Highlight(s)" heading
    (matched case-insensitively) and runs up to the next digit. The heading
    itself and surrounding whitespace are removed from each result.

    Args:
        resume_text (str): The resume text to extract the qualifications from.

    Returns:
        list: A list of extracted qualifications or ['no info'] if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return ['no info']

    sections = re.findall(r"Qualifications? \D+|Highlights?\D+", resume_text, re.IGNORECASE)
    if not sections:
        return ['no info']

    # Flags are given to compile(): passing re.IGNORECASE as the 4th
    # positional argument of re.sub() would be interpreted as `count`.
    heading = re.compile(r"Qualifications?|Highlights?", re.IGNORECASE)
    # Every section begins with its heading, so count=1 strips only that.
    return [heading.sub("", section, count=1).strip() for section in sections]

In [4]:
def extract_education_type(resume_text):
    """
    Determines the highest education type achieved based on the given resume text.

    Degrees are checked from highest to lowest and the first one that is
    mentioned wins. Full words are matched case-insensitively, while short
    abbreviations (MS, MA, BA, AS, ...) are matched case-sensitively so that
    ordinary words such as "as" are not mistaken for a degree.

    Args:
        resume_text (str): The resume text to extract the education type from.

    Returns:
        str: The identified education type or 'No info' if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return 'No info'

    # (degree, case-insensitive word pattern, case-sensitive abbreviation pattern)
    degree_patterns = (
        ('PhD', r'\bph\.?d\b', None),
        ('Doctor', r'\b(?:doctor|doctore)\b', None),
        ('Master', r"\b(?:master|master's|masters)\b", r'\b(?:MBA|MSc|MS|MA)\b'),
        (
            'Bachelor',
            r'\b(?:bachelor|bachelors|bachelor\.s|Sign\s*Language)\b',
            r'\b(?:BA|B\.A|BS|B\.S|BSc|BAS|BSBA|BBA|BSBM)\b',
        ),
        ('Diploma', r'\b(?:diploma|Associate|College)\b', r'\b(?:AS|AA)\b'),
        ('High School', r'\bhigh\sschool\b', r'\bGED\b'),
    )

    for degree, word_pattern, abbreviation_pattern in degree_patterns:
        if re.search(word_pattern, resume_text, re.IGNORECASE):
            return degree
        if abbreviation_pattern and re.search(abbreviation_pattern, resume_text):
            return degree
    return 'No info'

In [5]:
def extract_school_type(resume_text):
    """
    Extracts the school type from the given resume text.

    The first mention of "university", "college" or "high school" in the
    text (in any letter case) decides the result, which is always returned
    in lower case so the column holds a consistent set of categories.

    Args:
        resume_text (str): The resume text to extract the school type from.

    Returns:
        str: The extracted school type ('university', 'college', 'high school') or 'No info' if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return 'No info'
    match = re.search(r"\b(university|college|high school)\b", resume_text, re.IGNORECASE)
    return match.group().lower() if match else 'No info'

In [6]:
def extract_education_major(resume_text):
    """
    Extracts the education major or concentration from the given resume text.

    Looks for "Education Major:", "Education Concentration:" or
    "Education Degree:" labels (case-insensitive) and captures the rest of
    the line that follows the colon.

    Args:
        resume_text (str): The resume text to extract the education major from.

    Returns:
        list: A list of extracted education majors or 'No info' if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return 'No info'
    major_matches = re.findall(
        r"Education\s*(?:Major|Concentration|Degree)\s*:\s*(.*)",
        resume_text,
        re.IGNORECASE,
    )
    if not major_matches:
        # Kept as a plain string (not a list) for backward compatibility.
        return 'No info'
    return [major.strip() for major in major_matches]

In [7]:
def extract_experience_length(resume_text, current_year=2023):
    """
    Extracts the total years of experience from the given resume text.

    Explicit statements such as "5 years", "3+ years" or "10 plus years" are
    preferred; when at least one is present their numbers are summed. If no
    such statement exists, date ranges of the form "MM/YYYY to MM/YYYY" (and
    "Experience MM/YYYY to Current") are used instead and the year
    differences are summed.

    Args:
        resume_text (str): The resume text to extract the experience length from.
        current_year (int): Year substituted for an end date of "Current".
            Defaults to 2023, the value previously hard-coded.

    Returns:
        list: A single-element list holding the total number of years, or an
            empty list when no experience information was found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return []

    stated = re.findall(
        r"(\d{1,2})\W?\s?years?|(\d{1,2})\s?plus\s?years?", resume_text
    )
    if stated:
        # Exactly one of the two groups is non-empty for every match.
        years = [int(plain or plus) for plain, plus in stated]
    else:
        ranges = re.findall(
            r"Experience\s+\d{1,2}/(\d{4})\s+to\s+(Current)\b|\d{1,2}/(\d{4})\s+to\s+\d{1,2}/(\d{4})",
            resume_text,
        )
        years = []
        for open_start, open_end, closed_start, closed_end in ranges:
            start = open_start or closed_start
            end = open_end or closed_end
            end_year = current_year if end == 'Current' else int(end)
            years.append(end_year - int(start))

    return [sum(years)] if years else []

In [8]:
def extract_awards(resume_text):
    """
    Extracts the awards section from the given resume text.

    Every occurrence of "award", "awards" or "awarded" (case-insensitive) is
    captured together with the rest of its line, or the following line when
    the keyword is immediately followed by a newline. The leading keyword
    (and an optional comma) and surrounding whitespace are removed.

    Args:
        resume_text (str): The resume text to extract the awards from.

    Returns:
        list: A list of extracted awards or ['no info'] if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return ['no info']

    mentions = re.findall(r"AWARDS?\n?.+", resume_text, re.IGNORECASE)
    if not mentions:
        return ['no info']

    # Flags are given to compile(): passing re.IGNORECASE as the 4th
    # positional argument of re.sub() would be interpreted as `count`.
    keyword = re.compile(r"award(?:ed|s)?,?", re.IGNORECASE)
    # Every mention begins with the keyword, so count=1 strips only that and
    # leaves award names such as "Employee Award" intact.
    return [keyword.sub("", mention, count=1).strip() for mention in mentions]

In [9]:
def extract_community_service(resume_text):
    """
    Determines if the given resume text mentions community service or volunteering.

    Matches "community service" as well as "volunteer" and its inflections
    ("volunteers", "volunteered", "volunteering"), case-insensitively.

    Args:
        resume_text (str): The resume text to check for community service or volunteering.

    Returns:
        str: 'Yes' if community service or volunteering is mentioned, 'No' otherwise.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return "No"
    pattern = r"\b(?:community service|volunteer(?:s|ed|ing)?)\b"
    return "Yes" if re.search(pattern, resume_text, re.IGNORECASE) else "No"

In [10]:
def extract_volunteering(resume_text):
    """
    Determines if the given resume text mentions volunteering.

    Matches "volunteer" and its inflections ("volunteers", "volunteered",
    "volunteering"), case-insensitively.

    Args:
        resume_text (str): The resume text to check for volunteering.

    Returns:
        str: 'Yes' if volunteering is mentioned, 'No' otherwise.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return "No"
    pattern = r"\bvolunteer(?:s|ed|ing)?\b"
    return "Yes" if re.search(pattern, resume_text, re.IGNORECASE) else "No"

In [11]:
def extract_drivers_license_availability(resume_text):
    """
    Determines if the given resume text mentions a driver's license or driving license.

    Accepts "driver license", "drivers license", "driver's license" (with a
    straight or curly apostrophe) and "driving license", in either the
    "license" or "licence" spelling, case-insensitively.

    Args:
        resume_text (str): The resume text to check for driver's license or driving license.

    Returns:
        str: 'Yes' if a driver's license or driving license is mentioned, 'No' otherwise.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return "No"
    pattern = r"\b(?:driver['’]?s?|driving)\s+licen[sc]e\b"
    return "Yes" if re.search(pattern, resume_text, re.IGNORECASE) else "No"

In [12]:
def extract_skills(resume_text):
    """
    Extracts the skills section from the given resume text.

    Each occurrence of the word "Skills" (case-insensitive, optionally
    followed by a colon) starts a section that runs up to the next digit.
    The captured text is returned as-is, without trimming.

    Args:
        resume_text (str): The resume text to extract the skills from.

    Returns:
        list: A list of extracted skills or ['no info'] if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return ['no info']
    # `\D+` is equivalent to the former `[\D.]+`: a dot is already a non-digit.
    skills = re.findall(r"\bSkills\b:?\s*(\D+)", resume_text, flags=re.IGNORECASE)
    return skills if skills else ['no info']

In [13]:
def get_skills_count(skills_text):
    """
    Calculates the count of skills extracted from the resume text.

    Only the first entry of `skills_text` is inspected. It is split on
    commas and every non-blank fragment counts as one skill.

    Args:
        skills_text (list): The list of skills extracted from the resume.

    Returns:
        int: The count of skills, or 0 when the list is empty or holds the
            'no info' placeholder.
    """
    if not skills_text or skills_text[0] == 'no info':
        return 0
    # Split on bare commas so "a,b" and "a, b" are counted alike, and skip
    # blank fragments produced by trailing or doubled commas.
    return sum(1 for skill in skills_text[0].split(',') if skill.strip())

In [14]:
def extract_languages(resume_text):
    """
    Extracts the languages mentioned in the given resume text.

    Language names are matched as whole words, case-insensitively. Each
    language is reported once, in title case, in order of first appearance.

    Args:
        resume_text (str): The resume text to extract the languages from.

    Returns:
        str: A comma-separated string of languages found or 'English' if not found.
    """
    # `re` raises TypeError on non-strings (e.g. NaN from an empty CSV cell).
    if not isinstance(resume_text, str):
        return 'English'
    found = re.findall(
        r'\b(?:English|Spanish|French|German|Mandarin Chinese|Arabic|Russian|Portuguese|Japanese|Italian)\b',
        resume_text,
        flags=re.IGNORECASE,
    )
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_languages = dict.fromkeys(name.title() for name in found)
    return ", ".join(unique_languages) if unique_languages else 'English'

In [15]:
def extract_resume_info(resume_text, resume_number):
    """
    Extracts various resume information from the given resume text.

    Args:
        resume_text (str): The resume text to extract the information from.
        resume_number (int): The identifier of the resume.

    Returns:
        dict: A dictionary containing the extracted resume information.
            The dictionary has the following fields:
            - 'POSITION_TITLE': Position title extracted from the resume.
            - 'resume number': Resume identifier.
            - 'QUALIFICATIONS': Qualifications or highlights extracted from the resume.
            - 'EDUCATION_TYPE': Highest education type achieved.
            - 'SCHOOL_TYPE': School type (university, college, or high school).
            - 'EDUCATION_MAJOR': Education major or concentration.
            - 'EXPERIENCE_LENGTH': Total years of experience.
            - 'AWARDS': Awards received.
            - 'COMMUNITY SERVICE': Community service involvement (Yes/No).
            - 'VOLUNTEERING': Volunteering experience (Yes/No).
            - 'DRIVERS_LICENSE_AVAILABILITY': Driver's license availability (Yes/No).
            - 'SKILLS_COUNT': Count of skills mentioned in the resume.
            - 'SKILLS': List of skills extracted from the resume.
            - 'Languages': Languages mentioned in the resume.
    """
    # A missing resume (e.g. NaN from an empty CSV cell) is treated as empty
    # text so one bad row cannot abort the whole extraction run.
    if not isinstance(resume_text, str):
        resume_text = ''

    # Extracted once because both SKILLS and SKILLS_COUNT depend on it.
    skills = extract_skills(resume_text)

    return {
        'POSITION_TITLE': extract_position_title(resume_text),
        'resume number': resume_number,
        'QUALIFICATIONS': extract_qualifications(resume_text),
        'EDUCATION_TYPE': extract_education_type(resume_text),
        'SCHOOL_TYPE': extract_school_type(resume_text),
        'EDUCATION_MAJOR': extract_education_major(resume_text),
        'EXPERIENCE_LENGTH': extract_experience_length(resume_text),
        'AWARDS': extract_awards(resume_text),
        'COMMUNITY SERVICE': extract_community_service(resume_text),
        'VOLUNTEERING': extract_volunteering(resume_text),
        'DRIVERS_LICENSE_AVAILABILITY': extract_drivers_license_availability(resume_text),
        'SKILLS_COUNT': get_skills_count(skills),
        'SKILLS': skills,
        'Languages': extract_languages(resume_text),
    }

In [17]:
def main(input_path='resume updated.csv', output_path='extracted_resume_info.csv'):
    """
    The main function that reads the resumes, extracts the resume information,
    and saves the results to a CSV file.

    Args:
        input_path (str): CSV file to read; it must provide the columns
            'Resume_str' (resume text) and 'ID' (resume identifier).
        output_path (str): CSV file the extracted information is written to.
    """
    # Read the resumes from the CSV file
    resumes_df = pd.read_csv(input_path)

    # Extract resume information for each resume and create a list of dictionaries
    resume_info_list = [
        extract_resume_info(resume_text, resume_number)
        for resume_text, resume_number in zip(resumes_df['Resume_str'], resumes_df['ID'])
    ]

    # Create a new DataFrame from the list of resume information
    resume_info_df = pd.DataFrame(resume_info_list)

    # Save the DataFrame to a new CSV file
    resume_info_df.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()

In [3]:
# Attach each resume's job category to the extracted features.

# Read the original resumes and pick the column to carry over
df1 = pd.read_csv('resume updated.csv')
column_to_add = df1['Category']

# Read the previously extracted resume information
df = pd.read_csv('extracted_resume_info.csv')

# Rows are paired by position, so a stale or truncated extraction file would
# otherwise yield missing or misassigned categories without any warning.
if len(df) != len(column_to_add):
    raise ValueError(
        "'extracted_resume_info.csv' and 'resume updated.csv' have different row counts"
    )

# Add the column to the second DataFrame
df['CATEGORY'] = column_to_add
# Save the updated DataFrame to a new CSV file
df.to_csv('Ex_updated.csv', index=False)
