In [64]:
import pandas as pd
import numpy as np
import re
import ast
from datetime import datetime

In [65]:
df = pd.read_csv("../dataset/cleaned_resume_data.csv")
df.head()

Unnamed: 0,address,career_objective,skills,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,...,online_links,issue_dates,expiry_dates,job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score
0,[],['big data analytics working and database ware...,"['big data', 'hadoop', 'hive', 'python', 'mapr...",['the amity school of engineering technology a...,['b tech'],['2019'],[],[],['electronics'],['coca cola'],...,[],[],[],['senior software engineer'],['b sc in computer science engineering from a ...,['at least 1 year'],[],['technical support troubleshooting collaborat...,[],0.85
1,[],['fresher looking to join as a data analyst an...,"['data analysis', 'data analytics', 'business ...","['delhi university hansraj college', 'delhi un...","['b sc maths', 'm sc science statistics']","['2015', '2018']",[],[],"['mathematics', 'statistics']",['bib consultancy'],...,[],[],[],['machine learning ml engineer'],['m sc in computer science engineering or in a...,['at least 5 year s'],[],['machine learning leadership cross functional...,[],0.75
2,[],[],"['software development', 'machine learning', '...",['birla institute of technology bit ranchi'],['b tech'],['2018'],[],[],['electronics telecommunication'],['axis bank limited'],...,[],[],[],['executive senior executive trade marketing h...,['master of business administration mba'],['at least 3 years'],[],['trade marketing executive brand visibility s...,['brand promotion campaign management field su...,0.416667
3,[],['to obtain a position in a fast paced busines...,"['accounts payables', 'accounts receivables', ...",['martinez adult education business training c...,['computer applications specialist certificate...,['2008'],[],[],['computer applications'],"['company name city state', 'company name city...",...,[],[],[],['business development executive'],['bachelor honors'],['1 to 3 years'],['age 22 to 30 years'],['apparel sourcing quality garment sourcing re...,['fast typing skill ieltsinternet browsing onl...,0.76
4,[],['professional accountant with an outstanding ...,"['analytical reasoning', 'compliance testing k...",['kent state university'],['bachelor of business administration'],[],['3 84'],[],['accounting'],"['company name', 'company name', 'company name...",...,[],[],['february 15 2021'],['senior ios engineer'],['bachelor of science bsc in computer science'],['at least 4 years'],[],['ios lifecycle requirement analysis native fr...,['ios ios app developer ios application develo...,0.65


In [66]:
list_columns = [
    'skills', 'degree_names', 'passing_years', 'major_field_of_studies',
    'professional_company_names', 'start_dates', 'end_dates',
    'related_skils_in_job', 'positions', 'responsibilities',
    'extra_curricular_activity_types', 'extra_curricular_organization_names',
    'role_positions', 'languages', 'proficiency_levels',
    'certification_providers', 'certification_skills',
    'job_position_name', 'educationaL_requirements',
    'experiencere_requirement', 'age_requirement',
    'responsibilities.1', 'skills_required'
]

def to_list(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except:
            return []
    return []

for col in list_columns:
    if col in df.columns:
        df[col] = df[col].apply(to_list)


In [67]:
CURRENT_YEAR = datetime.now().year

def safe_len(x):
    return len(x) if isinstance(x, list) else 0

def extract_years(year_list):
    if isinstance(year_list, list) and len(year_list) >= 1:
        years = [int(y) for y in year_list if str(y).isdigit()]
        if len(years) >= 2:
            return max(years) - min(years)
    return 0

def extract_start_year(date_list):
    if isinstance(date_list, list) and len(date_list) > 0:
        match = re.search(r"\d{4}", date_list[0])
        if match:
            return int(match.group())
    return None

def extract_experience(start_dates, end_dates):
    start_year = extract_start_year(start_dates)
    if start_year is None:
        return 0
    return CURRENT_YEAR - start_year

def degree_level_mapper(degrees):
    if not isinstance(degrees, list):
        return 0
    text = " ".join(degrees).lower()
    if "phd" in text:
        return 3
    if "master" in text or "m sc" in text or "mba" in text:
        return 2
    if "b tech" in text or "bachelor" in text:
        return 1
    return 0

def extract_min_experience(exp_req):
    if isinstance(exp_req, list) and len(exp_req) > 0:
        match = re.search(r"\d+", exp_req[0])
        if match:
            return int(match.group())
    return 0

def education_match_flag(degrees, edu_req):
    if not isinstance(degrees, list) or not isinstance(edu_req, list):
        return 0
    deg_text = " ".join(degrees).lower()
    req_text = " ".join(edu_req).lower()
    return int(any(word in req_text for word in deg_text.split()))


In [68]:
features = []

for _, row in df.iterrows():
    skills = row['skills']
    skills_required = row['skills_required']

    years_exp = extract_experience(row['start_dates'], row['end_dates'])
    min_exp = extract_min_experience(row['experiencere_requirement'])

    feature_row = [
        safe_len(skills),                                  # num_skills
        safe_len(skills_required),                         # required_skill_count
        len(set(skills) & set(skills_required)) if isinstance(skills, list) and isinstance(skills_required, list) else 0,
        years_exp,                                         # years_of_experience
        years_exp - min_exp,                               # experience_gap
        extract_years(row['passing_years']),               # education_duration_years
        degree_level_mapper(row['degree_names']),          # degree_level
        safe_len(row['certification_skills']),             # num_certifications
        safe_len(row['languages']),                         # num_languages
        safe_len(row['professional_company_names']),       # num_companies
        safe_len(row['responsibilities']),                 # num_responsibilities
        safe_len(row['responsibilities.1']),               # job_responsibility_count
        education_match_flag(row['degree_names'], row['educationaL_requirements']),
        1 if isinstance(row['age_requirement'], list) and len(row['age_requirement']) > 0 else 0
    ]

    features.append(feature_row)


In [69]:
columns_new = [
    "num_skills",
    "required_skill_count",
    "skills_matching",
    "years_of_experience",
    "experience_gap",
    "education_duration_years",
    "degree_level",
    "num_certifications",
    "num_languages",
    "num_companies",
    "num_responsibilities",
    "job_responsibility_count",
    "education_match",
    "age_requirement_met"
]

In [70]:
df_final = pd.DataFrame(features,columns=columns_new)
df_final["matched_score"] = df['matched_score'].values

In [71]:
df_final.head(2)

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met,matched_score
0,21,0,0,7,6,0,1,0,0,1,1,1,1,0,0.85
1,10,0,0,7,2,3,2,0,0,1,1,1,1,0,0.75


In [72]:
df_final.info()

<class 'pandas.DataFrame'>
RangeIndex: 9460 entries, 0 to 9459
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   num_skills                9460 non-null   int64  
 1   required_skill_count      9460 non-null   int64  
 2   skills_matching           9460 non-null   int64  
 3   years_of_experience       9460 non-null   int64  
 4   experience_gap            9460 non-null   int64  
 5   education_duration_years  9460 non-null   int64  
 6   degree_level              9460 non-null   int64  
 7   num_certifications        9460 non-null   int64  
 8   num_languages             9460 non-null   int64  
 9   num_companies             9460 non-null   int64  
 10  num_responsibilities      9460 non-null   int64  
 11  job_responsibility_count  9460 non-null   int64  
 12  education_match           9460 non-null   int64  
 13  age_requirement_met       9460 non-null   int64  
 14  matched_score      

In [73]:
df_final.describe()

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met,matched_score
count,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0,9460.0
mean,21.310148,0.821776,0.000106,10.168499,7.132981,7.506977,0.923044,0.210148,0.165751,2.717759,1.0,1.0,0.824313,0.571776,0.660906
std,18.742461,0.382722,0.010281,5.348786,6.089448,104.792753,0.703395,1.259711,0.631494,1.96363,0.0,0.0,0.380574,0.494848,0.167003
min,0.0,0.0,0.0,0.0,-15.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,10.0,1.0,0.0,7.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.583333
50%,14.0,1.0,0.0,8.0,6.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.683333
75%,28.0,1.0,0.0,13.0,10.0,2.0,1.0,0.0,0.0,4.0,1.0,1.0,1.0,1.0,0.793333
max,144.0,1.0,1.0,32.0,32.0,1929.0,3.0,18.0,4.0,9.0,1.0,1.0,1.0,1.0,0.95


In [74]:
df_final.to_csv("../dataset/numerified_resume_data.csv", index=False)