In [1]:
import pandas as pd
import re

In [2]:
def clean_skill(skill):
    """Normalise one raw skill string, or return None when it looks like noise.

    The text is trimmed, lower-cased, and stripped of any leading/trailing
    characters other than letters, digits, '#' and '+'. Note that this also
    removes a leading '.', so '.net' comes back as 'net'.

    The normalised text is rejected (None is returned) when it:
      * consists only of digits,
      * contains a 2-4 digit number delimited by word boundaries (e.g. a year),
      * has more than 5 whitespace-separated words,
      * is shorter than 2 or longer than 50 characters,
      * contains one of the junk substrings listed below, or
      * contains any character outside a-z, 0-9, space and . # + / -

    Otherwise the normalised text is returned.
    """
    normalized = re.sub(
        r'^[^a-zA-Z0-9#+]+|[^a-zA-Z0-9#+]+$', '', skill.strip().lower()
    )

    # Substring match, so e.g. 'monitor' also rejects 'monitoring'.
    junk_patterns = ['experience', 'years', 'motivated', 'energetic', 'monitor', 'accounting & bookkeeping']
    word_count = len(normalized.split())
    char_count = len(normalized)

    is_noise = (
        re.match(r'^\d+$', normalized) is not None
        or re.search(r'\b\d{2,4}\b', normalized) is not None
        or word_count > 5
        or not 2 <= char_count <= 50
        or any(junk in normalized for junk in junk_patterns)
        or re.match(r'^[a-z0-9 .#+/-]+$', normalized) is None
    )
    return None if is_noise else normalized

In [3]:
def extract_unique_skills(df, columns=['skills', 'related_skils_in_job']):
    """Collect the distinct skill tokens found in the given DataFrame columns.

    Every non-null cell is converted to text, square brackets and single/double
    quotes are removed, and the remainder is split on commas. Each piece is
    trimmed and lower-cased; empty pieces are discarded.

    Args:
        df: Object indexable by column name whose columns support ``dropna()``.
        columns: Names of the columns to scan.

    Returns:
        Alphabetically sorted list of the unique tokens.
    """
    unique_tokens = set()
    for column_name in columns:
        for cell in df[column_name].dropna():
            # Flatten a stringified (possibly nested) list into plain comma-separated text.
            flattened = re.sub(r'[\[\]\']', '', str(cell)).replace('"', '')
            pieces = (piece.strip().lower() for piece in flattened.split(','))
            unique_tokens.update(piece for piece in pieces if piece)
    return sorted(unique_tokens)

In [4]:
def clean_skill_list_with_groq(skills, token, llm, batch_size=40):
    """Clean a list of raw skill strings by sending them to an LLM in batches.

    Each batch is rendered into a fixed prompt that asks for a comma-separated
    list of concise skill names. The replies are split into individual names,
    trimmed, lower-cased, de-duplicated and sorted.

    Args:
        skills: Sequence of raw skill strings (must support ``len`` and slicing).
        token: Not used by this function; kept so existing callers keep working.
        llm: Model object handed to ``LLMChain(llm=...)``.
        batch_size: Number of skills sent per LLM call (default 40, must be >= 1).

    Returns:
        Sorted list of the unique, lower-cased skill names parsed from the replies.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    NOTE(review): ``PromptTemplate`` and ``LLMChain`` are not imported in the
    notebook's import cell; they must be in scope before this is called.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    prompt = PromptTemplate.from_template("""
You are cleaning a list of technical and professional skills extracted from resumes.
Remove irrelevant text, years, long phrases, and soft-skill noise.
Return only concise, comma-separated skill names (e.g., python, sql, machine learning).

Example:
Input: ["15 years of accounting experience", "python", "ms excel", "energetic personality"]
Output: python, ms excel

Input:
{skills}
""")
    chain = LLMChain(llm=llm, prompt=prompt)

    unique_skills = set()
    for start in range(0, len(skills), batch_size):
        batch = skills[start:start + batch_size]
        reply = chain.run(skills=batch)
        # Treat line breaks as separators as well as commas. Simply deleting
        # them would glue the last skill of one line to the first skill of the
        # next (e.g. "sql\nms excel" -> "sqlms excel").
        for piece in re.split(r'[,\r\n]+', reply):
            name = piece.strip().lower()
            if name:
                unique_skills.add(name)

    return sorted(unique_skills)

In [5]:
# Load the resume dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('resume_data.csv')
# Bare last expression so the notebook renders the frame (the output below is a truncated preview).
df

Unnamed: 0,address,career_objective,skills,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,...,online_links,issue_dates,expiry_dates,job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score
0,,Big data analytics working and database wareho...,"['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...",['The Amity School of Engineering & Technology...,['B.Tech'],['2019'],['N/A'],[None],['Electronics'],['Coca-COla'],...,,,,Senior Software Engineer,B.Sc in Computer Science & Engineering from a ...,At least 1 year,,Technical Support\nTroubleshooting\nCollaborat...,,0.850000
1,,Fresher looking to join as a data analyst and ...,"['Data Analysis', 'Data Analytics', 'Business ...","['Delhi University - Hansraj College', 'Delhi ...","['B.Sc (Maths)', 'M.Sc (Science) (Statistics)']","['2015', '2018']","['N/A', 'N/A']","['N/A', 'N/A']","['Mathematics', 'Statistics']",['BIB Consultancy'],...,,,,Machine Learning (ML) Engineer,M.Sc in Computer Science & Engineering or in a...,At least 5 year(s),,Machine Learning Leadership\nCross-Functional ...,,0.750000
2,,,"['Software Development', 'Machine Learning', '...","['Birla Institute of Technology (BIT), Ranchi']",['B.Tech'],['2018'],['N/A'],['N/A'],['Electronics/Telecommunication'],['Axis Bank Limited'],...,,,,"Executive/ Senior Executive- Trade Marketing, ...",Master of Business Administration (MBA),At least 3 years,,"Trade Marketing Executive\nBrand Visibility, S...",Brand Promotion\nCampaign Management\nField Su...,0.416667
3,,To obtain a position in a fast-paced business ...,"['accounts payables', 'accounts receivables', ...","['Martinez Adult Education, Business Training ...",['Computer Applications Specialist Certificate...,['2008'],[None],[None],['Computer Applications'],"['Company Name ï¼ City , State', 'Company Name...",...,,,,Business Development Executive,Bachelor/Honors,1 to 3 years,Age 22 to 30 years,Apparel Sourcing\nQuality Garment Sourcing\nRe...,Fast typing skill\nIELTSInternet browsing & on...,0.760000
4,,Professional accountant with an outstanding wo...,"['Analytical reasoning', 'Compliance testing k...",['Kent State University'],['Bachelor of Business Administration'],[None],['3.84'],[None],['Accounting'],"['Company Name', 'Company Name', 'Company Name...",...,[None],[None],"['February 15, 2021']",Senior iOS Engineer,Bachelor of Science (BSc) in Computer Science,At least 4 years,,iOS Lifecycle\nRequirement Analysis\nNative Fr...,iOS\niOS App Developer\niOS Application Develo...,0.650000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9539,,,"['Mathematical modelling', 'Machine Learning',...",['Sanghvi College of Engineering'],['B.Tech'],['2019'],['N/A'],['N/A'],['N/A'],['BPM Foundation'],...,,,,Data Engineer,Bachelor of Science (BSc),5 to 8 years,,Data Platform Design\nData Pipeline Developmen...,Azure\nBig Data\nData Analytics\nETL Tools\nPo...,0.683333
9540,,Expertise EDA modeler. I like to learn what my...,"['Data Analysis', 'Business Analysis', 'Machin...","['KVoCT, Pune', 'KVoCT, Pune']","['B.CA', 'M.CA']","['2018', '2020']","[None, None]","[None, None]","[None, None]",['Passionate Solution'],...,,,,Executive/ Sr. Executive -IT,Bachelor of Science (BSc) in Computer Science ...,3 to 5 years,Age at most 40 years,Hardware & Software Installation\nSystem Monit...,,0.650000
9541,,Looking for roles related to application devel...,"['Business Analyst', 'Data Analytics', 'Data C...",['PGG College Mysore'],['B.BA'],['2019'],['N/A'],['N/A'],['N/A'],['ZigSAW'],...,,,,Executive - VAT,BBA in Accounting and Finance,1 to 3 years,,Mushak Forms Maintenance\nVAT Software & MS Of...,VAT and Tax,0.650000
9542,,,"['Machine Learning', 'Natural Language Process...","['Rajiv Gandhi Memorial University, Delhi']",['B.TECH'],['2020'],['N/A'],['N/A'],['Electrical'],['Zynta Labs'],...,[None],[None],[None],Asst. Manager/ Manger (Administrative),Bachelor/Honors,At least 5 years,Age at least 28 years,Administrative Support\nScheduling\nFiling & D...,•Administration\n•Health Safety and Environmen...,0.650000


In [6]:
# Show the column labels. 'skills' and 'related_skils_in_job' (the dataset's own spelling)
# are the default columns read by extract_unique_skills.
df.columns

Index(['address', 'career_objective', 'skills', 'educational_institution_name',
       'degree_names', 'passing_years', 'educational_results', 'result_types',
       'major_field_of_studies', 'professional_company_names', 'company_urls',
       'start_dates', 'end_dates', 'related_skils_in_job', 'positions',
       'locations', 'responsibilities', 'extra_curricular_activity_types',
       'extra_curricular_organization_names',
       'extra_curricular_organization_links', 'role_positions', 'languages',
       'proficiency_levels', 'certification_providers', 'certification_skills',
       'online_links', 'issue_dates', 'expiry_dates', 'job_position_name',
       'educationaL_requirements', 'experiencere_requirement',
       'age_requirement', 'responsibilities.1', 'skills_required',
       'matched_score'],
      dtype='object')

In [10]:
# Gather every distinct raw skill token from the default skill columns.
cleaned_skills = extract_unique_skills(df)

# Normalise each token and drop the rejected ones. A set is used because
# clean_skill strips edge punctuation, so different raw tokens can collapse
# to the same cleaned string.
filtered_skills = sorted(
    {cleaned for cleaned in map(clean_skill, cleaned_skills) if cleaned}
)

# Report the size after de-duplication so it matches the list printed below.
print("Size of the filtered Skill ->", len(filtered_skills))

for name in filtered_skills:
    print(name)

Size of the filtered Skill -> 3833
10key by touch
2-tier
2k
3-d noise mapping
3d
3d modeling
3d printing
3ds max
3dsmax
401k rollovers
5 why
5s
5s expert
5s methodologies
5s technique
8d
a+
a/p
a/r
aaa security model
aardwolf
abaqus
abila mip fund accounting software
ability of managing multi tasks
ability to prioritize
ability to translate technical specifications
academic
access
access control
access database
access database queries
account analysis
account management
account management and development
account reconciliation
account reconciliation expert
account reconciliation processes
account reconciliations
accountable for
accountant
accounting
accounting adjustments
accounting controls
accounting department management
accounting documents
accounting duties
accounting manager
accounting operations
accounting operations professional
accounting oversight
accounting packages
accounting policies
accounting procedures
accounting procedures research
accounting processes
accounting softw