# Business objective :-
The document classification solution should significantly reduce the manual human effort in the HRM. It should achieve a higher level of accuracy and automation with minimal human intervention

# 1. Extracting Text from Resumes

In [37]:
# Importing Libraries
import os
import re
import textract
import pandas as pd
import nltk
import spacy

In [39]:
import os
import textract
import pandas as pd
import re

main_folder = '../Resumes/'

raw_data = []
categories = []

for subfolder in os.listdir(main_folder):
    folder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(folder_path):
        category = subfolder  
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.docx'):  
                file_path = os.path.join(folder_path, file_name)
                try:
                    raw_text = textract.process(file_path).decode('utf-8')
                    
                    clean_text = re.sub(r'[^a-zA-Z\s]', '', raw_text)  
                    clean_text = re.sub(r'\s+', ' ', clean_text).strip().lower() 
                    
                    raw_data.append(clean_text)
                    categories.append(category)
                except Exception as e:
                    print(f"Error processing file {file_name} in {folder_path}: {e}")

resume_data = pd.DataFrame({'Raw_Details': raw_data, 'Category': categories})
resume_data

Unnamed: 0,Raw_Details,Category
0,anubhav kumar singh core competencies scriptin...,PeopleSoft Resumes
1,g ananda rayudu httpswwwlinkedincominanandguda...,PeopleSoft Resumes
2,peoplesoft database administrator gangareddy p...,PeopleSoft Resumes
3,classification internal classification interna...,PeopleSoft Resumes
4,priyanka ramadoss mountpleasant coonoor the ni...,PeopleSoft Resumes
...,...,...
74,workday integration consultant name sri krishn...,WorkDay Resumes
75,srikanth workday hcm consultant seeking suitab...,WorkDay Resumes
76,workday hcm fcm name kumar ss role workday con...,WorkDay Resumes
77,venkateswarlub workday consultant professional...,WorkDay Resumes


# Extracting Names 

In [41]:
name_keywords = [
    "Peoplesoft Admin", "PeopleSoft DBA", "Peoplesoft Finance", "Peoplesoft FSCM",
    "Resume", "Hexaware", "Internship", "Musquare Technologies", "React Developer", "Reactjs Developer", 
    "React JS Developer", "React Dev", "Hexaware", "converted"
]

def extract_name_from_filename(file_name):
    base_name = os.path.splitext(file_name)[0]
    base_name = re.sub(r'\[.*\]', '', base_name)

    for keyword in name_keywords:
        if keyword in base_name:
            base_name = base_name.replace(keyword, '')

    parts = base_name.split('_')

    if len(parts) > 1:
        name_part = ' '.join(part.strip() for part in parts if part.strip())
    else:
        name_part = base_name.strip()

    name_part = re.sub(r'[^a-zA-Z\s]', '', name_part).strip()
    return name_part.title() if name_part else None

In [9]:
main_folder = '../Resumes/'

raw_data = []
extracted_names = []

for subfolder in os.listdir(main_folder):
    folder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.docx') or file_name.endswith('.pdf'):
                raw_data.append(file_name)
                extracted_name = extract_name_from_filename(file_name)
                extracted_names.append(extracted_name)

extracted_data = pd.DataFrame({
    'File_Name': raw_data,
    'Extracted_Name': extracted_names
})

extracted_data

Unnamed: 0,File_Name,Extracted_Name
0,Peoplesoft Admin_AnubhavSingh.docx,Anubhavsingh
1,Peoplesoft Admin_G Ananda Rayudu.docx,G Ananda Rayudu
2,Peoplesoft Admin_Gangareddy.docx,Gangareddy
3,Peoplesoft Admin_Murali.docx,Murali
4,Peoplesoft Admin_Priyanka Ramadoss.docx,Priyanka Ramadoss
...,...,...
74,Sri Krishna S_Hexaware.docx,Sri Krishna S
75,Srikanth-Hexaware.docx,Srikanth
76,SSKumar_Hexaware.docx,Sskumar
77,Venkateswarlu B_Hexaware.docx,Venkateswarlu B


In [43]:
resume_data = pd.concat([resume_data, extracted_data[['Extracted_Name']]], axis=1)

df = resume_data[['Extracted_Name', 'Category', 'Raw_Details']]
df

Unnamed: 0,Extracted_Name,Category,Raw_Details
0,Anubhavsingh,PeopleSoft Resumes,anubhav kumar singh core competencies scriptin...
1,G Ananda Rayudu,PeopleSoft Resumes,g ananda rayudu httpswwwlinkedincominanandguda...
2,Gangareddy,PeopleSoft Resumes,peoplesoft database administrator gangareddy p...
3,Murali,PeopleSoft Resumes,classification internal classification interna...
4,Priyanka Ramadoss,PeopleSoft Resumes,priyanka ramadoss mountpleasant coonoor the ni...
...,...,...,...
74,Sri Krishna S,WorkDay Resumes,workday integration consultant name sri krishn...
75,Srikanth,WorkDay Resumes,srikanth workday hcm consultant seeking suitab...
76,Sskumar,WorkDay Resumes,workday hcm fcm name kumar ss role workday con...
77,Venkateswarlu B,WorkDay Resumes,venkateswarlub workday consultant professional...


In [45]:
df.to_excel('cleaned_resume.xlsx', index = False)

In [3]:
main_folder = '../Resumes/'

raw_data = []
categories = []

for subfolder in os.listdir(main_folder):
    folder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(folder_path):
        category = subfolder  
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.docx'):  
                file_path = os.path.join(folder_path, file_name)
                try:
                    raw_text = textract.process(file_path).decode('utf-8')
                    raw_data.append(raw_text)
                    categories.append(category)
                except Exception as e:
                    print(f"Error processing file {file_name} in {folder_path}: {e}")

resume_data = pd.DataFrame({'Raw_Details': raw_data,'Category': categories})
resume_data

Unnamed: 0,Raw_Details,Category
0,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,PeopleSoft Resumes
1,G. Ananda Rayudu \n\n\t\t https://www.li...,PeopleSoft Resumes
2,PeopleSoft Database Administrator\n\n ...,PeopleSoft Resumes
3,Classification: Internal\n\nClassification: In...,PeopleSoft Resumes
4,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \...",PeopleSoft Resumes
...,...,...
74,Workday Integration Consultant\n\n\n\nName ...,WorkDay Resumes
75,Srikanth (WORKDAY hCM Consultant)\t ...,WorkDay Resumes
76,WORKDAY | HCM | FCM\n\nName \t\t: Kumar S.S\n\...,WorkDay Resumes
77,Venkateswarlu.B\t\t\t\t\t\t\t\tWorkday Consult...,WorkDay Resumes


In [5]:
name_keywords = [
    "Peoplesoft Admin", "PeopleSoft DBA", "Peoplesoft Finance", "Peoplesoft FSCM",
    "Resume", "Hexaware", "Internship", "Musquare Technologies", "React Developer", "Reactjs Developer", 
    "React JS Developer", "React Dev", "Hexaware", "converted"
]

def extract_name_from_filename(file_name):
    base_name = os.path.splitext(file_name)[0]
    base_name = re.sub(r'\[.*\]', '', base_name)

    for keyword in name_keywords:
        if keyword in base_name:
            base_name = base_name.replace(keyword, '')

    parts = base_name.split('_')

    if len(parts) > 1:
        name_part = ' '.join(part.strip() for part in parts if part.strip())
    else:
        name_part = base_name.strip()

    name_part = re.sub(r'[^a-zA-Z\s]', '', name_part).strip()
    return name_part.title() if name_part else None

In [7]:
main_folder = '../Resumes/'

raw_data = []
extracted_names = []

for subfolder in os.listdir(main_folder):
    folder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.docx') or file_name.endswith('.pdf'):
                raw_data.append(file_name)
                extracted_name = extract_name_from_filename(file_name)
                extracted_names.append(extracted_name)

extracted_data = pd.DataFrame({
    'File_Name': raw_data,
    'Extracted_Name': extracted_names
})

extracted_data

Unnamed: 0,File_Name,Extracted_Name
0,Peoplesoft Admin_AnubhavSingh.docx,Anubhavsingh
1,Peoplesoft Admin_G Ananda Rayudu.docx,G Ananda Rayudu
2,Peoplesoft Admin_Gangareddy.docx,Gangareddy
3,Peoplesoft Admin_Murali.docx,Murali
4,Peoplesoft Admin_Priyanka Ramadoss.docx,Priyanka Ramadoss
...,...,...
74,Sri Krishna S_Hexaware.docx,Sri Krishna S
75,Srikanth-Hexaware.docx,Srikanth
76,SSKumar_Hexaware.docx,Sskumar
77,Venkateswarlu B_Hexaware.docx,Venkateswarlu B


In [9]:
resume_data = pd.concat([resume_data, extracted_data[['Extracted_Name']]], axis=1)

df = resume_data[['Extracted_Name', 'Category', 'Raw_Details']]
df

Unnamed: 0,Extracted_Name,Category,Raw_Details
0,Anubhavsingh,PeopleSoft Resumes,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...
1,G Ananda Rayudu,PeopleSoft Resumes,G. Ananda Rayudu \n\n\t\t https://www.li...
2,Gangareddy,PeopleSoft Resumes,PeopleSoft Database Administrator\n\n ...
3,Murali,PeopleSoft Resumes,Classification: Internal\n\nClassification: In...
4,Priyanka Ramadoss,PeopleSoft Resumes,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \..."
...,...,...,...
74,Sri Krishna S,WorkDay Resumes,Workday Integration Consultant\n\n\n\nName ...
75,Srikanth,WorkDay Resumes,Srikanth (WORKDAY hCM Consultant)\t ...
76,Sskumar,WorkDay Resumes,WORKDAY | HCM | FCM\n\nName \t\t: Kumar S.S\n\...
77,Venkateswarlu B,WorkDay Resumes,Venkateswarlu.B\t\t\t\t\t\t\t\tWorkday Consult...


# Extracting Years of Experience

In [11]:
import re

def extract_experience(text):
    pattern = r'(\d+(\.\d+)?\+?)\s*(?:years|yrs|year|yr)\s*(?:of\s+experience|experience|exp)?'    
    match = re.findall(pattern, text.lower())
    if match:
        extracted_years = []
        for x in match:
            if x[0].endswith('+'):
                extracted_years.append(float(x[0][:-1]))  
            else:
                extracted_years.append(float(x[0]))  
        return max(extracted_years)  
    return None

In [13]:
df['Years_of_Experience'] = df['Raw_Details'].apply(extract_experience)

df[['Raw_Details', 'Extracted_Name', 'Category', 'Years_of_Experience']].head()
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Years_of_Experience'] = df['Raw_Details'].apply(extract_experience)


Unnamed: 0,Extracted_Name,Category,Raw_Details,Years_of_Experience
0,Anubhavsingh,PeopleSoft Resumes,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,
1,G Ananda Rayudu,PeopleSoft Resumes,G. Ananda Rayudu \n\n\t\t https://www.li...,7.00
2,Gangareddy,PeopleSoft Resumes,PeopleSoft Database Administrator\n\n ...,4.00
3,Murali,PeopleSoft Resumes,Classification: Internal\n\nClassification: In...,6.00
4,Priyanka Ramadoss,PeopleSoft Resumes,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \...",4.11
...,...,...,...,...
74,Sri Krishna S,WorkDay Resumes,Workday Integration Consultant\n\n\n\nName ...,4.00
75,Srikanth,WorkDay Resumes,Srikanth (WORKDAY hCM Consultant)\t ...,4.00
76,Sskumar,WorkDay Resumes,WORKDAY | HCM | FCM\n\nName \t\t: Kumar S.S\n\...,6.00
77,Venkateswarlu B,WorkDay Resumes,Venkateswarlu.B\t\t\t\t\t\t\t\tWorkday Consult...,5.30


# Extracting Education Qualifications

In [15]:
education_patterns = [
    r"\bB\.Tech\b", r"\bB\-Tech\b", r"\bBTECH\b", r"\bB\.tech\b", r"\bB[.\s]*Tech\b" , r"\bEngineering\b" , r"\bB\.E\b", r"\bB\.Sc\b", r"\bB\.SC\b", r"\bBSC\b" , r"\bM\.Sc\b",  r"\bB\.sc\b", r"\bM\.Tech\b", r"\bM\.CA\b", r"\bMCA\b" ,r"\bM\.ca\b", r"\bMBA\b", r"\bBCA\b", r"\bBsc\b",
    r"\bB\.A\b", r"\bB\.C\.A\b" , r"\bM\.C\.A\b", r"\bB\.Com\b", r"\bBachelors\b", r"\bBACHELOR OF TECHNOLOGY\b" ,r"\bBachelor of Technology\b", r"\bBachelor of Engineering\b", 
    r"\bBachelor of Science\b",r"\bBachelor of Degree\b", r"\Degree\b", r"\bDEGREE\b" ,r"\bMasters\b",r"\bMaster of Computer Applications\b", r"\bMaster of computer applications\b" , r"\bMaster\b" , r"\bM\.com\b" , r"\bBachelor of computer application\b" , r"\bMaster of Technology\b", 
    r"\bMaster of Science\b", r"\bM\.B\.A\b", r"\bM\.A\b", r"\bM\.Com\b", r"\bM\.Engg\b"
]

def extract_education(text):
    education_degrees = []
    for pattern in education_patterns:
        found_degrees = re.findall(pattern, text)
        education_degrees.extend(found_degrees)
    return list(set(education_degrees))

df['Education'] = df['Raw_Details'].apply(extract_education)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Education'] = df['Raw_Details'].apply(extract_education)


Unnamed: 0,Extracted_Name,Category,Raw_Details,Years_of_Experience,Education
0,Anubhavsingh,PeopleSoft Resumes,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,,"[B.Tech, Engineering]"
1,G Ananda Rayudu,PeopleSoft Resumes,G. Ananda Rayudu \n\n\t\t https://www.li...,7.00,[Bachelors]
2,Gangareddy,PeopleSoft Resumes,PeopleSoft Database Administrator\n\n ...,4.00,[]
3,Murali,PeopleSoft Resumes,Classification: Internal\n\nClassification: In...,6.00,[]
4,Priyanka Ramadoss,PeopleSoft Resumes,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \...",4.11,"[Engineering, B.E]"
...,...,...,...,...,...
74,Sri Krishna S,WorkDay Resumes,Workday Integration Consultant\n\n\n\nName ...,4.00,"[Bachelors, Engineering]"
75,Srikanth,WorkDay Resumes,Srikanth (WORKDAY hCM Consultant)\t ...,4.00,[]
76,Sskumar,WorkDay Resumes,WORKDAY | HCM | FCM\n\nName \t\t: Kumar S.S\n\...,6.00,"[M.Tech, B.Tech, Engineering]"
77,Venkateswarlu B,WorkDay Resumes,Venkateswarlu.B\t\t\t\t\t\t\t\tWorkday Consult...,5.30,"[Bachelor of Technology, B. Tech]"


# Extracting Skills

In [17]:
skills_dict = {
    "Programming Languages": [
        "C", "C++", "SQL", "T-SQL", "SAS", "R", "Python", 
        "PeopleCode", "HTML", "CSS", "Bootstrap", "Core Java", "XML", 
        "XSLT", "MVEL", "Basic Shell Scripting"
    ],
    "Databases": [
        "Oracle 10g", "Oracle 11g", "Oracle 12c", "MS SQL Server 2008R2", 
        "MS SQL Server 2012", "MS SQL Server 2014", "MS SQL Server 2016", 
        "MS SQL Server 2017", "AWS RDS (Athena)", "MariaDB", 
        "DB2", "Sybase ASE 15.7", "Hadoop Ecosystem (Hive, HBase)"
    ],
    "ERP Systems and Tools": [
        "PeopleSoft HRMS", "PeopleSoft FSCM", "PeopleSoft CRM", 
        "Workday HCM", "PeopleTools", "PeopleSoft Update Manager (PUM)", 
        "Change Assistant", "Integration Broker", "Application Designer"
    ],
    "Reporting Tools": [
        "SQR", "PS Query", "XML Publisher", "N-Vision", "BI Publisher", 
        "SQL Server Reporting Services (SSRS)", "Power BI", "Crystal Reports"
    ],
    "ETL Tools": [
        "SQL Server Integration Services (SSIS)", "Pentaho Data Integration", 
        "Talend", "Informatica"
    ],
    "Web Technologies": [
        "SOAP", "REST", "XML", "XSD", "Web Services (WSDL & SOAP)"
    ],
    "DevOps Tools": [
        "Jenkins", "Ansible", "Putty", "WinSCP", "Filezilla", 
        "AWS Tools (Compute, Storage, Networks, Monitoring)"
    ],
    "Operating Systems": [
        "Windows XP", "Windows 7", "Windows 8", "Windows 10", 
        "Windows Server 2008", "Windows Server 2012 R2", 
        "Red Hat Enterprise Linux (RHEL)", "Oracle Enterprise Linux (OEL)", 
        "Ubuntu", "Solaris", "Mac OS"
    ],
    "Development Tools": [
        "PeopleSoft Application Designer", "SQL Developer", "Toad", 
        "Microsoft SQL Management Studio (SSMS)", "RazorSQL", "HeidiSQL", 
        "Workday Studio", "Tectia Client", "Delphix Virtual Database"
    ],
    "Integration Tools": [
        "EIB (Enterprise Interface Builder)", "Core Connectors", 
        "Workday Studio", "Component Interface", "File Layout", 
        "Workflow Notifications"
    ],
    "Project Management Tools": [
        "JIRA", "ServiceNow", "Odyssey Dashboard", "Transporter"
    ],
    "Cloud Platforms": [
        "AWS", "AWS Redshift"
    ],
    "Middleware": [
        "WebLogic", "Tuxedo", "Apache Tomcat"
    ],
    "BI Tools": [
        "Microsoft Business Intelligence Tools (MSBI)", "Tableau"
    ],
    "Domain Knowledge": [
        "Finance", "HCM", "Retail", "Healthcare", "BFSI", "Media & Publication"
    ],
    "Soft Skills": [
        "Effective Communication", "Team Player", "Quick Learner", 
        "Problem-Solving", "Customer Interaction", "Self-Motivation"
    ]
}

def extract_skills(text,skills_dict):
    extracted_skills = {}
    for category, skills in skills_dict.items():
        matched_skills = [skill for skill in skills if skill in text]
        if matched_skills:
            extracted_skills[category] = matched_skills
    return extracted_skills

df['skills'] = df['Raw_Details'].apply(lambda x: extract_skills(x, skills_dict))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skills'] = df['Raw_Details'].apply(lambda x: extract_skills(x, skills_dict))


Unnamed: 0,Extracted_Name,Category,Raw_Details,Years_of_Experience,Education,skills
0,Anubhavsingh,PeopleSoft Resumes,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,,"[B.Tech, Engineering]","{'Programming Languages': ['C', 'R'], 'Databas..."
1,G Ananda Rayudu,PeopleSoft Resumes,G. Ananda Rayudu \n\n\t\t https://www.li...,7.00,[Bachelors],"{'Programming Languages': ['C', 'R'], 'Databas..."
2,Gangareddy,PeopleSoft Resumes,PeopleSoft Database Administrator\n\n ...,4.00,[],"{'Programming Languages': ['C', 'SQL', 'R'], '..."
3,Murali,PeopleSoft Resumes,Classification: Internal\n\nClassification: In...,6.00,[],"{'Programming Languages': ['C', 'C++', 'SQL', ..."
4,Priyanka Ramadoss,PeopleSoft Resumes,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \...",4.11,"[Engineering, B.E]","{'Programming Languages': ['C', 'SQL', 'R'], '..."
...,...,...,...,...,...,...
74,Sri Krishna S,WorkDay Resumes,Workday Integration Consultant\n\n\n\nName ...,4.00,"[Bachelors, Engineering]","{'Programming Languages': ['C', 'R', 'Core Jav..."
75,Srikanth,WorkDay Resumes,Srikanth (WORKDAY hCM Consultant)\t ...,4.00,[],"{'Programming Languages': ['C', 'R', 'XML', 'X..."
76,Sskumar,WorkDay Resumes,WORKDAY | HCM | FCM\n\nName \t\t: Kumar S.S\n\...,6.00,"[M.Tech, B.Tech, Engineering]","{'Programming Languages': ['C', 'SQL', 'R', 'X..."
77,Venkateswarlu B,WorkDay Resumes,Venkateswarlu.B\t\t\t\t\t\t\t\tWorkday Consult...,5.30,"[Bachelor of Technology, B. Tech]","{'Programming Languages': ['C', 'R', 'XML', 'X..."


In [19]:
df['skills'][77]

{'Programming Languages': ['C', 'R', 'XML', 'XSLT'],
 'ERP Systems and Tools': ['Workday HCM'],
 'Web Technologies': ['SOAP', 'XML'],
 'Operating Systems': ['Windows XP'],
 'Development Tools': ['Workday Studio'],
 'Integration Tools': ['Core Connectors', 'Workday Studio'],
 'Domain Knowledge': ['HCM']}

In [21]:
df['skills'][4]

{'Programming Languages': ['C', 'SQL', 'R'],
 'ERP Systems and Tools': ['Change Assistant',
  'Integration Broker',
  'Application Designer'],
 'Reporting Tools': ['SQR', 'N-Vision'],
 'DevOps Tools': ['Putty'],
 'Operating Systems': ['Solaris'],
 'Development Tools': ['SQL Developer'],
 'Project Management Tools': ['JIRA', 'Odyssey Dashboard', 'Transporter'],
 'Cloud Platforms': ['AWS'],
 'Middleware': ['Apache Tomcat'],
 'Domain Knowledge': ['Finance']}

# Extracting companies worked

In [23]:
company_names = [
    "HCL", "IDC Technologies Sol. (I) Pvt. Ltd.", "Texas Department of Transportation(TxDOT)", 'L N T INFOTECH', 'Regency Technologies', 'DXC Technologies',"Progile Infotech pvt ltd","Workday","Tyson Foods","NCR Corporation","TouchNet","Thermo Fisher Scientific","FX ABS Software Solutions Pvt. Ltd","2.	Eron Infoways Pvt Ltd","Tachus Software Solutions","Digee IT Networks",
    "Accenture Solutions Pvt Ltd.", "Accenture", "Capgemini", "Cognizant Technology Solutions", 'PeopleSoft',"Edvenswa tech Pvt. Ltd","BRAINO SERVICES","Smart Edge India Pvt Ltd","Wesatage eservices Pvt","AMBESTTECH SOLUTION","Metrolabs Services Pvt ltd", "Imagine Technology and Services Pvt. Ltd","FLUENTGRID LIMITED","Tietoevryindia","OSCORP Information systems","united health group",
    "Tech Mahindra Limited", "PeopleSoft DBA", "Wipro Technologies", "IBM", "Randstad", "Verizon","HSBC","BOSCH", "Blue Yonder","ENLUME TECHNOLOGIES","FORTUNAPIX PRIVATE LIMITD","Predifast Technologies Private Limited","Metrolabs Services Pvt ltd","Wipro ","Coginic Technologies pvt Ltd","Pride Technologies","Perigon Technologies Pvt Ltd","ITC InfoTech","Genpact","OSI Consulting",
    "Imagine Technology and Services Pvt. Ltd.", "Condé Nast", "TECHASOFT PVT LTD", "Zerozilla Software Company","QUAQUA Tavel Experiences pvt,Ltd","Amaravati Tech Services","Schemax Expert Techno Crafts Pvt. Ltd","Maganti IT Solutions","PickupBiz Solution Private Limited","Saffire Softtech","Fabex tech solution pvt Ltd","Tech Mahindra",
    "Square Bridge Technologies PVT LTD", "Perigon Technologies Pvt Ltd.", "People Tech Group","Icroz Solutions Pvt Ltd","INTEGRATED DATA SYSTEMS LTD","Jade Global","Thomson Reuters Corporation","Value Momentum","FireEye","Infosys","Tata Consultancy Services"
]

def extract_companies(text, company_names):
    companies = [company for company in company_names if company in text]
    return ', '.join(companies) if companies else 'Other'

df['Extracted_Companies'] = df['Raw_Details'].apply(lambda x: extract_companies(x, company_names))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Extracted_Companies'] = df['Raw_Details'].apply(lambda x: extract_companies(x, company_names))


Unnamed: 0,Extracted_Name,Category,Raw_Details,Years_of_Experience,Education,skills,Extracted_Companies
0,Anubhavsingh,PeopleSoft Resumes,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,,"[B.Tech, Engineering]","{'Programming Languages': ['C', 'R'], 'Databas...","HCL, PeopleSoft, IBM"
1,G Ananda Rayudu,PeopleSoft Resumes,G. Ananda Rayudu \n\n\t\t https://www.li...,7.00,[Bachelors],"{'Programming Languages': ['C', 'R'], 'Databas...","IDC Technologies Sol. (I) Pvt. Ltd., Texas Dep..."
2,Gangareddy,PeopleSoft Resumes,PeopleSoft Database Administrator\n\n ...,4.00,[],"{'Programming Languages': ['C', 'SQL', 'R'], '...",PeopleSoft
3,Murali,PeopleSoft Resumes,Classification: Internal\n\nClassification: In...,6.00,[],"{'Programming Languages': ['C', 'C++', 'SQL', ...","L N T INFOTECH, Regency Technologies, DXC Tech..."
4,Priyanka Ramadoss,PeopleSoft Resumes,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \...",4.11,"[Engineering, B.E]","{'Programming Languages': ['C', 'SQL', 'R'], '...","Accenture Solutions Pvt Ltd., Accenture, Peopl..."
...,...,...,...,...,...,...,...
74,Sri Krishna S,WorkDay Resumes,Workday Integration Consultant\n\n\n\nName ...,4.00,"[Bachelors, Engineering]","{'Programming Languages': ['C', 'R', 'Core Jav...","Workday, Digee IT Networks"
75,Srikanth,WorkDay Resumes,Srikanth (WORKDAY hCM Consultant)\t ...,4.00,[],"{'Programming Languages': ['C', 'R', 'XML', 'X...","Workday, Wipro , Infosys"
76,Sskumar,WorkDay Resumes,WORKDAY | HCM | FCM\n\nName \t\t: Kumar S.S\n\...,6.00,"[M.Tech, B.Tech, Engineering]","{'Programming Languages': ['C', 'SQL', 'R', 'X...","Workday, PeopleSoft, Wipro , ITC InfoTech"
77,Venkateswarlu B,WorkDay Resumes,Venkateswarlu.B\t\t\t\t\t\t\t\tWorkday Consult...,5.30,"[Bachelor of Technology, B. Tech]","{'Programming Languages': ['C', 'R', 'XML', 'X...","Workday, PeopleSoft, Genpact, OSI Consulting"


# Extracting Professional Experience

In [25]:
proffessional_experience = [
    "Workday HCM","Workday Integration Consultant","Workday Technical Consultant","Workday Consultant","Workday Developer","workday consultant","workday Integration Consultant","Workday HCM Functional Consultant","Workday HCM Consultant","Workday  Functional Consultant",    
    "SQL & MSBI Developer","MS SQL developer","MS-SQL,PL/SQL-Oracle DEVELOPER","SQl & Power BI Developer","SQL Developer","oracle developer","DQL DEVELOPER","Sql Developer","Microsoft SQLServer","MICROSOFT SQL SERVER","SQL Server",
    "React JS Developer","Front End Developer","UI Developr","UI DEVELOPER","WEB DEVELOPER","Web Developer","React Developer","Web Developer","front end developer","React Developer","React.JS Developer",
    "PeopleSoft Administration","PeopleSoft DBA","PeopleSoft Administrator","Peoplesoft Admin","PeopleSoft technical consultant","PeopleSoft Finance","PeopleSoft Technical Consultant","PeopleSoft Consultant","PeopleSoft FSCM",
    "Software Engineer","software developer","Software Developer","System Engineer"
]

def extract_prof_experience(text, proffessional_experience):
    companies = [company for company in proffessional_experience if company in text]
    return ', '.join(companies) if companies else 'Other'

df['Professional_Experience'] = df['Raw_Details'].apply(lambda x: extract_prof_experience(x, proffessional_experience))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Professional_Experience'] = df['Raw_Details'].apply(lambda x: extract_prof_experience(x, proffessional_experience))


Unnamed: 0,Extracted_Name,Category,Raw_Details,Years_of_Experience,Education,skills,Extracted_Companies,Professional_Experience
0,Anubhavsingh,PeopleSoft Resumes,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,,"[B.Tech, Engineering]","{'Programming Languages': ['C', 'R'], 'Databas...","HCL, PeopleSoft, IBM","PeopleSoft Administration, PeopleSoft FSCM"
1,G Ananda Rayudu,PeopleSoft Resumes,G. Ananda Rayudu \n\n\t\t https://www.li...,7.00,[Bachelors],"{'Programming Languages': ['C', 'R'], 'Databas...","IDC Technologies Sol. (I) Pvt. Ltd., Texas Dep...",PeopleSoft DBA
2,Gangareddy,PeopleSoft Resumes,PeopleSoft Database Administrator\n\n ...,4.00,[],"{'Programming Languages': ['C', 'SQL', 'R'], '...",PeopleSoft,"PeopleSoft Administrator, PeopleSoft FSCM"
3,Murali,PeopleSoft Resumes,Classification: Internal\n\nClassification: In...,6.00,[],"{'Programming Languages': ['C', 'C++', 'SQL', ...","L N T INFOTECH, Regency Technologies, DXC Tech...","PeopleSoft Administration, PeopleSoft Administ..."
4,Priyanka Ramadoss,PeopleSoft Resumes,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \...",4.11,"[Engineering, B.E]","{'Programming Languages': ['C', 'SQL', 'R'], '...","Accenture Solutions Pvt Ltd., Accenture, Peopl...","SQL Developer, PeopleSoft Administration, Peop..."
...,...,...,...,...,...,...,...,...
74,Sri Krishna S,WorkDay Resumes,Workday Integration Consultant\n\n\n\nName ...,4.00,"[Bachelors, Engineering]","{'Programming Languages': ['C', 'R', 'Core Jav...","Workday, Digee IT Networks","Workday HCM, Workday Integration Consultant, W..."
75,Srikanth,WorkDay Resumes,Srikanth (WORKDAY hCM Consultant)\t ...,4.00,[],"{'Programming Languages': ['C', 'R', 'XML', 'X...","Workday, Wipro , Infosys","Workday HCM, Workday Consultant"
76,Sskumar,WorkDay Resumes,WORKDAY | HCM | FCM\n\nName \t\t: Kumar S.S\n\...,6.00,"[M.Tech, B.Tech, Engineering]","{'Programming Languages': ['C', 'SQL', 'R', 'X...","Workday, PeopleSoft, Wipro , ITC InfoTech","Workday HCM, Workday Consultant, workday consu..."
77,Venkateswarlu B,WorkDay Resumes,Venkateswarlu.B\t\t\t\t\t\t\t\tWorkday Consult...,5.30,"[Bachelor of Technology, B. Tech]","{'Programming Languages': ['C', 'R', 'XML', 'X...","Workday, PeopleSoft, Genpact, OSI Consulting","Workday HCM, Workday Integration Consultant, W..."


In [27]:
df1 = df.rename(columns = {'Extracted_Name': 'Employee_Name', 'skills': 'Skills', 'Extracted_Companies': 'Companies_Worked'})
df1.head()

Unnamed: 0,Employee_Name,Category,Raw_Details,Years_of_Experience,Education,Skills,Companies_Worked,Professional_Experience
0,Anubhavsingh,PeopleSoft Resumes,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,,"[B.Tech, Engineering]","{'Programming Languages': ['C', 'R'], 'Databas...","HCL, PeopleSoft, IBM","PeopleSoft Administration, PeopleSoft FSCM"
1,G Ananda Rayudu,PeopleSoft Resumes,G. Ananda Rayudu \n\n\t\t https://www.li...,7.0,[Bachelors],"{'Programming Languages': ['C', 'R'], 'Databas...","IDC Technologies Sol. (I) Pvt. Ltd., Texas Dep...",PeopleSoft DBA
2,Gangareddy,PeopleSoft Resumes,PeopleSoft Database Administrator\n\n ...,4.0,[],"{'Programming Languages': ['C', 'SQL', 'R'], '...",PeopleSoft,"PeopleSoft Administrator, PeopleSoft FSCM"
3,Murali,PeopleSoft Resumes,Classification: Internal\n\nClassification: In...,6.0,[],"{'Programming Languages': ['C', 'C++', 'SQL', ...","L N T INFOTECH, Regency Technologies, DXC Tech...","PeopleSoft Administration, PeopleSoft Administ..."
4,Priyanka Ramadoss,PeopleSoft Resumes,"Priyanka Ramadoss\n\n\t61/46, MountPleasant, \...",4.11,"[Engineering, B.E]","{'Programming Languages': ['C', 'SQL', 'R'], '...","Accenture Solutions Pvt Ltd., Accenture, Peopl...","SQL Developer, PeopleSoft Administration, Peop..."


In [29]:
df2 = df1.drop(columns = ['Raw_Details'])
df2.head()

Unnamed: 0,Employee_Name,Category,Years_of_Experience,Education,Skills,Companies_Worked,Professional_Experience
0,Anubhavsingh,PeopleSoft Resumes,,"[B.Tech, Engineering]","{'Programming Languages': ['C', 'R'], 'Databas...","HCL, PeopleSoft, IBM","PeopleSoft Administration, PeopleSoft FSCM"
1,G Ananda Rayudu,PeopleSoft Resumes,7.0,[Bachelors],"{'Programming Languages': ['C', 'R'], 'Databas...","IDC Technologies Sol. (I) Pvt. Ltd., Texas Dep...",PeopleSoft DBA
2,Gangareddy,PeopleSoft Resumes,4.0,[],"{'Programming Languages': ['C', 'SQL', 'R'], '...",PeopleSoft,"PeopleSoft Administrator, PeopleSoft FSCM"
3,Murali,PeopleSoft Resumes,6.0,[],"{'Programming Languages': ['C', 'C++', 'SQL', ...","L N T INFOTECH, Regency Technologies, DXC Tech...","PeopleSoft Administration, PeopleSoft Administ..."
4,Priyanka Ramadoss,PeopleSoft Resumes,4.11,"[Engineering, B.E]","{'Programming Languages': ['C', 'SQL', 'R'], '...","Accenture Solutions Pvt Ltd., Accenture, Peopl...","SQL Developer, PeopleSoft Administration, Peop..."


In [33]:
new_order = ["Employee_Name", "Category", "Education", "Skills", "Professional_Experience", "Companies_Worked", "Years_of_Experience"]

df3 = df2[new_order]
df3.head()

Unnamed: 0,Employee_Name,Category,Education,Skills,Professional_Experience,Companies_Worked,Years_of_Experience
0,Anubhavsingh,PeopleSoft Resumes,"[B.Tech, Engineering]","{'Programming Languages': ['C', 'R'], 'Databas...","PeopleSoft Administration, PeopleSoft FSCM","HCL, PeopleSoft, IBM",
1,G Ananda Rayudu,PeopleSoft Resumes,[Bachelors],"{'Programming Languages': ['C', 'R'], 'Databas...",PeopleSoft DBA,"IDC Technologies Sol. (I) Pvt. Ltd., Texas Dep...",7.0
2,Gangareddy,PeopleSoft Resumes,[],"{'Programming Languages': ['C', 'SQL', 'R'], '...","PeopleSoft Administrator, PeopleSoft FSCM",PeopleSoft,4.0
3,Murali,PeopleSoft Resumes,[],"{'Programming Languages': ['C', 'C++', 'SQL', ...","PeopleSoft Administration, PeopleSoft Administ...","L N T INFOTECH, Regency Technologies, DXC Tech...",6.0
4,Priyanka Ramadoss,PeopleSoft Resumes,"[Engineering, B.E]","{'Programming Languages': ['C', 'SQL', 'R'], '...","SQL Developer, PeopleSoft Administration, Peop...","Accenture Solutions Pvt Ltd., Accenture, Peopl...",4.11


# Saving the Dataframe to an Excel File

In [35]:
output_file = "extracted_details.xlsx"
df3.to_excel(output_file, index = False)