In [37]:
!pip install tika
import os
import re
import spacy
import nltk
from tika import parser
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import numpy as np



In [38]:
# Initialize spaCy and NLTK
nlp = spacy.load("en_core_web_sm")
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:

def extract_text_from_pdf(pdf_path):
    try:
        parsed_pdf = parser.from_file(pdf_path)
        return parsed_pdf['content']
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# extract skills
def extract_skills(text):
    skills = []

    try:
        pattern = r"(?i)(\b(?:skill(?:s)?|experience)\b[\w\s,]+)"
        matches = re.findall(pattern, text)
        for match in matches:
            skills.append(match.strip())
    except Exception as s:
        print(f"Error extracting skill information: {str(s)}")
    return skills

categories = {
    'ACCOUNTANT': ['accountant', 'accounting', 'financial', 'auditing', 'bookkeeping'],
    'ADVOCATE': ['lawyer', 'legal', 'advocate', 'attorney', 'counsel'],
    'AGRICULTURE': ['agriculture', 'farming', 'agronomy', 'horticulture', 'crops'],
    'APPAREL': ['apparel', 'clothing', 'fashion', 'garments', 'textiles'],
    'ARTS': ['arts', 'creative', 'painting', 'sculpture', 'performing arts'],
    'AUTOMOBILE': ['automobile', 'automotive', 'vehicles', 'car', 'mechanic'],
    'AVIATION': ['aviation', 'airlines', 'pilot', 'aircraft', 'aerospace'],
    'BANKING': ['banking', 'finance', 'bank', 'investment', 'financial services'],
    'BPO': ['bpo', 'business process outsourcing', 'call center', 'customer support', 'outsourcing'],
    'BUSINESS-DEVELOPMENT': ['business development', 'sales', 'strategy', 'marketing', 'growth'],
    'CHEF': ['chef', 'cooking', 'culinary', 'restaurant', 'food'],
    'CONSTRUCTION': ['construction', 'building', 'architecture', 'contractor', 'civil engineering'],
    'CONSULTANT': ['consultant', 'advisory', 'expert', 'consulting', 'advisor'],
    'DESIGNER': ['designer', 'graphic design', 'web design', 'creative', 'visual'],
    'DIGITAL-MEDIA': ['digital media', 'media production', 'online', 'content', 'social media'],
    'ENGINEERING': ['engineering', 'engineer', 'mechanical', 'electrical', 'civil'],
    'FINANCE': ['finance', 'financial', 'investment', 'banking', 'portfolio management'],
    'FITNESS': ['fitness', 'health', 'wellness', 'trainer', 'exercise'],
    'HEALTHCARE': ['healthcare', 'medical', 'doctor', 'nurse', 'hospital'],
    'HR': ['hr', 'human resources', 'personnel', 'recruitment', 'employee relations'],
    'INFORMATION-TECHNOLOGY': ['information technology', 'it', 'software', 'technology', 'programming'],
    'PUBLIC-RELATIONS': ['public relations', 'pr', 'communications', 'media', 'branding'],
    'SALES': ['sales', 'selling', 'marketing', 'business development', 'customer relations'],
    'TEACHER': ['teacher', 'teaching', 'education', 'instructor', 'tutor']
}
#categorize resumes
def categorize_resume(text):
    # Iterate through categories and check if any keyword matches the text
    for category, keywords in categories.items():
        for keyword in keywords:
            if keyword.lower() in text.lower():
                return category

    return "Other"

# extract education
def extract_education(text):
    education = []
    try:
        pattern = r"(?i)(\b(?:degree|university|college)\b[\w\s,]+)"
        matches = re.findall(pattern, text)
        for match in matches:
            education.append(match.strip())
    except Exception as e:
        print(f"Error extracting education information: {str(e)}")
    return education

data = []

pdf_directory = '/content/drive/MyDrive/AIML/data/data'

for root, dirs, files in os.walk(pdf_directory):
    for file in files:
        if file.endswith('.pdf'):
            pdf_path = os.path.join(root, file)
            text = extract_text_from_pdf(pdf_path)

            if text is not None and isinstance(text, str):
                skills = extract_skills(text)

                category = categorize_resume(text)

                education = extract_education(text)
            else:
                pass

            data.append({"File": file, "Skills": skills, "Category": category, "Education": education})
result_df=pd.DataFrame(data)
print(result_df)

              File                                             Skills  \
0     12587973.pdf  [Experience\nSubstitute Teacher 01, skills and...   
1     10527994.pdf  [Skills\nOutlook, Excel, Word, PowerPoint, Qui...   
2     12467531.pdf  [skills, knowledge, and experiences as a teach...   
3     11336022.pdf  [skills, work ethic, achievement where we it c...   
4     10504237.pdf  [Experience\n11, Skills\nAnatomy, aseptic tech...   
...            ...                                                ...   
2493  74126637.pdf  [skills for report and assessment writing\nExp...   
2494  54067174.pdf  [experience in\ninbound, outbound and blended ...   
2495  95714702.pdf  [experience in a variety of areas including hu...   
2496  74191424.pdf  [skills with a growing company, skill in the b...   
2497  78538268.pdf  [Skills, Experience\n10, skills for existing a...   

           Category                                          Education  
0          ADVOCATE        [UNIVERSITY City , Stat

In [41]:
result_df.to_csv('extracted_data.csv')

In [42]:
df = pd.read_csv('/content/extracted_data.csv')

In [43]:
from nltk.corpus import stopwords
def preprocess(txt):
    txt = txt.lower() if isinstance(txt, str) else ' '.join(txt).lower()
    txt = re.sub('[^a-zA-Z]', ' ', txt)
    txt = re.sub('http\S+\s*', ' ', txt)
    txt = re.sub('RT|cc', ' ', txt)
    txt = re.sub('#\S+', '', txt)
    txt = re.sub('@\S+', '  ', txt)
    txt = re.sub('\s+', ' ', txt)
    txt = nltk.tokenize.word_tokenize(txt)
    txt = [w for w in txt if not w in nltk.corpus.stopwords.words('english')]

    return ' '.join(txt)

In [44]:
# preprocessing text
df['skills'] = df['Skills'].apply(lambda w: preprocess(w))
df['education'] = df['Education'].apply(lambda w: preprocess(w))


In [45]:
print(df['skills'])
print(df['education'])

0       experience nsubstitute teacher skills appropri...
1       skills noutlook excel word powerpoint quickboo...
2       skills knowledge experiences teacher advance n...
3       skills work ethic achievement asset company ex...
4       experience n skills nanatomy aseptic technique...
                              ...                        
2493    skills report assessment writing nexperienced ...
2494    experience ninbound outbound blended environme...
2495    experience variety areas including human resou...
2496    skills growing company skill best possible way...
2497    skills experience n skills existing potential ...
Name: skills, Length: 2498, dtype: object
0                    university city state nbachelor arts
1                                              university
2       university california degree elementary educat...
3       college city state teachers education nhigh sc...
4       university washington university school educat...
                              

In [46]:
df

Unnamed: 0.1,Unnamed: 0,File,Skills,Category,Education,skills,education
0,0,12587973.pdf,"['Experience\nSubstitute Teacher 01', 'skills ...",ADVOCATE,"['UNIVERSITY City , State\nBachelor of Arts']",experience nsubstitute teacher skills appropri...,university city state nbachelor arts
1,1,10527994.pdf,"['Skills\nOutlook, Excel, Word, PowerPoint, Qu...",ARTS,['University ï¼'],skills noutlook excel word powerpoint quickboo...,university
2,2,12467531.pdf,"['skills, knowledge, and experiences as a teac...",ACCOUNTANT,"['University of California', 'degree Elementar...",skills knowledge experiences teacher advance n...,university california degree elementary educat...
3,3,11336022.pdf,"['skills, work ethic, achievement where we it ...",ARTS,"['College City , State Teachers Education\nHig...",skills work ethic achievement asset company ex...,college city state teachers education nhigh sc...
4,4,10504237.pdf,"['Experience\n11', 'Skills\nAnatomy, aseptic t...",DIGITAL-MEDIA,"['University of Washington ï¼', 'University Sc...",experience n skills nanatomy aseptic technique...,university washington university school educat...
...,...,...,...,...,...,...,...
2493,2493,74126637.pdf,['skills for report and assessment writing\nEx...,ADVOCATE,['University ï¼'],skills report assessment writing nexperienced ...,university
2494,2494,54067174.pdf,"['experience in\ninbound, outbound and blended...",ACCOUNTANT,[],experience ninbound outbound blended environme...,
2495,2495,95714702.pdf,['experience in a variety of areas including h...,ACCOUNTANT,"['University ï¼', 'University of Phoenix ï¼']",experience variety areas including human resou...,university university phoenix
2496,2496,74191424.pdf,"['skills with a growing company', 'skill in th...",ADVOCATE,"['University ï¼', 'University ï¼']",skills growing company skill best possible way...,university university


In [47]:
df.drop(columns=['Skills', 'Education', 'Unnamed: 0'], inplace=True)

In [48]:
df

Unnamed: 0,File,Category,skills,education
0,12587973.pdf,ADVOCATE,experience nsubstitute teacher skills appropri...,university city state nbachelor arts
1,10527994.pdf,ARTS,skills noutlook excel word powerpoint quickboo...,university
2,12467531.pdf,ACCOUNTANT,skills knowledge experiences teacher advance n...,university california degree elementary educat...
3,11336022.pdf,ARTS,skills work ethic achievement asset company ex...,college city state teachers education nhigh sc...
4,10504237.pdf,DIGITAL-MEDIA,experience n skills nanatomy aseptic technique...,university washington university school educat...
...,...,...,...,...
2493,74126637.pdf,ADVOCATE,skills report assessment writing nexperienced ...,university
2494,54067174.pdf,ACCOUNTANT,experience ninbound outbound blended environme...,
2495,95714702.pdf,ACCOUNTANT,experience variety areas including human resou...,university university phoenix
2496,74191424.pdf,ADVOCATE,skills growing company skill best possible way...,university university


In [49]:
df.to_csv('preprocessed.csv')