# **Importing Necessary Libraries**

In [1]:
!pip install docx2txt
!pip install tika
import os
import re
import nltk
import docx2txt
from tika import parser
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3960 sha256=bda6ae3436daa1fbbfe40263e62ae26c4b5347f4f51891f7e89b7b7804a2d9f9
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8
Collecting tika
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Created wheel for tika: filename=tika-2.6.0-py3-none-any.whl size=32621 sha256=3f8beb9d6b0ab66b6aacf127407b9116b7c99198ec4f475a5e60e366d7258e73
  Stored in directory: /root/.cache/pip/wheels/5f/71/

In [2]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Parsing through the content for text extraction**

In [6]:
#function for extracting text from pdf
def extract_text_from_pdf(pdf_path):
    try:
        parsed_pdf = parser.from_file(pdf_path)
        return parsed_pdf['content']
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

#function for extracting text from documents
def extract_text_from_doc(doc_path):
    try:
        text = docx2txt.process(doc_path)
        return text
    except Exception as e:
        print(f"Error extracting text from {doc_path}: {str(e)}")
        return ""
#initializing a list for storing the parsed content
data = []
categories = []
#looping through the directory for extracting the details
directory = '/content/drive/MyDrive/Resume_Parser/data1'

for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        category = os.path.basename(root)
        if file.endswith('.pdf'):
            txt = extract_text_from_pdf(file_path) #function call
            if txt:
              data.append((category, file, txt))
            else:
              print("no text")
        elif file.endswith('.docx'):
            txt1 = extract_text_from_doc(file_path) #function call
            if txt1:
              data.append((category, file, txt1))
            else:
              print("no text")

# Printing the extracted text
for category, file_name, text_content in data:
    print(f"Category: {category}")
    print(f"File: {file_name}")
    print(text_content)
    print("\n=====================\n")

Category: apparel
File: 10182582.docx
KEY HOLDER Summary 



Highly organized efficient in multitasking environments; able to prioritize effectively to accomplish objectives with creativity, enthusiasm and humor. Resourceful and flexible, able to adapt to changing priorities and maintain a positive attitude with strong work ethic. Highlights 



Microsoft Outlook, Word and Excel Skilled trainer MS Office expert Customer-focused Strong interpersonal skills Effective workflow management Accomplished manager Goal-oriented Positive and upbeat 



Experience 



Key Holder 08/2012 to Current Company Name City , State 



Trained all incoming sales team members. 



Promptly resolved all customer requests, questions and complaints. 



Maintained knowledge of current sales and promotions, policies regarding payment and exchanges and security practices. Assumed ownership over team productivity and managed work flow to meet or exceed quality service goals. Trained staff on operating procedures

# **Text preprocessing using NLP regular expressions**

In [7]:
def preprocess(txt):
    txt = txt.lower() if isinstance(txt, str) else ' '.join(txt).lower() #make lower
    txt = re.sub(r'\\n', ' ', txt) # remove /n
    txt = re.sub('(http\S+|www.\S+)', ' ', txt)  # Remove URLs
    txt = re.sub('#\S+', '', txt) # remove #
    txt = re.sub('@\S+', ' ', txt) # remove @
    txt = re.sub(r'[\/,.:❖•;]', ' ', txt)  # Remove the specified special characters
    txt = re.sub('\s+', ' ', txt) # replace multiple whitespace characters (such as spaces, tabs, newlines, etc.)
    txt = re.sub(r'\uf0d8', '', txt) # remove \uf0d8
    txt = nltk.tokenize.word_tokenize(txt) # tokenize the text
    txt = [w for w in txt if not w in nltk.corpus.stopwords.words('english')]

    return ' '.join(txt)

In [8]:
# print the preprocesed text
for category, file_name, text_content in data:
    print(f"category: {category}")
    print(f"Processing file: {file_name}")
    preprocessed_text = preprocess(text_content) #function call
    print(f"Preprocessed text: {preprocessed_text}")
    print("\n=====================\n")

category: apparel
Processing file: 10182582.docx
Preprocessed text: key holder summary highly organized efficient multitasking environments able prioritize effectively accomplish objectives creativity enthusiasm humor resourceful flexible able adapt changing priorities maintain positive attitude strong work ethic highlights microsoft outlook word excel skilled trainer ms office expert customer-focused strong interpersonal skills effective workflow management accomplished manager goal-oriented positive upbeat experience key holder 08 2012 current company name city state trained incoming sales team members promptly resolved customer requests questions complaints maintained knowledge current sales promotions policies regarding payment exchanges security practices assumed ownership team productivity managed work flow meet exceed quality service goals trained staff operating procedures company services prepared opening closing store collections specialist 02 2010 04 2011 company name city s

# **Extracting contact details**

In [9]:
# functions to extract mob number
def extract_phone_numbers(text):
    phone_pattern = r"\b(?:\d[ -]?){9,15}\b"
    return re.findall(phone_pattern, text)

#function to extract email id
def extract_email_addresses(text):
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    match = email_pattern.search(text)
    return match.group() if match else None

# print contact details from each resume using the fuction calls
for category, file_name, text_content in data:
    print(f"category: {category}")
    print(f"Processing file: {file_name}")
    print("Contact Details:")
    phone_number = extract_phone_numbers(text_content) #function call
    email_address = extract_email_addresses(text_content) #function call
    print(phone_number)
    print(email_address)
    print("\n=====================\n")


category: apparel
Processing file: 10182582.docx
Contact Details:
[]
None


category: apparel
Processing file: 10562768.docx
Contact Details:
[]
None


category: apparel
Processing file: 10738095.docx
Contact Details:
[]
None


category: apparel
Processing file: 10876132.docx
Contact Details:
[]
None


category: apparel
Processing file: 11232471.docx
Contact Details:
[]
None


category: accountant
Processing file: 10554236.pdf
Contact Details:
[]
None


category: accountant
Processing file: 11759079.pdf
Contact Details:
[]
None


category: accountant
Processing file: 11163645.pdf
Contact Details:
['2 864-472-7092']
None


category: accountant
Processing file: 10674770.pdf
Contact Details:
[]
None


category: accountant
Processing file: 12065211.pdf
Contact Details:
[]
None


category: hr
Processing file: 10399912.pdf
Contact Details:
[]
None


category: hr
Processing file: 11698189.pdf
Contact Details:
[]
None


category: hr
Processing file: 11480899.pdf
Contact Details:
[]
None


cate

# **Extract skills**

In [10]:
#function for extracting skills
def extract_skills(text):
    skills_keywords = ['skills', 'technical skills', 'personal skills']
    end_keywords = ['contact', 'profile', 'interest', 'internship', 'education', 'work history', 'work', 'experience', 'summary', 'achievement', 'achievements', 'certifications', 'projects', 'accomplishments', 'accomplishment', 'work experience', 'qualification']  # Keywords to stop extraction

    skill_sections = []
    used_keywords = set()  # Store used keywords to avoid duplicates
    last_end_index = 0  # Store the last end index to begin searching from there

# finds the keywords and extract the content in that
    for keyword in skills_keywords:
        if keyword not in used_keywords:
            start_index = text.find(keyword, last_end_index)
            if start_index != -1:
                end_index = len(text)
                for end_key in end_keywords:
                    end_index_candidate = text.find(end_key, start_index + len(keyword))
                    if end_index_candidate != -1 and end_index_candidate < end_index:
                        end_index = end_index_candidate
                extracted_text = text[start_index + len(keyword):end_index].strip()
                if extracted_text:
                    skill_sections.append(extracted_text)
                used_keywords.add(keyword)
                last_end_index = end_index

    return skill_sections


# **Extract Education**

In [11]:
#function for extracting education
def extract_education(text):
    education_keywords = ['education', 'degree', 'university', 'college', 'qualification']
    end_keywords = ['contact', 'profile', 'interest', 'internship', 'languages', 'skills', 'work history', 'experience', 'summary', 'achievement', 'achievements', 'certifications', 'projects', 'accomplishments', 'accomplishment', 'work experience']  # Keywords to stop extraction

    education_sections = []
    used_keywords = set()
    last_end_index = 0

# finds the keywords and extract the content in that
    for keyword in education_keywords:
        if keyword not in used_keywords:
            start_index = text.find(keyword, last_end_index)
            if start_index != -1:
                end_index = len(text)
                for end_key in end_keywords:
                    end_index_candidate = text.find(end_key, start_index + len(keyword))
                    if end_index_candidate != -1 and end_index_candidate < end_index:
                        end_index = end_index_candidate
                extracted_text = text[start_index + len(keyword):end_index].strip()
                if extracted_text:
                    education_sections.append(extracted_text)
                used_keywords.add(keyword)
                last_end_index = end_index

    return education_sections


# **Extract Work Experience**

In [12]:
#function for extracting work experience
def extract_work_experience(text):
    experience_keywords = ['internship', 'professional experience', 'experience', 'work experience', 'employment', 'work history']
    end_keywords = ['contact', 'profile', 'interest', 'skills', 'languages', 'technical skills', 'education', 'summary', 'achievement', 'certifications', 'projects', 'accomplishments', 'accomplishment', 'qualification']  # Keywords to stop extraction

    experience_sections = []
    used_keywords = set()
    last_end_index = 0

#finds the keywords and extract content in that
    for keyword in experience_keywords:
        if keyword not in used_keywords:
            start_index = text.find(keyword, last_end_index)
            if start_index != -1:
                end_index = len(text)
                for end_key in end_keywords:
                    end_index_candidate = text.find(end_key, start_index + len(keyword))
                    if end_index_candidate != -1 and end_index_candidate < end_index:
                        end_index = end_index_candidate

                extracted_text = text[start_index + len(keyword):end_index].strip()  # Exclude the keyword itself
                if extracted_text:  # Check if the extracted text is not empty
                    experience_sections.append(extracted_text)
                used_keywords.add(keyword)
                last_end_index = end_index

    return experience_sections


In [13]:
#loop through the preprocessed text and extract the details by calling the fuctions
for category, file_name, text_content in data:
    print(f"Processing file: {file_name}")

    # Preprocess text_content
    preprocessed_text = preprocess(text_content)

    # Extract skills
    extracted_skills = extract_skills(preprocessed_text) #function call
    print("Extracted Skills:")
    print(extracted_skills)

    # Extract education
    extracted_education = extract_education(preprocessed_text) #function call
    print("Extracted Education:")
    print(extracted_education)

    # Extract work experience
    extracted_experience = extract_work_experience(preprocessed_text) #function call
    print("Extracted Work Experience:")
    print(extracted_experience)

    print("\n=====================\n")


Processing file: 10182582.docx
Extracted Skills:
['effective', 'customer service debit fashion inventory excel ms office microsoft outlook word policies quality real estate sales trainer']
Extracted Education:
['certificate current temple real estate school marketing 2008 virginia union university city state marketing']
Extracted Work Experience:
['key holder 08 2012 current company name city state trained incoming sales team members promptly resolved customer requests questions complaints maintained knowledge current sales promotions policies regarding payment exchanges security practices assumed ownership team productivity managed work flow meet exceed quality service goals trained staff operating procedures company services prepared opening closing store collections specialist 02 2010 04 2011 company name city state effectively managed high-volume inbound outbound customer calls evaluated initiated alternatives resolving account balances responded customer inquires regarding account

In [14]:
columns = ['Category', 'File Name', 'Text Content', 'Preprocessed Text', 'Phone Number', 'Email Address', 'Skills', 'Education', 'Work Experience']

data_dict = {
    'Category': [resume[0] for resume in data],
    'File Name': [resume[1] for resume in data],
    'Text Content': [resume[2] for resume in data],
    'Preprocessed Text': [preprocess(resume[2]) for resume in data],
    'Phone Number': [' '.join(extract_phone_numbers(resume[2])) if extract_phone_numbers(resume[2]) else '' for resume in data],
    'Email Address': [extract_email_addresses(resume[2]) for resume in data],
    'Skills': [' '.join(extract_skills(preprocess(resume[2]))) if extract_skills(preprocess(resume[2])) else '' for resume in data],
    'Education': [' '.join(extract_education(preprocess(resume[2]))) if extract_education(preprocess(resume[2])) else '' for resume in data],
    'Work Experience': [' '.join(extract_work_experience(preprocess(resume[2]))) if extract_work_experience(preprocess(resume[2])) else '' for resume in data]
}

df = pd.DataFrame(data_dict, columns=columns)

# Save DataFrame as CSV
csv_filename = '/content/drive/MyDrive/Resume_Parser/extracted_data_1.csv'

print(f"Extracted data saved to {csv_filename}")



Extracted data saved to /content/drive/MyDrive/Resume_Parser/extracted_data_1.csv
