# Case Study 7

### Import Libraries and Get Data

In [1]:
import pandas as pd
import pdfplumber
import re
import os
import docx
import win32com.client

In [2]:
#set Data Path
path = "C:/Users/Admin/Downloads/Sample2"

In [3]:
#view files present at provided path
os.listdir(path)

['AarushiRohatgi.pdf',
 'AkashGoel.docx',
 'AkashSharma.pdf',
 'AnamRehman.docx',
 'AnanyaDas.pdf',
 'AnkitDadwal.pdf',
 'AnshulTiwari.pdf',
 'BHAWANISINGH.pdf',
 'CAChamanKumar.pdf',
 'CAMonuKumarGupta.pdf',
 'DeeptiDawani.pdf',
 'DineshKumar.pdf',
 'eepeshGuljani.pdf',
 'GauravKhurana.pdf',
 'heemSen.doc',
 'ManrajMeena.doc',
 'MINTUKMUAR.doc',
 'NavinShakti.doc',
 'RamanKumar.doc',
 'RohitBhatt.doc',
 'Satyadev.doc',
 'VijayKumarS.doc']

### Define Necessary Methods

In [4]:
#method to convert .doc file to .docx file
def convert_doc_to_docx(doc_file_path, docx_file_path):
    word = win32com.client.Dispatch("Word.Application")
    doc = word.Documents.Open(doc_file_path)
    doc.SaveAs(docx_file_path, FileFormat=16)
    doc.Close()
    word.Quit()
        
#convert_doc_to_docx(path+'/X.doc',path+'/X')

In [5]:
#method to read contents of .docx file
def read_docx(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    return '\n'.join(fullText)

#print(read_docx(path+'/X.docx'))

In [6]:
#method to read contents of .pdf file
def read_pdf(filename):
    all_text = ' '
    with pdfplumber.open(filename) as pdf:
        for pdf_page in pdf.pages:
            single_page_text = pdf_page.extract_text()
            if single_page_text is not None:
                all_text = all_text + '\n' + '--------------------------------------------' + '\n' + single_page_text
    return all_text

#print(read_pdf(path+'/X.pdf'))

### Extract Data Into Excel File

**Define Regular Expression Pattern for Data Extraction**

In [7]:
#regex pattern for email
pattern1 = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

#regex pattern for contact number
pattern2 = r'(?:(?:\+\d{1,2}\s?)?(?:\d{3}[-\s]?)?\d{2,3}[-\s]?\d{4})'
pattern3 = r'(?:(?:\+\d{1,2}\s)?(?:\d{5}[\s]?)?(?:\d{5}))'

In [8]:
def get_data_from_resume():
    to_be_removed = []
    data = []

    for file in os.listdir(path):
        temp = []

        #get file data as per file extension
        if file.find('.pdf')>0:
            all_text = read_pdf(path+'/'+file)
        elif file.find('.docx')>0:
            all_text = read_docx(path+'/'+file)
        elif file.find('.doc')>0:
            temp_fname = file[:-4]
            convert_doc_to_docx(path+'/'+file, path+'/'+temp_fname)
            to_be_removed.append(path+'/'+temp_fname+'.docx')
            all_text = read_docx(path+'/'+temp_fname+'.docx')

        #extract patterns
        email = re.findall(pattern1,all_text)
        phone_ = re.findall(pattern2,all_text)
        phone = [i for i in phone_ if len(i)>=10]
        if len(phone)==0:
            phone = re.findall(pattern3,all_text)
        else:
            if len(phone[0])<10:
                phone = re.findall(pattern3,all_text)

        #assemble data
        temp.append(file)
        if len(email)>0:
            temp.append(email[0])
        else:
            temp.append('')
        if len(phone)>0:
            temp.append(phone[0])
        else:
            temp.append('')
        temp.append(all_text)
        data.append(temp)

    #remove intermediate generated files
    for i in to_be_removed:
        os.remove(i)
        
    #form dataframe
    df = pd.DataFrame(data, columns=['File Name', 'Email Id', 'Contact Number', 'Other Data'])
    
    return df

In [9]:
data = get_data_from_resume()
data

Unnamed: 0,File Name,Email Id,Contact Number,Other Data
0,AarushiRohatgi.pdf,aarushi.9999218543@gmail.com,999-921-8543,\n-------------------------------------------...
1,AkashGoel.docx,akashg2494@gmail.com,9310631244,"AKASH GOEL\nMobile: 9310631244, E-Mail: akashg..."
2,AkashSharma.pdf,akashsharma1894@gmail.com,80724 5855,\n-------------------------------------------...
3,AnamRehman.docx,anamr894@gmail.com,+91 8586089916,+91 8586089916 \n\nanamr894@gmail.com \n\nAnam...
4,AnanyaDas.pdf,das.ananya016@gmail.com,9643544185,\n-------------------------------------------...
5,AnkitDadwal.pdf,ankitdadwal94@gmail.com,8802955934,\n-------------------------------------------...
6,AnshulTiwari.pdf,anshultiwari734@gmail.com,6396164322,\n-------------------------------------------...
7,BHAWANISINGH.pdf,E-Mailid-bsraorajput@gmail.com,9024599166,\n-------------------------------------------...
8,CAChamanKumar.pdf,kumarchaman83@gmail.com,9953996291,\n-------------------------------------------...
9,CAMonuKumarGupta.pdf,Kumar.monu111992@gmail.com,9716910273,\n-------------------------------------------...


In [10]:
data.to_excel('Resume_data.xlsx')