In [1]:
import os
import textract
import pandas as pd

# Data Gathering and Preprocessing

In [2]:
# List every entry (files and folders) in the current working directory
os.listdir()

['Requirement document- Document Classification (1).docx (1).pdf',
 'main.ipynb',
 'Gathering data.ipynb',
 'Resumes']

In [2]:
# Show the path of the current working directory (displayed as the cell output)
os.getcwd()

'/home/k/Desktop/Project/Resume Classification System'

In [4]:
# List every entry in the Resumes folder under the current working directory
os.listdir(f"{os.getcwd()}/Resumes")

['React Developer_Haripriya.docx',
 'React Developer_Naveen sadhu.docx',
 'Reactjs Developer_MD Khizaruddin Rauf _Musquare Technologies.docx',
 'Internship_Susovan Bag_Musquare Technologies.docx',
 'Internship_Ravali_Musquare Technologies (1).docx',
 'React Developer_Pragnya.docx',
 'Reactjs Developer_M Lokesh.docx',
 'Reactjs Developer_Prabakaran_Musquare Technologies.pdf',
 'Reactjs Developer_Pranish Sonone_Musquare Technologies.docx',
 'React Developer_Sarala Madasu-converted.docx',
 'React Developer_Kamalakar Reddy.docx',
 'React JS Developer_Venkatalakshmi (1)-converted.docx',
 'React JS Developer_AnjaniPriyadarshini.doc',
 'React Developer_Vinay Reddy.doc',
 'Reactjs Developer_Ranga Gaganam_Musquare Technologies.docx',
 'Reactjs Developer_M Lokesh Babu_Musquare Technologies.docx',
 'React Developer_Thirupathiamma.docx',
 'React JS Developer_KotaniDurgaprasad[3_1] (1)-converted.docx',
 'React Developer_PavasGoswami.doc',
 'Reactjs Developer_Shaik Abdul Sharuk_Musquare Technologies

It seems that these .doc/.docx/.pdf files are resumes of Reactjs developers.

In [6]:
# Keep only the regular files that sit directly inside Resumes/ (sub-folders
# are skipped) and leave out every name containing 'Intern', so that only the
# React developer resumes remain.
# os.path.isfile is used instead of looking for a '.' in the name: that
# heuristic misclassifies a folder whose name contains a dot and drops a file
# that has no extension.
resumes_root = f"{os.getcwd()}/Resumes"
resume_files = [
    name
    for name in os.listdir(resumes_root)
    if os.path.isfile(f"{resumes_root}/{name}") and 'Intern' not in name
]

In [7]:
# Display the filtered list of resume file names
resume_files

['React Developer_Haripriya.docx',
 'React Developer_Naveen sadhu.docx',
 'Reactjs Developer_MD Khizaruddin Rauf _Musquare Technologies.docx',
 'React Developer_Pragnya.docx',
 'Reactjs Developer_M Lokesh.docx',
 'Reactjs Developer_Prabakaran_Musquare Technologies.pdf',
 'Reactjs Developer_Pranish Sonone_Musquare Technologies.docx',
 'React Developer_Sarala Madasu-converted.docx',
 'React Developer_Kamalakar Reddy.docx',
 'React JS Developer_Venkatalakshmi (1)-converted.docx',
 'React JS Developer_AnjaniPriyadarshini.doc',
 'React Developer_Vinay Reddy.doc',
 'Reactjs Developer_Ranga Gaganam_Musquare Technologies.docx',
 'Reactjs Developer_M Lokesh Babu_Musquare Technologies.docx',
 'React Developer_Thirupathiamma.docx',
 'React JS Developer_KotaniDurgaprasad[3_1] (1)-converted.docx',
 'React Developer_PavasGoswami.doc',
 'Reactjs Developer_Shaik Abdul Sharuk_Musquare Technologies.docx',
 'React Dev_Krishna Kanth.docx',
 'React Developer_Deepakreddy.docx',
 'React Dev_Krishna Kanth_Mus

In [3]:
# Source folder that currently holds the resumes, and the 'Reactjs'
# sub-folder inside it that the selected resumes will be moved into.
src_dir = f"{os.getcwd()}/Resumes"
dst_dir = f"{src_dir}/Reactjs"

In [9]:
# Create the destination folder. exist_ok=True keeps this cell safe to re-run:
# without it os.makedirs raises FileExistsError once the folder already exists.
os.makedirs(dst_dir, exist_ok=True)

In [10]:
# Move each selected resume from the source folder into the destination folder.
for file_name in resume_files:
    source_path = f'{src_dir}/{file_name}'
    target_path = f'{dst_dir}/{file_name}'
    os.rename(source_path, target_path)    # os.replace may work as well

All files were successfully moved to the destination (Reactjs) folder.<br>
Let's see how we can read doc/pdf files in Python.

In [11]:
# List the entries of the source folder after the move
os.listdir(src_dir)

['Internship_Susovan Bag_Musquare Technologies.docx',
 'Internship_Ravali_Musquare Technologies (1).docx',
 'SQL Developer Lightning insight',
 'Peoplesoft resumes',
 'workday resumes',
 'Reactjs']

These are the folders we will work with. <br>
Let's build a DataFrame that stores the text of every resume, labelled according to the name of the folder it came from.
We also store the file type of each resume.

In [12]:
# Start from an empty table with one column for the resume text, one for its
# category label and one for the document type.
resumes_df = pd.DataFrame(
    columns=[
        "Texts",
        "Category",
        "Doc Type",
    ]
)

In [13]:
# Build the labelled resume table: one row per resume file, labelled with the
# name of the folder the file was found in.
resume_columns = ['Texts', 'Category', 'Doc Type']
resume_rows = []

for folder in os.listdir(src_dir):                                  # Iterating through each entry of the resume folder
    folder_path = f"{src_dir}/{folder}"

    # Only category folders are read. os.path.isdir replaces the "'.' in name"
    # heuristic, which misclassifies a folder whose name contains a dot and a
    # file that has no extension. The validation folder is skipped here
    # because the validation set is built separately.
    if not os.path.isdir(folder_path) or 'Validation' in folder:
        continue

    for resume in os.listdir(folder_path):                          # now each resume in that folder
        resume_path = f"{folder_path}/{resume}"

        # Skip nested folders: only files are passed to textract.
        if not os.path.isfile(resume_path):
            continue

        # textract.process returns the extracted text as bytes; decode it to str.
        txts = textract.process(resume_path).decode('UTF-8')

        resume_rows.append([
            txts,                                                   # storing all texts
            folder,                                                 # the folder name, which will be our category
            os.path.splitext(resume)[1].lstrip('.'),                # document type: the extension without its dot ('' if none)
        ])

# Building the frame once from the collected rows (instead of appending to an
# existing frame row by row) means re-running this cell cannot duplicate rows.
resumes_df = pd.DataFrame(resume_rows, columns=resume_columns)

In [14]:
# Preview the first rows of the extracted resume table
resumes_df.head()

Unnamed: 0,Texts,Category,Doc Type
0,ANIL KUMAR MADDUKURI \t\t\n\nSQL & MSBI Devel...,SQL Developer Lightning insight,docx
1,RAJU PAVANA KUMARI\n\n\n\n\t\n\n\tProfessional...,SQL Developer Lightning insight,docx
2,SQL AND MSBI DEVELOPER\n\nSQL AND MSBI DEVELOP...,SQL Developer Lightning insight,docx
3,\n\nCareer objective\n\n\nA rewarding opportun...,SQL Developer Lightning insight,doc
4,\n SQL S...,SQL Developer Lightning insight,doc


In [None]:
# Write the resume table to a CSV file, leaving the row index out.
resumes_df.to_csv("resumes_df.csv", index=False)

In [23]:
# We have also downloaded some resumes from the web to validate our model.
# Display the resumes source folder.
src_dir

'/home/k/Desktop/Project/Resume Classification System/Resumes'

In [30]:
# Empty validation table: resume text plus its category label.
validation_df = pd.DataFrame(columns=["Texts", "Category"])

In [31]:
# Display the validation table
validation_df

Unnamed: 0,Texts,Category


In [32]:
# Folder holding the validation resumes, located inside the resumes source folder.
val_dir = f"{src_dir}/Validation Resume"

In [33]:
# List the entries of the validation folder
os.listdir(val_dir)

['React Developer Resume.docx',
 '5. React Front End Developer.pdf',
 'react-front-end-developer2 - Template 14.pdf',
 'sql-server-developer - Template 16.pdf',
 'junior-sql-developer2  - Template 14.pdf',
 '2. Junior SQL Developer.pdf']

In [34]:
# Build the validation table: extract the text of every validation resume and
# derive its label from the file name.
validation_rows = []

for resume in os.listdir(val_dir):
    # textract.process returns the extracted text as bytes; decode it to str.
    txts = textract.process(f"{val_dir}/{resume}").decode('UTF-8')

    # Case-insensitive match on the file name, so 'React', 'react' and 'REACT'
    # are all recognised (the former 'eact' substring test missed upper-case names).
    # NOTE(review): every file without 'react' in its name is labelled 'SQL' -
    # confirm the folder only ever holds these two kinds of resume.
    # NOTE(review): the training categories are folder names (e.g.
    # 'SQL Developer Lightning insight'); confirm 'SQL' is mapped to the
    # matching training label before the two tables are compared.
    category = 'Reactjs' if 'react' in resume.lower() else 'SQL'

    validation_rows.append([txts, category])

# Building the frame once from the collected rows means re-running this cell
# cannot append the same resumes a second time.
validation_df = pd.DataFrame(validation_rows, columns=['Texts', 'Category'])

In [35]:
# Display the labelled validation table
validation_df

Unnamed: 0,Texts,Category
0,Jake Wilson\n\n(555) 555-5555 | jake@email.com...,Reactjs
1,"FIRST LAST\nBay Area, California • +1-234-456-...",Reactjs
2,First Last\nReact Front End Developer\nAugusta...,Reactjs
3,"First Last\nSQL Server Developer\nBay Area, Ca...",SQL
4,"First Last\nJunior SQL Developer\nBurlington, ...",SQL
5,"FIRST LAST\nBay Area, California • +1-234-456-...",SQL


In [36]:
# Write the validation table to a CSV file, leaving the row index out.
validation_df.to_csv("validation_df.csv", index=False)