In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv('./resume_dataset_1.csv')
df.head()

Unnamed: 0,ID,Category,Resume
0,1,HR,"b'John H. Smith, P.H.R.\n800-991-5187 | PO Box..."
1,2,HR,b'Name Surname\nAddress\nMobile No/Email\nPERS...
2,3,HR,b'Anthony Brown\nHR Assistant\nAREAS OF EXPERT...
3,4,HR,b'www.downloadmela.com\nSatheesh\nEMAIL ID:\nC...
4,5,HR,"b""HUMAN RESOURCES DIRECTOR\n\xef\x82\xb7Expert..."


In [4]:
#Remove pucntuation in Resume
import string
new_resume_list = []
for resume in df['Resume']:
    new_r = resume.translate(str.maketrans('', '', string.punctuation))
    new_resume_list.append(new_r)

new_resume = np.asarray(new_resume_list)
df['Normalized_Resume'] = new_resume


# Extract text

In [5]:
import PyPDF2
import textract

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def pdf2txt(file_path: str) -> str:
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        texts = ''
        np = pdf_reader.getNumPages()
        for i in range(np):
            texts += pdf_reader.getPage(i).extractText()
    return texts

# Model Selection

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    SVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

In [7]:
#Calculate Term Frequency, Inverse Document Frequency
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf = True, min_df =7, norm = 'l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.Normalized_Resume).toarray()
labels = df.ID
features.shape

(1219, 9626)

In [8]:
from sklearn.feature_selection import chi2

N = 2
category_id_df = df[['Category','ID']].drop_duplicates().sort_values('ID')
category_to_id = dict(category_id_df.values)
for Category, ID in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == ID)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'Accountant':
  . Most correlated unigrams:
. equal
. faced
  . Most correlated bigrams:
. account executive
. various clients
# 'Advocate':
  . Most correlated unigrams:
. protect
. doctoral
  . Most correlated bigrams:
. law university
. public law
# 'Agricultural':
  . Most correlated unigrams:
. animal
. feed
  . Most correlated bigrams:
. xe2x80x93 2005
. experience job
# 'Apparel':
  . Most correlated unigrams:
. fabric
. fame
  . Most correlated bigrams:
. promotional materials
. institute art
# 'Architects':
  . Most correlated unigrams:
. contains
. schematic
  . Most correlated bigrams:
. year experience
. mechanical electrical
# 'Arts':
  . Most correlated unigrams:
. paint
. blend
  . Most correlated bigrams:
. fulltime position
. commonly used
# 'Automobile':
  . Most correlated unigrams:
. damaged
. repaired
  . Most correlated bigrams:
. institute technology
. xe2x80x93 2004
# 'Aviation':
  . Most correlated unigrams:
. documentary
. depaul
  . Most correlated bigrams:

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


X_train, X_test, y_train, y_test = train_test_split(df['Resume'], df['Category'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [10]:
clf.predict(count_vect.transform(X_test))

array(['Engineering', 'Engineering', 'Education', 'Engineering',
       'Engineering', 'Engineering', 'Engineering', 'Engineering',
       'Engineering', 'Engineering', 'Engineering', 'Education',
       'Engineering', 'Engineering', 'Engineering', 'Engineering',
       'Engineering', 'Engineering', 'Engineering', 'Engineering',
       'Engineering', 'Engineering', 'Engineering', 'Education',
       'Engineering', 'Engineering', 'Engineering', 'Engineering',
       'Engineering', 'Engineering', 'Engineering', 'Engineering',
       'Engineering', 'Engineering', 'Engineering', 'Engineering',
       'Education', 'Engineering', 'Engineering', 'Engineering',
       'Education', 'Engineering', 'Engineering', 'Engineering',
       'Education', 'Engineering', 'Engineering', 'Engineering',
       'Engineering', 'Engineering', 'Education', 'Engineering',
       'Education', 'Engineering', 'Engineering', 'Engineering',
       'Engineering', 'Engineering', 'Engineering', 'Engineering',
       'Eng

In [None]:
y_test