In [None]:
# random
import random
# math
import math
# sort
import operator
# clean dataset
import re
#read csv
import pandas as pd
# numpy
import numpy as np

# read csv file
import csv
# encode classes
from sklearn.preprocessing import LabelEncoder
# encode the resumes
from sklearn.feature_extraction.text import TfidfVectorizer

# train and split the dataset
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn import svm

from sklearn.linear_model import LogisticRegression

from sklearn import tree

from sklearn.model_selection import KFold



In [None]:
class Utils:
  """Stateless helpers for loading the resume dataset and summarising its labels.

  Both helpers are static: they are called through the class
  (``Utils.read_csv(...)``) and never touch instance state.
  """

  @staticmethod
  def read_csv(filename):
    """Reads a two-column dataset (category, resume), e.g. resume_dataset.csv.

    The first row is treated as a header and skipped. Rows with fewer than
    two columns (for instance blank lines, which the csv module returns as an
    empty list) are ignored instead of raising an IndexError.

    Args:
      filename: a string, the name of the input file.

    Returns:
      A pair (classes, resumes) of lists of equal length N:
      - classes[i] is the category (text) of the i-th resume.
      - resumes[i] is the text of the i-th resume.
    """

    classes = []
    resumes = []

    # newline='' is what the csv module expects: it lets the reader handle
    # line breaks embedded in quoted fields by itself.
    with open(filename, newline='') as csv_file:
      csv_reader = csv.reader(csv_file, delimiter=',')
      next(csv_reader, None)  # skip the header row (no-op on an empty file)
      for row in csv_reader:
        if len(row) < 2:
          # blank or truncated line: nothing usable in it
          continue
        classes.append(row[0])
        resumes.append(row[1])

    return (classes, resumes)


  @staticmethod
  def count_classes(classes):
    """Counts the number of occurrences of each class.

    Args:
      classes: a list of categories read from a dataset.

    Returns:
      A list of (class, count) pairs sorted by decreasing count. Classes with
      the same count keep their order of first appearance.
    """

    class_dict = {}
    for cl in classes:
      class_dict[cl] = class_dict.get(cl, 0) + 1

    # sorted() is stable, so ties stay in first-seen order even with reverse=True
    return sorted(class_dict.items(), key=operator.itemgetter(1), reverse=True)


In [None]:
# Load the labelled resumes: one category and one resume text per row
classes, resumes = Utils.read_csv('processed_dataset.csv')

In [None]:
# Display the number of resumes per category, most frequent category first
Utils.count_classes(classes)

[('Engineering', 121),
 ('Information Technology', 104),
 ('Education', 102),
 ('Health & Fitness', 77),
 ('Managment', 74),
 ('Accountant', 67),
 ('Finance', 66),
 ('Advocate', 61),
 ('Sales', 61),
 ('Digital Media', 54),
 ('Designing', 51),
 ('Banking', 48),
 ('Business Development', 44),
 ('Arts', 43),
 ('HR', 41),
 ('Building & Construction', 29),
 ('Automobile', 27),
 ('Consultant', 26),
 ('BPO', 25),
 ('Agricultural', 24),
 ('Food & Beverages', 22),
 ('Apparel', 14),
 ('Public Relations', 13),
 ('Aviation', 13),
 ('Architects', 12)]

In [None]:
def cleanText(text_list):
  """Cleans every resume of text_list with a fixed sequence of text substitutions.

  The resumes are expected to be the textual form of a bytes literal
  (``b'...'``), so line breaks and non-ASCII bytes show up as the literal
  character sequences ``\\n`` and ``\\xNN``; both are replaced by a space
  before any other processing. The text is then lower-cased, common
  apostrophe contractions are expanded, and digits, underscores, one- and
  two-letter words, URLs, hashtags, mentions, punctuation and non-ASCII
  characters are removed. Runs of whitespace are collapsed to one space
  (a single leading or trailing space may remain).

  Args:
    text_list: a list of text elements that represents each resume
  Returns:
    list: A cleaned version of text_list (same length, same order)

  """
  cleaned = []
  apos_dict={"'s":" is","n't":" not","'m":" am","'ll":" will", # apostrophes dictionary
           "'d":" would","'ve":" have","'re":" are"}
  # raw string: the backslash is one of the punctuation characters to remove
  punctuation = re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""")

  for text in text_list:
    text = text.replace('\\n', ' ') # literal "\n" sequences (escaped line returns)
    # Literal "\xNN" byte escapes. They are removed here, while the hex digits
    # are still intact, instead of deleting every word that starts with "x"
    # afterwards (which also destroyed real words such as "xml").
    text = re.sub(r'\\x[0-9a-fA-F]{2}', ' ', text)
    text = text.lower() # to lowercase

    for key,value in apos_dict.items(): # expand apostrophe contractions
      text = text.replace(key,value)

    text = re.sub(r'[0-9_]', '', text) # remove digits and underscores
    text = re.sub(r"\b[a-zA-Z]\b", "", text) # remove single-letter words
    # Remove two-letter words. This also covers the standalone "rt" / "cc"
    # markers; they must not be matched inside longer words ("account").
    text = re.sub(r"\b[a-zA-Z][a-zA-Z]\b", "", text)
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    text = re.sub(r'#\S+', '', text)  # remove hashtags
    text = re.sub(r'@\S+', '  ', text)  # remove mentions
    text = re.sub('[%s]' % punctuation, ' ', text)  # remove punctuation
    text = re.sub(r'[^\x00-\x7f]',r' ', text) # remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # collapse extra whitespace
    cleaned.append(text)

  return cleaned


In [None]:
# Clean every resume and show the first one before and after cleaning
print(resumes[0])
clean_resume = cleanText(resumes)
print(clean_resume[0])


b'John H. Smith, P.H.R.\n800-991-5187 | PO Box 1673 | Callahan, FL 32011 | info@greatresumesfast.com\n\nApproachable innovator with a passion for Human Resources.\n\nSENIOR HUMAN RESOURCES PROFESSIONAL\nPersonable, analytical, flexible Senior HR Professional with multifaceted expertise. Seasoned Benefits Administrator with\nextensive experience working with highly paid professionals in client-relationship-based settings. Dynamic team leader\ncapable of analyzing alternatives and identifying tough choices while communicating the total value of benefit and\ncompensation packages to senior level executives and employees.\n\nCORE COMPETENCIES\nBenefits Administration \xe2\x80\x93 Customer Service \xe2\x80\x93 Cost Control \xe2\x80\x93 Recruiting \xe2\x80\x93 Acquisition Management \xe2\x80\x93 Compliance Reporting\nRetention \xe2\x80\x93 Professional Services \xe2\x80\x93 Domestic & International Benefits \xe2\x80\x93 Collaboration \xe2\x80\x93 Adaptability \xe2\x80\x93 Change Management\n

In [None]:

# encode categories into numbers
le = LabelEncoder()
classes = le.fit_transform(classes)

# TF-IDF weights over the 1000 most frequent terms, with sublinear tf scaling
word_vectorizer = TfidfVectorizer(sublinear_tf=True, max_features=1000)

# Vectorize the cleaned resumes: fitting on the raw `resumes` would bypass
# cleanText entirely and feed digits, URLs and escape residue to the models
features = word_vectorizer.fit_transform(clean_resume)

# 60% train / 40% test, fixed seed so the split is reproducible
X_train,X_test,y_train,y_test = train_test_split(features,classes,random_state=0, test_size=0.4)

print('Encodage faite ! bon')


Encodage faite ! bon


In [None]:
def sampled_range(mini, maxi, num):
  """Returns distinct integers spread logarithmically between mini and maxi.

  num points are placed at equal steps in log space between mini and maxi
  (both included), truncated to integers and de-duplicated, so the result
  may contain fewer than num values.

  Args:
    mini: a positive number, the lower bound of the range.
    maxi: a positive number, the upper bound of the range.
    num: an integer, the number of points sampled before de-duplication.

  Returns:
    A sorted list of distinct integers; [] when num is 0 (or negative).

  Raises:
    ValueError: if mini or maxi is not strictly positive (math.log).
  """
  if not num:
      return []
  lmini = math.log(mini)
  lmaxi = math.log(maxi)
  # A single sample has no step: avoid dividing by (num - 1) == 0
  ldelta = (lmaxi - lmini) / (num - 1) if num > 1 else 0.0
  # exp(log(v)) can land a hair below the integer v; the small nudge keeps
  # exact values (notably the upper bound) from being truncated to v - 1
  samples = {int(math.exp(lmini + i * ldelta) + 1e-9) for i in range(num)}
  return sorted(samples)

In [None]:
def find_best_k(train_x, train_y, valid_x, valid_y):
  """Finds the n_neighbors value of KNeighborsClassifier that scores best on one fold.

  Used by the cross-validation: every candidate K produced by
  sampled_range(1, 50, 20) is fitted on the training part and scored on the
  validation part.

  Args:
    train_x: the resumes (features) used for fitting.
    train_y: the classes (categories) of train_x.
    valid_x: the held-out resumes (features) of the fold.
    valid_y: the classes (categories) of valid_x.

  Returns:
    A pair (bestK, bestP):
    - bestK is the candidate K with the highest validation score; on ties the
      smallest K wins. It stays 0 if no candidate scores above 0.
    - bestP is the value of clf.score(valid_x, valid_y) obtained with bestK.
  """

  bestP = 0
  bestK = 0
  for k in sampled_range(1,50,20):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(train_x, train_y)
    # score() performs the prediction itself: evaluate once and reuse the value
    score = clf.score(valid_x, valid_y)

    if score > bestP:
      bestP = score
      bestK = k
  return (bestK, bestP)


In [None]:
def find_best_c(train_x, train_y, valid_x, valid_y):
  """Finds the C value of svm.SVC that scores best on one fold.

  Used by the cross-validation: every candidate C of np.logspace(-3, 3, 10)
  is fitted on the training part and scored on the validation part.

  Args:
    train_x: the resumes (features) used for fitting.
    train_y: the classes (categories) of train_x.
    valid_x: the held-out resumes (features) of the fold.
    valid_y: the classes (categories) of valid_x.

  Returns:
    A pair (bestC, bestP):
    - bestC is the candidate C (a float) with the highest validation score; on
      ties the smallest C wins. It stays 0 if no candidate scores above 0.
    - bestP is the value of clf.score(valid_x, valid_y) obtained with bestC.
  """
  # exponentially growing sequence of candidates
  candidates = np.logspace(-3, 3, 10)

  bestP = 0
  bestC = 0
  for c in candidates:
    clf = svm.SVC(C=c)
    clf.fit(train_x, train_y)
    # score() performs the prediction itself: evaluate once and reuse the value
    score = clf.score(valid_x, valid_y)

    if score > bestP:
      bestP = score
      bestC = c
  return (bestC, bestP)

In [None]:
def cross_validation(train_x, train_y, find_best):
  """Runs find_best on every KFold split and returns the best-scoring parameter.

  For each fold, find_best returns the parameter it prefers together with its
  score. Scores are averaged per distinct parameter, over the folds in which
  that parameter was returned, and the parameter with the highest average is
  selected (first one encountered on ties).

  Args:
    train_x: the resumes (features); must support indexing by an index array.
    train_y: the classes (categories); must support indexing by an index array.
    find_best: a function taking four arguments (train_x, train_y, valid_x, valid_y)
      and returning a pair (parameter, score).

  Returns:
    A pair (BV, BP):
    - BV is the selected parameter.
    - BP is the average score of BV over the folds that returned it.

  """
  # parameter -> [running sum of scores, number of folds that returned it]
  tally = {}
  for fit_idx, held_idx in KFold().split(train_x):
    param, score = find_best(train_x[fit_idx], train_y[fit_idx],
                             train_x[held_idx], train_y[held_idx])
    if param in tally:
      tally[param][0] += score
      tally[param][1] += 1
    else:
      tally[param] = [score, 1]

  averages = {param: total / folds for param, (total, folds) in tally.items()}

  winner = max(averages, key=averages.get)
  return (winner, averages[winner])



In [None]:
# Search the n_neighbors value for KNeighborsClassifier by cross-validation
# on the training split; the returned (parameter, score) pair is the cell output
print('Best k')
cross_validation(X_train, y_train, find_best_k)

Best k


(21, 0.5821917808219178)

In [None]:
# Search the C value for svm.SVC by cross-validation on the training split;
# the returned (parameter, score) pair is the cell output
print('Best c')
cross_validation(X_train, y_train, find_best_c)

Best c


(10.0, 0.589041095890411)

In [None]:
# k-nearest neighbours with n_neighbors=21: fit on the training split, then
# report the score on both the training and the test split
classifier = KNeighborsClassifier(n_neighbors=21)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
train_score = classifier.score(X_train, y_train)
print('Précision d\'entrainement de l\'algorithme KNeighbors : {:.2f}'.format(train_score))
test_score = classifier.score(X_test, y_test)
print('Précision de test de l\'algorithme KNeighbors  : {:.2f}'.format(test_score))


Précision d'entrainement de l'algorithme KNeighbors : 0.54
Précision de test de l'algorithme KNeighbors  : 0.43


In [None]:
# SVM with C=10: fit on the training split, then report the score on both
# the training and the test split
classifier = svm.SVC(C=10)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
train_score = classifier.score(X_train, y_train)
print('Précision d\'entrainement de l\'algorithm SVM  : {:.2f}'.format(train_score))
test_score = classifier.score(X_test, y_test)
print('Précision de test de SVM : {:.2f}'.format(test_score))



Précision d'entrainement de l'algorithm SVM  : 0.98
Précision de test de SVM : 0.59


In [None]:
# Logistic regression with default settings: fit on the training split, then
# report the score on both the training and the test split
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
train_score = classifier.score(X_train, y_train)
print('Précision d\'entrainement de l\'algorithm regression logistique    : {:.2f}'.format(train_score))
test_score = classifier.score(X_test, y_test)
print('Précision de test de l\'algorithm regression logistique  : {:.2f}'.format(test_score))



Précision d'entrainement de l'algorithm regression logistique    : 0.79
Précision de test de l'algorithm regression logistique  : 0.56


In [None]:
# Decision tree with default settings: fit on the training split, then report
# the score on both the training and the test split
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
train_score = classifier.score(X_train, y_train)
print('Précision d\'entrainement de l\'algorithm arbre de decision: {:.2f}'.format(train_score))
test_score = classifier.score(X_test, y_test)
print('Précision de test de l\'algorithm arbre de decision : {:.2f}'.format(test_score))



Précision d'entrainement de l'algorithm arbre de decision: 0.98
Précision de test de l'algorithm arbre de decision : 0.44
