# 1. Loading and fitting NLP model

#### In this first part, we load a NLP model with word vectors trained on the french internet. A thesaurus which maps compentencies to words describing them the best is then included into the model to correct the word vectors in order to better represent our field which is the labor market, compentencies, jobs and education.

## Thesaurus preprocessing

In [58]:
import pandas as pd

In [59]:
#A thesaurus specific to the labor market and professional skills is uploaded. It will be used to fine-tune the pre-trained Word2Vec model.
thesaurus = pd.read_csv('base_thesaurus_competence.csv', sep=';')[['complabel', 'complem']]

In [60]:
thesaurus['sentence'] = thesaurus['complabel'] + '|' + thesaurus['complem']

In [61]:
thesaurus['sentence'] = thesaurus['sentence'].str.split('|')
thesaurus['sentence'] = thesaurus['sentence'].apply(lambda x: ' '.join(x))

In [62]:
thesaurus_sentence = thesaurus['sentence'].to_list()

### Cleaning functions : caps, punctuations, stopwords

In [63]:
#Function for Lowercase, Remove parenthesis, Remove numbers, Remove punctuation and stopwords

def prep_text(string):
    
    from nltk.corpus import stopwords
    french_stopwords_list = stopwords.words('french')
    
    treated = str(string)
    treated = treated.lower()
    treated = treated.replace(r'\(.*?\)', '')
    treated = treated.replace(r'\[.*?\]', '')
    treated = treated.replace(r'[0-9]+', '')
    treated = treated.replace(r'[,\.:/]+', ' ')
    treated = treated.replace('"', "")
    treated = treated.split(' ')
    treated = [word for word in treated if word not in french_stopwords_list]
    treated = ' '.join(treated)
    
    return treated

In [64]:
thesaurus_sentence = [prep_text(elem) for elem in thesaurus_sentence]
#thesaurus_sentence is a list in which every element is composed of the thesaurus described competences and the words used to describe them.
#Individual words are separated by spaces

## Domain specific knowledge pre-processing

In [65]:
import pandas as pd
training_title = pd.read_excel('training_titles_list.xlsx')
job_title = pd.read_excel('job_title_list.xlsx')

In [67]:
#The diploma file needs to be pre-treated to remove words that are not meaningful => remove master, titre, spe, ...
training_title['intitule_light'] = training_title['formation'].str.replace('MASTERE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('MASTER', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('TITRE PRO', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('TITRE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('LICENCE PRO', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('LICENCE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('GRADE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DUT', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('NIVEAU', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIPLOME', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BAC PRO', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BTSA', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BTS', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BP', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BTM', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BM', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BREVET PROFESSIONNEL', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIP', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('OPTION', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('RNCP', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('MC', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIPLOME DE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIPLOME D', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DEUST', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('TP', '')

In [68]:
job_title['text_training'] = job_title['Branche'] + ' ' + job_title['Métier'] + ' ' + job_title['Description']

In [69]:
training_title_train = training_title['intitule_light'].to_list()
training_title_train = [prep_text(elem) for elem in training_title_train]

job_title_train = job_title['text_training'].to_list()
job_title_train = [prep_text(elem) for elem in job_title_train]

In [70]:
text_train = thesaurus_sentence + training_title_train + job_title_train

## Fine tuning pre trained Word2Vec embeddings

In [71]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [72]:
#Initialize a model based on the thesaurus => The thesaurus is a file linking a list of compentencies to pertinent words describing them
#A model is initiated with the words present in the thesaurus
thesaurus_sentence = [elem.split(' ') for elem in thesaurus_sentence]

model_tuned = Word2Vec(size=200, min_count=1, sg=0, min_alpha=1)
model_tuned.build_vocab(text_train)
total_examples = model_tuned.corpus_count

In [73]:
#Load pre trained vectors => A model with pre-trained word vectors
#French pre trained vectors available here :  http://fauconnier.github.io/#data
model_pretrained = KeyedVectors.load_word2vec_format("pre-trained embeddings/Word2Vec/frWac_non_lem_no_postag_no_phrase_200_cbow_cut0.bin", binary=True, unicode_errors="ignore")

In [74]:
#Combine pre-trained model to the thesaurus model
model_tuned.build_vocab([list(model_pretrained.vocab.keys())], update=True)
model_tuned.intersect_word2vec_format("pre-trained embeddings/Word2Vec/frWac_non_lem_no_postag_no_phrase_200_cbow_cut0.bin", binary=True, lockf=1.0, unicode_errors='ignore')

In [75]:
#Train the model with the thesaurus
#This is done so that word vector's meaning are corrected to better represent the field of compentencies and labor market
model_tuned.train(text_train, total_examples=total_examples, epochs=model_tuned.iter)

  model_tuned.train(text_train, total_examples=total_examples, epochs=model_tuned.iter)


(7551924, 8414220)

In [76]:
#Save model
model_tuned.save('pre-trained embeddings/Word2Vec/fine-tuned-model/word_embedding_thesaurus.model')

# 2. Matching formations/training titles with categories/job titles

#### In this second part, we compute a vector representation for each formations/training titles and categories/job titles by finding the word vectors corresponding to each words, and then averaging them with each words' TF-IDF scores. A similarity score (cosine) is then computed between each formation/training titles and the categories. The categories with the lowest distances are kept.

In [77]:
import pandas as pd
training_title = pd.read_excel('training_title.xlsx')
job_title = pd.read_excel('jov_title.xlsx')

In [78]:
#The diploma file needs to be pre-treated => remove master, titre, spe, ...
#The diploma file needs to be pre-treated => remove master, titre, spe, ...
training_title['intitule_light'] = training_title['formation'].str.replace('MASTERE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('MASTER', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('TITRE PRO', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('TITRE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('LICENCE PRO', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('LICENCE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('GRADE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DUT', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('NIVEAU', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIPLOME', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BAC PRO', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BTSA', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BTS', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BP', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BTM', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BM', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('BREVET PROFESSIONNEL', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIP', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('OPTION', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('RNCP', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('MC', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIPLOME DE', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DIPLOME D', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('DEUST', '')
training_title['intitule_light'] = training_title['intitule_light'].str.replace('TP', '')

In [79]:
job_title['categories_branche_metier'] = job_title['Branche'] + ' ' + job_title['Métier']
job_title['categories_metier_desc'] = job_title['Métier'] + ' ' + job_title['Description']
job_title['categories_branche_metier_desc'] = job_title['Branche'] + ' ' + job_title['Métier'] + ' ' + job_title['Description']

In [80]:
training_title_data = training_title['intitule_light'].to_list()
training_title_data = [prep_text(elem) for elem in training_title_data]

job_title_metier = job_title['categories_branche_metier'].to_list()
categories_branche_metier = [prep_text(elem) for elem in job_title_metier ]

job_title_metier_desc = job_title['categories_metier_desc'].to_list()
job_title_metier_desc = [prep_text(elem) for elem in job_title_metier_desc]

job_title_branche_metier_desc = job_title['categories_branche_metier_desc'].to_list()
job_title_branche_metier_desc = [prep_text(elem) for elem in job_title]

In [81]:
job_title['key_merge_cat'] = job_title['categories_branche_metier_desc'].apply(lambda x: prep_text(x))
training_title['key_merge_dip'] = training_title['intitule_light'].apply(lambda x: prep_text(x))

In [82]:
#Load pretrained model which contains the word vectors
from gensim.models import KeyedVectors
model_pretrained = KeyedVectors.load('pre-trained embeddings/Word2Vec/fine-tuned-model/word_embedding_thesaurus.model')

In [83]:
class classification():
    
    def __init__(self, categories, data, model):
        self.categories = categories
        self.data = data
        self.model = model
        
        #Instance of a tfidf vectorizer fitted on all the text
        from sklearn.feature_extraction.text import TfidfVectorizer
        tfidf_vectorizer = TfidfVectorizer()
        text_total = self.data + self.categories
        tfidf_vectorizer.fit(text_total)
        self.vocabulary = tfidf_vectorizer.vocabulary_

        #TfIdf vectorization of the data to be classified and the categories
        self.data_tfidf = tfidf_vectorizer.transform(self.data)
        self.categories_tfidf = tfidf_vectorizer.transform(self.categories)

        #Vectorization of the sentences : categories
        #Vectorization : find matching pre-trained embeddings, and average them over the sentences with tfidf weights
        import numpy as np
        categories_average_embedding = np.zeros((len(self.categories), self.model.vector_size))
        nb_sentence = 0
        
        for sentence in self.categories:
            tfidf_score_total = 0
            sentence_vector = np.zeros((1, self.model.vector_size))
            
            for word in sentence.split(' '):
                try:
                    score_tfidf = self.categories_tfidf[nb_sentence,self.vocabulary[word]]
                    word_vector = self.model.wv[word].reshape((1, self.model.vector_size)) * score_tfidf
                    sentence_vector += word_vector
                    tfidf_score_total += score_tfidf
                except KeyError:
                    pass
                
            sentence_vector = sentence_vector / tfidf_score_total
            categories_average_embedding[nb_sentence,:] = sentence_vector
            nb_sentence += 1
            
        self.categories_sentence_vector = categories_average_embedding
        
        
        #Vectorization of the sentences : data
        #Vectorization : find matching pre-trained embeddings, and average them over the sentences with tfidf weights
        data_average_embedding = np.zeros((len(self.data), self.model.vector_size))
        nb_sentence = 0
        
        for sentence in self.data:
            tfidf_score_total = 0
            sentence_vector = np.zeros((1, self.model.vector_size))
            
            for word in sentence.split(' '):
                try:
                    score_tfidf = self.data_tfidf[nb_sentence,self.vocabulary[word]]
                    word_vector = self.model.wv[word].reshape((1, self.model.vector_size)) * score_tfidf
                    sentence_vector += word_vector
                    tfidf_score_total += score_tfidf
                except KeyError:
                    pass
                
            sentence_vector = sentence_vector / tfidf_score_total
            data_average_embedding[nb_sentence,:] = sentence_vector
            nb_sentence += 1
            
        self.data_sentence_vector = data_average_embedding
    
    #A function to find the most similar elements between the data and the categories is defined
    #A distance score is computed for each possible pair => the distance can either be cosine or euclidian
    #Arguments : k_nearest_items = find the k elements the most similar to the categories
    #Arguments : standard_deviation = the matched item must have a distance that is at least smaller than the x times the standard deviation of distances within one category
    def matching(self,k_nearest_items, x_standard_deviation, distance_type):
        if distance_type == 'cosine_similarity':
            from scipy import spatial
            df_final = pd.DataFrame(columns=['categories', 'items', 'distance', 'rank'])
            
        #Iteration over each sentence to be classfied, for each sentence iterated in the data all categories are also iterated
        #In this way, all sentence to classify are compared with all categories            
            for row_cat_nb in range(len(self.categories)):
                df_provisoire = pd.DataFrame(columns=['categories', 'items', 'distance'])
                
                for row_data_nb in range(len(self.data)):
                    distance = spatial.distance.cosine(self.categories_sentence_vector[row_cat_nb,:], self.data_sentence_vector[row_data_nb,:])
                    dic_provisoire = {'categories':self.categories[row_cat_nb], 'items':self.data[row_data_nb], 'distance':distance}
                    df_provisoire = df_provisoire.append(dic_provisoire, ignore_index=True)
                
                df_provisoire_min = df_provisoire.iloc[df_provisoire['distance'].argmin()]

                df_provisoire = df_provisoire[df_provisoire['distance'] < (df_provisoire['distance'].mean() - df_provisoire['distance'].std()* x_standard_deviation)]
                df_provisoire = df_provisoire.append(df_provisoire_min)
                df_provisoire = df_provisoire.drop_duplicates()
                
                df_provisoire = df_provisoire.sort_values(by='distance', ascending=True)
                df_provisoire = df_provisoire.reset_index(drop=True)
                df_provisoire['rank'] = df_provisoire.index.values.astype(int) + 1
                df_provisoire = df_provisoire[df_provisoire['rank'] <= k_nearest_items]
                print(df_provisoire)

                df_final = df_final.append(df_provisoire)

            return df_final

In [84]:
#Initiate an instance of the Classification Class
matching_tu = classification(job_title_branche_metier_desc, training_title_data, model_pretrained)

  sentence_vector = sentence_vector / tfidf_score_total


In [None]:
#Run the matching function
resultats_tu = matching_tu.matching(30,2,'cosine_similarity')

In [54]:
#Merge the results df with the original dataframe to recover the original unchanged titles
resultat_tu_label = resultats_tu.merge(training_title, left_on='items', right_on='key_merge_dip')
resultat_tu_label = resultat_tu_label.merge(job_title, left_on='categories', right_on='key_merge_cat')

In [55]:
#Save results df to an excel file
resultat_tu_label.to_excel('classification_result.xlsx', index=False)