In [1]:
#import modules
import datetime
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy.sparse
import unidecode
from gensim import corpora
from gensim import matutils, models
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
warnings.simplefilter(action='ignore')
%matplotlib inline

stemmer = FrenchStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')
stop_fr = nltk.corpus.stopwords.words('french')
stop_uk = nltk.corpus.stopwords.words('english')
stop_spacy_fr = list(fr_stop)

In [2]:
# Load the full dataset from a CSV file located next to the notebook.
df = pd.read_csv('DATA_FULL.csv')

In [3]:
# Ratings are stored as text; swap ',' for '.' and cast the column to float.
df['Rating'] = df['Rating'].str.replace(',','.').astype(float)

In [4]:
def parse_text(text):
    """Turn a raw text into a list of cleaned, stemmed, ASCII-only tokens.

    Steps, in order: lowercase and split on ``\\w+``; drop French, English and
    spaCy-French stop words plus the literal token 'euse'; stem with the French
    stemmer; strip digits; transliterate to ASCII; finally drop every token
    that is a substring of a (lowercased) value of ``df.Location``.

    Reads the module-level ``tokenizer``, ``stemmer``, ``stop_fr``, ``stop_uk``,
    ``stop_spacy_fr`` and ``df``.

    Input  : text (str); any non-string value (e.g. NaN) yields an empty list
    Output : list of str tokens
    """
    if not isinstance(text, str):
        # Missing values cannot be tokenised; treat them as empty documents.
        return []

    # A single set gives O(1) membership tests instead of three list scans per token.
    excluded = set(stop_fr) | set(stop_uk) | set(stop_spacy_fr) | {'euse'}
    kept = [word for word in tokenizer.tokenize(text.lower()) if word not in excluded]

    # Stem first, then strip digits, then transliterate.
    cleaned = [
        unidecode.unidecode(re.sub(r"\d+", "", stemmer.stem(word)))
        for word in kept
    ]

    # Lowercased unique locations; NaN entries are skipped so .lower() cannot fail.
    locations = [str(place).lower() for place in df.Location.dropna().unique()]

    # Substring test: tokens emptied by the digit stripping are removed here too,
    # because '' is contained in every location string.
    # NOTE(review): this also drops short stems that merely occur inside a
    # location name - confirm that is intended.
    return [word for word in cleaned
            if not any(word in place for place in locations)]

def concatExtractedWords(title,summary):
    """Return the parsed tokens of `title` followed by the parsed tokens of `summary`."""
    return [*parse_text(title), *parse_text(summary)]

In [5]:
# One token list per row, built from the title followed by the description.
df['BagOfWords'] = pd.Series(
    [concatExtractedWords(title, description)
     for title, description in zip(df['Title'], df['Description'])],
    index=df.index)

In [6]:
# Flatten each token list into a single comma-separated string.
df['BagOfWords'] = df['BagOfWords'].map(
    lambda words: ','.join(str(word) for word in words))

In [7]:
def prepare_corpus(doc):
    """
    Input  : iterable of text documents (tokenised internally with parse_text)
    Purpose: create term dictionary of corpus and convert it into Document Term Matrix
    Output : term dictionary (column id -> term) and Document Term Matrix
             wrapped as a gensim corpus
    """
    # Count 1- to 3-grams of the tokens produced by parse_text; max_df drops
    # terms that occur in more than 80% of the documents.
    cvna = CountVectorizer(tokenizer=parse_text, ngram_range=(
        1, 3), stop_words=stop_fr, strip_accents='ascii', max_df=.8)
    data_cvna = cvna.fit_transform(doc)

    # fit_transform already returns a sparse documents x terms matrix, so it is
    # handed to gensim directly (documents are rows, hence documents_columns=False)
    # instead of being expanded into a dense DataFrame and re-sparsified.
    doc_term_matrix = matutils.Sparse2Corpus(data_cvna, documents_columns=False)

    # vocabulary_ maps term -> column id; gensim's id2word needs the inverse.
    dictionary = {index: term for term, index in cvna.vocabulary_.items()}

    return dictionary, doc_term_matrix

## LSA features

Using dimensionality reduction techniques to get the most important topics of the corpus via LSA / LDA and NMF.

In [8]:
# Now that we have the term-document matrix, we can run LSA.

import re
def create_gensim_lsa_model(docs, number_of_topics, chunk=2000):
    """
    Input  : documents, number of topics and chunk size passed to the model
    Purpose: build the corpus with prepare_corpus and fit a gensim LSA (LSI) model on it
    Output : return the LSA model and the document-term corpus it was fitted on
    """
    id2word, corpus = prepare_corpus(docs)
    lsi_options = dict(num_topics=number_of_topics, id2word=id2word, chunksize=chunk)
    fitted_lsa = LsiModel(corpus, **lsi_options)
    return fitted_lsa, corpus

In [9]:
# Fit LSA on the job titles.
number_of_topics = 5
lsa, doc_term_matrix = create_gensim_lsa_model(
    docs=df['Title'], number_of_topics=number_of_topics)

In [10]:
# Project every document of the corpus onto the LSA topics
# (one topic-score vector per row of the input).
corpus_transformed = lsa[doc_term_matrix]

In [11]:
# Transform the result into a numpy array to get the score for each title.
# num_terms pins the number of rows to the requested number of topics; without
# it the size is inferred from the highest topic id present in the corpus.
all_topics_csr = matutils.corpus2csc(
    corpus_transformed, num_terms=number_of_topics)
# Transpose so that rows are documents and columns are topics.
all_topics_numpy = all_topics_csr.T.toarray()

In [12]:
# Name one column per topic (lsa1, lsa2, ...) from the actual array width so
# the labels always match the number of topics.
lsa_columns = ['lsa{}'.format(i)
               for i in range(1, all_topics_numpy.shape[1] + 1)]
Lsa_Topic = pd.DataFrame(all_topics_numpy, index=df.index, columns=lsa_columns)
Lsa_Topic.head(2)
# our brand new features

Unnamed: 0,lsa1,lsa2,lsa3,lsa4,lsa5
0,0.046282,0.027923,0.0794,0.019654,0.323825
1,0.843184,-0.155558,0.341612,-0.091903,-0.18289


## LDA features

In [13]:
def create_gensim_lda_model(docs, number_of_topics, passe=20, iters=100, chunk=2000):
    """
    Input  : documents, number of topics, and the passes / iterations / chunk size used for training
    Purpose: build the corpus with prepare_corpus and fit a gensim LDA model on it
    Output : return the LDA model and the document-term corpus it was fitted on
    """
    id2word, corpus = prepare_corpus(docs)
    # random_state is fixed so repeated runs give the same topics.
    training_options = dict(num_topics=number_of_topics, iterations=iters,
                            passes=passe, chunksize=chunk, random_state=1)
    fitted_lda = models.LdaModel(corpus=corpus, id2word=id2word, **training_options)
    return fitted_lda, corpus

In [14]:
# Fit LDA on the job titles and show the top terms of each topic.
num_of_topics = 5
lda, doc_term_matrixx = create_gensim_lda_model(
    docs=df['Title'], number_of_topics=num_of_topics)
lda.print_topics()

[(0,
  '0.024*"projet" + 0.019*"chef" + 0.019*"chef projet" + 0.016*"analyst" + 0.013*"stag" + 0.012*"ingenieur" + 0.011*"business" + 0.011*"dat" + 0.010*"developp" + 0.009*"consult"'),
 (1,
  '0.037*"architect" + 0.029*"dat" + 0.012*"devop" + 0.011*"manag" + 0.011*"consult" + 0.008*"solut" + 0.007*"expert" + 0.007*"lead" + 0.006*"freelanc" + 0.006*"scienc"'),
 (2,
  '0.039*"dat" + 0.023*"scientist" + 0.022*"dat scientist" + 0.020*"consult" + 0.017*"developpeur" + 0.017*"engine" + 0.014*"end" + 0.013*"manag" + 0.010*"senior" + 0.008*"learning"'),
 (3,
  '0.059*"dat" + 0.050*"ingenieur" + 0.047*"engine" + 0.034*"devop" + 0.023*"dat engine" + 0.017*"analyst" + 0.014*"consult" + 0.013*"dat analyst" + 0.013*"senior" + 0.012*"softwar"'),
 (4,
  '0.065*"developpeur" + 0.020*"web" + 0.015*"full" + 0.015*"stack" + 0.015*"full stack" + 0.014*"jav" + 0.011*"developpeur web" + 0.011*"engine" + 0.010*"ingenieur" + 0.008*"developpeur jav"')]

In [15]:
# Apply the fitted LDA model to every document of the corpus.
corpus_transformedd = lda[doc_term_matrixx]

In [16]:
# Convert the per-document topic weights into a dense array.
# num_terms fixes the number of rows to the requested number of topics: the
# size is otherwise inferred from the highest topic id present, and gensim's
# LDA output may omit topics whose probability falls below its minimum threshold.
all_topics_cs = matutils.corpus2csc(
    corpus_transformedd, num_terms=num_of_topics)
# Transpose so that rows are documents and columns are topics.
all_topics_np = all_topics_cs.T.toarray()

In [17]:
# Name one column per topic (lda1, lda2, ...) from the actual array width so
# the labels always match the number of topics.
lda_columns = ['lda{}'.format(i)
               for i in range(1, all_topics_np.shape[1] + 1)]
Lda_Topic = pd.DataFrame(all_topics_np, index=df.index, columns=lda_columns)

Lda_Topic.head(2)
# our brand new features

Unnamed: 0,lda1,lda2,lda3,lda4,lda5
0,0.265784,0.197594,0.020155,0.020164,0.496304
1,0.050232,0.051122,0.797878,0.050767,0.050001


In [18]:
# Append the LSA and LDA topic scores as new columns (aligned on the index).
df = pd.concat(objs=(df, Lsa_Topic, Lda_Topic), axis='columns')
df.head(2)

Unnamed: 0,Title,Location,Date,Company,Rating,Count,Contract,Description,min_salary,max_salary,avg_salary,Dept,title_words,summary_words,contractWords,CDI,CDD,Freelance,Temps partiel,Temps plein,Unknown,lat-long,lat,long,lat*long,Regions,BagOfWords,lsa1,lsa2,lsa3,lsa4,lsa5,lda1,lda2,lda3,lda4,lda5
0,Business Developer – Expert sécurité,Arras,2020-05-11,Uptoo,4.5,32,Non renseigné,À PROPOS :Bienvenue dans notre PME spécialiste...,27000.0,55000.0,41000.0,62.0,business developer expert sécurité,"['propos', 'bienvenue', 'pme', 'spécialiste', ...",['nan'],0.0,0.0,0.0,0.0,0.0,1.0,"(50.291048, 2.7772211)",50.291048,2.777221,139.66936,Île-de-France,"business,develop,expert,secur,propos,bienvenu,...",0.046282,0.027923,0.0794,0.019654,0.323825,0.265784,0.197594,0.020155,0.020164,0.496304
1,Manager Big Data H/F,Paris,2020-04-29,Elitegroup Recruitment,3.0,0,"Temps plein, CDI",Présentation de l'entreprise:Notre client est ...,55000.0,75000.0,65000.0,75.0,manager big data,"['présentation', 'entreprise', 'client', 'cabi...","['temps', 'plein', 'cdi']",1.0,0.0,0.0,0.0,1.0,0.0,"(48.8566969, 2.3514616)",48.856697,2.351462,114.884647,Île-de-France,"manag,dat,present,entrepris,client,cabinet,con...",0.843184,-0.155558,0.341612,-0.091903,-0.18289,0.050232,0.051122,0.797878,0.050767,0.050001


## NMF 

For NMF, we first need a design matrix (documents × terms). To improve the results, a TF-IDF transformation is applied to the term counts of the job descriptions.

In [19]:
# TF-IDF design matrix of the descriptions, tokenised with parse_text; terms
# found in more than 95% or fewer than 5% of the documents are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=0.05, tokenizer=parse_text, strip_accents='ascii')

tfidf = tfidf_vectorizer.fit_transform(df['Description'])
# Newer scikit-learn releases expose get_feature_names_out() in place of
# get_feature_names(); use whichever exists and keep the result a plain list.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# NOTE(review): recent scikit-learn versions split `alpha` into
# alpha_W / alpha_H - confirm against the installed version before upgrading.
nmf = NMF(n_components=5, random_state=11,
              alpha=.1, l1_ratio=.5, init='nndsvd')

# Weights for documents relative to topics
W = nmf.fit_transform(tfidf)
# term weights : Weights for terms relative to topics
H = nmf.components_

In [20]:
# One column per NMF component (nmf1, nmf2, ...), named from the matrix width.
nmf_columns = ['nmf{}'.format(i) for i in range(1, W.shape[1] + 1)]
# Reuse df's index (as done for the LSA/LDA frames) so the column-wise concat
# lines rows up even if df does not carry a default 0..n-1 index.
W = pd.DataFrame(W, index=df.index, columns=nmf_columns)
df = pd.concat([df, W], axis=1)

In [21]:
# Split the rows on whether an average salary is known. A single boolean mask
# keeps the original dtypes, whereas masking cells with isin() and then
# dropping all-NaN rows upcasts columns, and does not need a unique index.
has_salary = df['avg_salary'].notna()
df_nan = df[~has_salary]
df_salary = df[has_salary]
df_salary.shape

(777, 42)

In [22]:
# Rows with a known salary.
df_salary.to_csv('ML_RDY_SALARY.csv', index=False)
# Rows without a salary go to their own file; writing them to
# 'ML_RDY_SALARY.csv' as well would overwrite the export above.
# NOTE(review): the file name below is a new choice - adjust if downstream
# code expects a different one.
df_nan.to_csv('ML_RDY_NO_SALARY.csv', index=False)