In [13]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [2]:
df = pd.read_csv('offers_cleaned.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,name,link,company,function,details,desc,grade,createdAt,updatedAt,lang,contract_type,country,region,departement,ville,JOB_FIELD_1,JOB_FIELD_2
0,0,INGENIEUR ASSURANCE CONCEPTION ELECTRONIQUE (H/F),https://www.airbus.com/careers/search-and-appl...,Airbus,Qualification & Operability,"Toulouse Area, France",APSYS SAS Airbus is a global leader in aeronau...,1,2021-06-18 00:08:19.69+02,2021-06-18 00:08:19.69+02,fr,Unspecified,France [x],Occitanie [x],Haute-Garonne [x],Toulouse [x],Qualification & Operability,[]
1,1,Assistant Ressources Humaines (H/F) - Orléans ...,https://joinus.saint-gobain.com/fr/fra/hr/p/11...,Saint Gobain,,FRA05570 /*/ Assistant Ressources Humaines (H/...,Retour à la liste des offres Référence : FRA05...,1,2021-06-18 00:08:19.63+02,2021-06-18 00:08:19.63+02,fr,CDI,France [x],Centre-Val de Loire [x],Loiret [x],Orléans [x],[],Human Resources
2,2,Ingénieur électronique analogique/puissance (h/f),https://www.airbus.com/careers/search-and-appl...,Airbus,"Elec.Electron.&Electromag,Optics&Optron.","Paris Area, France",Airbus Defence and Space SAS Airbus is a globa...,1,2021-06-18 00:08:19.573+02,2021-06-18 00:08:19.573+02,fr,Unspecified,France,Île-de-France,Paris,Paris,Electronical Engineering,[]
3,3,Ingénieur(e) électronique analogique/puissance...,https://www.airbus.com/careers/search-and-appl...,Airbus,"Elec.Electron.&Electromag,Optics&Optron.","Paris Area, France",Airbus Defence and Space SAS Airbus is a globa...,1,2021-06-18 00:08:19.182+02,2021-06-18 00:08:19.182+02,fr,Unspecified,France,Île-de-France,Paris,Paris,Electronical Engineering,[]
4,4,Enterprise Security Architect (m/f),https://www.airbus.com/careers/search-and-appl...,Airbus,Security,"Toulouse Area, France",AIRBUS SAS Airbus is a global leader in aerona...,1,2021-06-18 00:08:19.044+02,2021-06-18 00:08:19.044+02,en,Unspecified,France [x],Occitanie [x],Haute-Garonne [x],Toulouse [x],Security,[]


In [6]:
df_eng = df.loc[df['lang'] == 'en', :]
df_eng = df_eng.reset_index(drop=True)

In [7]:
df_eng.head()

Unnamed: 0.1,Unnamed: 0,name,link,company,function,details,desc,grade,createdAt,updatedAt,lang,contract_type,country,region,departement,ville,JOB_FIELD_1,JOB_FIELD_2
0,4,Enterprise Security Architect (m/f),https://www.airbus.com/careers/search-and-appl...,Airbus,Security,"Toulouse Area, France",AIRBUS SAS Airbus is a global leader in aerona...,1,2021-06-18 00:08:19.044+02,2021-06-18 00:08:19.044+02,en,Unspecified,France [x],Occitanie [x],Haute-Garonne [x],Toulouse [x],Security,[]
1,6,Knowledge Management Consultant,https://www.airbus.com/careers/search-and-appl...,Airbus,Programme & Project Management,"Toulouse Area, France",AirBusiness Academy SAS Airbus is a global lea...,1,2021-06-18 00:08:18.911+02,2021-06-18 00:08:18.911+02,en,Unspecified,France [x],Occitanie [x],Haute-Garonne [x],Toulouse [x],Programme and Project Management,[]
2,9,HGV Driver,https://joinus.saint-gobain.com/fr/gbr/do/75p/...,Saint Gobain,,GBR03144 /*/ HGV Driver Royaume-Uni South East...,Retour à la liste des offres Référence : GBR03...,1,2021-06-18 00:08:16.984+02,2021-06-18 00:08:16.984+02,en,Unspecified,,,,,[],[]
3,10,Jewson Lerwick HGV Driver,https://joinus.saint-gobain.com/fr/gbr/do/dcp/...,Saint Gobain,,GBR03195 /*/ Jewson Lerwick HGV Driver Royaume...,Retour à la liste des offres Référence : GBR03...,1,2021-06-18 00:08:15.556+02,2021-06-18 00:08:15.556+02,en,Unspecified,,,,,[],[]
4,18,"Senior Software Engineer, Life Sciences",https://joinus.saint-gobain.com/fr/usa/is/p/34...,Saint Gobain,,"586657 /*/ Senior Software Engineer, Life Scie...",Retour à la liste des offres Référence : 58665...,1,2021-06-18 00:08:12.005+02,2021-06-18 00:08:12.005+02,en,CDI,,,,,[],[]


In [8]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [12]:
data = df_eng.desc.values.tolist()
pprint(data[:1])

['AIRBUS SAS Airbus is a global leader in aeronautics, space and related '
 'services. In 2019 it generated revenues of € 70.5 billion and employed a '
 'workforce of around 134,000. Airbus offers the most comprehensive range of '
 'passenger airliners. Airbus is also a European leader providing tanker, '
 'combat, transport and mission aircraft, as well as one of the world’s '
 'leading space companies. In helicopters, Airbus provides the most efficient '
 'civil and military rotorcraft solutions worldwide. Our people work with '
 'passion and determination to make the world a more connected, safer and '
 "smarter place. Taking pride in our work, we draw on each other's expertise "
 'and experience to achieve excellence. Our diversity and teamwork culture '
 'propel us to accomplish the extraordinary - on the ground, in the sky and in '
 'space. Job Description Who we are:Within Airbus Commercial Aircraft, digital '
 'security topics are managed by our Corporate Digital Security offic

In [14]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['airbus', 'sas', 'airbus', 'is', 'global', 'leader', 'in', 'aeronautics', 'space', 'and', 'related', 'services', 'in', 'it', 'generated', 'revenues', 'of', 'billion', 'and', 'employed', 'workforce', 'of', 'around', 'airbus', 'offers', 'the', 'most', 'comprehensive', 'range', 'of', 'passenger', 'airliners', 'airbus', 'is', 'also', 'european', 'leader', 'providing', 'tanker', 'combat', 'transport', 'and', 'mission', 'aircraft', 'as', 'well', 'as', 'one', 'of', 'the', 'world', 'leading', 'space', 'companies', 'in', 'helicopters', 'airbus', 'provides', 'the', 'most', 'efficient', 'civil', 'and', 'military', 'rotorcraft', 'solutions', 'worldwide', 'our', 'people', 'work', 'with', 'passion', 'and', 'determination', 'to', 'make', 'the', 'world', 'more', 'connected', 'safer', 'and', 'smarter', 'place', 'taking', 'pride', 'in', 'our', 'work', 'we', 'draw', 'on', 'each', 'other', 'expertise', 'and', 'experience', 'to', 'achieve', 'excellence', 'our', 'diversity', 'and', 'teamwork', 'culture', 

In [15]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['airbus', 'sas_airbus_is', 'global', 'leader', 'in', 'aeronautics_space', 'and', 'related', 'services', 'in_it_generated', 'revenues', 'of', 'billion', 'and', 'employed_workforce', 'of', 'around', 'airbus', 'offers', 'the', 'most_comprehensive_range', 'of_passenger_airliners', 'airbus', 'is', 'also_european', 'leader', 'providing_tanker_combat_transport', 'and', 'mission_aircraft', 'as', 'well', 'as', 'one', 'of', 'the', 'world', 'leading', 'space_companies', 'in', 'helicopters', 'airbus', 'provides', 'the', 'most_efficient_civil', 'and_military_rotorcraft', 'solutions', 'worldwide', 'our', 'people', 'work', 'with', 'passion', 'and', 'determination', 'to', 'make', 'the', 'world', 'more_connected_safer', 'and', 'smarter_place_taking_pride', 'in', 'our', 'work', 'we', 'draw', 'on', 'each', 'other', 'expertise', 'and', 'experience', 'to', 'achieve_excellence', 'our', 'diversity', 'and', 'teamwork', 'culture_propel', 'us', 'to', 'accomplish', 'the', 'extraordinary', 'on', 'the', 'ground',

In [16]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [17]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['airbus', 'sas_airbus', 'global', 'leader', 'relate', 'service', 'generated_revenue', 'employed_workforce', 'airbus', 'offer', 'airbus', 'also_european', 'leader', 'well', 'world', 'lead', 'space', 'company', 'helicopter', 'airbus', 'provide', 'solution', 'worldwide', 'people', 'work', 'passion', 'determination', 'make', 'world', 'taking_pride', 'work', 'draw', 'expertise', 'experience', 'achieve_excellence', 'diversity', 'teamwork', 'accomplish', 'extraordinary', 'ground', 'sky', 'space', 'job', 'description', 'airbus', 'commercial', 'aircraft', 'digital', 'security', 'topic', 'manage', 'corporate', 'digital', 'security', 'office', 'transnational', 'organisation', 'base', 'france', 'germany', 'spain', 'direct', 'link', 'global', 'infrastructure', 'site', 'india', 'china', 'asia', 'corporate', 'digital', 'security', 'office', 'structure', 'department', 'enterprise', 'security', 'architecture', 'detection_response', 'evaluation', 'test', 'risk', 'vulnerability', 'critical', 'asset', '

In [18]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 13), (15, 1), (16, 1), (17, 1), (18, 2), (19, 1), (20, 2), (21, 1), (22, 1), (23, 3), (24, 8), (25, 2), (26, 1), (27, 1), (28, 2), (29, 2), (30, 1), (31, 1), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 6), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 2), (65, 1), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 2), (73, 1), (74, 5), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 6), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 2), (93, 1), (94, 1), (95, 1), (96, 3), (97, 6), (98, 5), (99, 1), (100, 1), (101, 1), (102, 1), (103, 3), (104, 2), (105, 2), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1

In [19]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ability', 2),
  ('able', 1),
  ('access', 1),
  ('accomplish', 1),
  ('achieve_excellence', 1),
  ('act', 1),
  ('actionable', 1),
  ('activity', 2),
  ('actor', 1),
  ('adapt', 1),
  ('advice', 1),
  ('aerospace', 1),
  ('age', 1),
  ('agile', 1),
  ('airbus', 13),
  ('aircraft', 1),
  ('also', 1),
  ('also_european', 1),
  ('application', 2),
  ('applications_irrespective', 1),
  ('apply', 2),
  ('approach', 1),
  ('appropriate', 1),
  ('architect', 3),
  ('architecture', 8),
  ('art', 2),
  ('asia', 1),
  ('assessment', 1),
  ('asset', 2),
  ('associate', 2),
  ('automotive', 1),
  ('awareness', 1),
  ('balance', 1),
  ('balanced', 1),
  ('base', 2),
  ('beneficial', 1),
  ('benefit', 1),
  ('best_practice', 1),
  ('business', 6),
  ('candidate', 1),
  ('capabilities_thank', 1),
  ('career', 1),
  ('certify', 1),
  ('china', 1),
  ('classroom_session', 1),
  ('collaborate', 1),
  ('collaboration', 1),
  ('commercial', 1),
  ('commit', 1),
  ('commitment', 1),
  ('common', 1),
  

In [20]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [21]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.020*"time" + 0.017*"job" + 0.015*"world" + 0.014*"information" + '
  '0.013*"full" + 0.013*"program" + 0.012*"system" + 0.011*"experience" + '
  '0.011*"year" + 0.010*"work"'),
 (1,
  '0.040*"medical" + 0.029*"scientific" + 0.029*"study" + 0.023*"status" + '
  '0.021*"clinical" + 0.015*"external" + 0.014*"research" + 0.014*"plan" + '
  '0.014*"datum" + 0.014*"include"'),
 (2,
  '0.053*"sanofi" + 0.037*"diversity" + 0.037*"inclusion" + 0.024*"experience" '
  '+ 0.023*"people" + 0.022*"equal_opportunity" + 0.022*"provide" + '
  '0.022*"live" + 0.021*"lead" + 0.020*"empower"'),
 (3,
  '0.043*"software" + 0.028*"solution" + 0.028*"development" + '
  '0.025*"security" + 0.020*"experience" + 0.019*"technology" + 0.016*"skill" '
  '+ 0.016*"integration" + 0.015*"sw" + 0.014*"design"'),
 (4,
  '0.032*"insurance" + 0.031*"export_control" + 0.029*"mining" + 0.026*"mine" '
  '+ 0.012*"scrap" + 0.011*"initiative_credibility" + 0.010*"ot" + '
  '0.010*"independence_integrity" + 0.010*"dis

In [22]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.696216164721037

Coherence Score:  0.48657701567152767


In [23]:
# Visualize the topics
#pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis