### ---------- LinkedIn Analyzer Notebook ----------

This notebook will help to analyze the job offers scraped.

In [1]:
import nltk
import pandas as pd
import re
import langcodes
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from datetime import date
from dateutil.relativedelta import relativedelta
from langdetect import detect, DetectorFactory


from tqdm import tqdm
import os

In [3]:
class JobOffer():
    def __init__(self, job_description, language, offer_ID):
        self.description = job_description
        self.offer_ID = offer_ID
        self.language = language
        self._analyze()
        self._finalize_report()

    def _analyze(self):
        self.df_analysis = pd.DataFrame(columns = ['Word', 'Count', 'Category'])

        # Lowercase the characters
        text = re.sub(r"[^a-zA-Z0-9]", " ", self.description.lower())
        words = text.split()

        # Because stopwords is dependant of the language, we should use the language as a parameter
        words = [w for w in words if w not in stopwords.words(self.language)]
        freq_words = nltk.FreqDist(words)
        for key in freq_words.keys():
            self.df_analysis = self.df_analysis.append({'Word' : key, 'Count': freq_words[key], 'Category': 'word'}, ignore_index = True)

        # Reduce words to their stem
        stemmed = [SnowballStemmer(language = self.language).stem(w) for w in words]
        freq_stemmed = nltk.FreqDist(stemmed)
        for key in freq_stemmed.keys():
            self.df_analysis = self.df_analysis.append({'Word' : key, 'Count': freq_stemmed[key], 'Category': 'stem'}, ignore_index = True)
        
        # Reduce words to their lem
        lemmed  = [WordNetLemmatizer().lemmatize(w) for w in words]
        freq_lemmed = nltk.FreqDist(lemmed)
        for key in freq_lemmed.keys():
            self.df_analysis = self.df_analysis.append({'Word' : key, 'Count': freq_lemmed[key], 'Category': 'lem'}, ignore_index = True)
        
        finder = nltk.collocations.BigramCollocationFinder.from_words(lemmed)
        for key in finder.ngram_fd.keys():
            new_key = key[0] + ' ' + key[1]
            self.df_analysis = self.df_analysis.append({'Word' : new_key, 'Count': finder.ngram_fd[key], 'Category': 'bigram'}, ignore_index = True)
        # dfBigrams['bigram'] = finder.ngram_fd.keys()
        # dfBigrams['count']  = finder.ngram_fd.values()

    def _finalize_report(self):
        self.df_analysis['offerID']  = self.offer_ID

In [4]:
class PoolOffers():
    def __init__(self, filename):
        self.df      = pd.read_excel(filename)
        self._generate_pool()

    def _generate_pool(self):
        self.table_analysis = []

        self.df['Language'] = self.df.apply(lambda row: self._find_offer_language(row['Job_Description']), axis = 1)
        self.df['Estimated_post_date'] = self.df.apply(lambda row: self._calculate_date(row['Posted_Date']), axis = 1)
        self.df.drop(columns = 'Posted_Date', inplace = True)

        for index, row in tqdm(self.df.iterrows()):
            self.table_analysis.append(JobOffer(row['Job_Description'], row['Language'], index))

    def _calculate_date(self, text_date):
        if 'month' in text_date:
            final_date = date.today() - relativedelta(months  = int(text_date.split(' ')[0]))
        elif 'week' in text_date:
            final_date = date.today() - relativedelta(days    = int(text_date.split(' ')[0])*7)
        elif 'day' in text_date:
            final_date = date.today() - relativedelta(days    = int(text_date.split(' ')[0]))
        elif 'hour' in text_date:
            final_date = date.today() - relativedelta(hours   = int(text_date.split(' ')[0]))
        elif ('min' and 'ago') in text_date:
            final_date = date.today() - relativedelta(minutes = int(text_date.split(' ')[0]))
        else:
            final_date = 'Error'
        
        return final_date

    def _find_offer_language(self, description):
        # Identify the language
        
        DetectorFactory.seed = 0
        self.language = langcodes.Language.make(language = detect(description)).display_name().lower()
        if self.language == "afrikaans" or self.language == "german":
            self.language = "dutch"
        elif self.language == "catalan":
            self.language = "french"

        return self.language

    def analyze(self):
        self.report = pd.DataFrame()

        for offer in self.table_analysis:
            self.report = pd.concat([self.report, offer.df_analysis], axis = 0)

        return self.df, self.report

In [7]:
file = os.path.join('..', '2. Exports', '1. Export scraper', 'teeest.xlsx')
myPool = PoolOffers(file)

TypeError: expected string or bytes-like object

In [None]:
table_offers, table_analysis = myPool.analyze()
print(table_offers.head(2))
print(table_analysis.head(2))

In [None]:
table_analysis.to_excel(os.path.join('..', '2. Exports', 'table_analysis.xlsx'))
table_offers.to_excel(os.path.join('..', '2. Exports', 'table_offers.xlsx'))

In [None]:
df_job_offers = pd.read_excel("All search data consultant.xlsx")

In [None]:
df_job_offers.head()

In [None]:
df_corpus = df_job_offers["Job_Description"]

In [None]:
list_corpus = df_corpus.to_list()

In [None]:
en_offers = []
nl_offers = []

for text in list_corpus:
    if "Je bent" in text or "Belgische" in text or "Kennis" in text or "jij" in text:
        nl_offers.append(text)
    else:
        en_offers.append(text)

In [None]:
print("Number of offers in Dutch:", len(nl_offers))
print("Number of offers in English:", len(en_offers))

In [None]:
for off in nl_offers[:10]:
    print(off[:50])

In [None]:
# Creating a dictionary allows me to put the language in front of the list for the following steps that depends of the languages
total_offers = {'english': en_offers, 'dutch': nl_offers}

In [None]:
words = {}
stemmed = {}
lemmed = {}

for offer in total_offers:

    texts = ""

    for i, _ in enumerate(total_offers[offer]):
        text = total_offers[offer][i]
        text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
        texts = texts + " " + text
        # print(words[:4])

        # Because stopwords is dependant of the language, I use the key of the dictionary
    
    words[offer] = texts.split()
    words[offer] = [w for w in words[offer] if w not in stopwords.words(offer)]

    # Reduce words to their stems
    stemmed[offer] = [SnowballStemmer(language=offer).stem(w) for w in words[offer]]
    
    # Reduce words to their root form
    lemmed[offer]  = [WordNetLemmatizer().lemmatize(w) for w in words[offer]]

    print(words[offer][:5])

In [None]:
print(words['dutch'][:15])
print(words['english'][:15])

## III. Analysis

Analyzing the frequencies of words

In [None]:
for language in words:
    columns = ['lemmed_words', 'lemmed_count', 'stemmed_words', 'stemmed_count', 'words', 'words_count']
    df = pd.DataFrame(columns = columns)

    freq = nltk.FreqDist(words[language])
    df['words'] = freq.keys()
    df['words_count'] = freq.values()
    df.fillna(0, inplace=True)

    freq = nltk.FreqDist(lemmed[language])
    df['lemmed_words'] = pd.Series(freq.keys())
    df['lemmed_count'] = pd.Series(freq.values())

    freq = nltk.FreqDist(stemmed[language])
    df['stemmed_words'] = pd.Series(freq.keys())
    df['stemmed_count'] = pd.Series(freq.values())


    columns = ['bigram', 'count']
    dfBigrams = pd.DataFrame(columns = columns)
    
    finder = nltk.collocations.BigramCollocationFinder.from_words(words[language])
    dfBigrams['bigram'] = finder.ngram_fd.keys()
    dfBigrams['count']  = finder.ngram_fd.values()

    df.to_excel("df_" + language + ".xlsx")
    dfBigrams.to_excel("dfBigrams_" + language + ".xlsx")

In [None]:
finder = nltk.collocations.BigramCollocationFinder.from_words(words['english'])
finder.ngram_fd.most_common(10)