In [1]:
import pandas as pd
from flashtext import KeywordProcessor
import spacy
from spacy.matcher import PhraseMatcher
import sys
import re
#progress bar packages
from tqdm import tqdm
#ngram package
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
import nltk

import csv

pd.set_option('display.max_columns', None)

#for downloading spacy stuff
#!{sys.executable} -m spacy download en

#en is the large model
nlp = spacy.load("en_core_web_sm")

In [2]:
df_sentences = pd.read_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/sentences/sentences_emscad.csv")
df_skills = pd.read_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Skills/esco.csv")

In [3]:
#Creating all the possible grams for each sentence
allgrams = []

#This tokenizer immediately removes punctuation and special characters from the sentence
#tokenizer = RegexpTokenizer()

#Run this code three times, once for the regular sentence, then the lemmatized sentence, finally the sentence without
#stopwords

#,sentence,sentence_lemmatized,sentence_no_stopwords

for sentence in tqdm(df_sentences['sentence']):
    tokenizedsentence = word_tokenize(str(sentence))
    
    #Getting all possible n grams of the sentence
    #for n in range(1,len(tokenizedsentence)+1):
    
    #getting up to tri grams for each sentence
    for n in range(1,4):
        grams = ngrams(tokenizedsentence,n)
        for gram in grams:
            allgrams.append(str(gram))

allgrams = pd.DataFrame(allgrams)
allgrams = allgrams.rename(columns={0:'allgrams'})
print(allgrams.shape[0])

#Initializing the keyword processor
keyword_processor = KeywordProcessor(case_sensitive=False)

#label,label_lemmatized,no_stopwords
#Adding all the skills to the processor
for skill in df_skills['label']:
    try:
        keyword_processor.add_keyword(skill)
    except:
        continue

def searcher(row):
    #check if the words are in the row and return a True or False instead of the actual word
    boolean = bool(keyword_processor.extract_keywords(row))
    return boolean

tqdm.pandas()
allgrams['contains_skill'] = allgrams['allgrams'].progress_apply(searcher)

#Only selecting the ngrams which contain a skill
allgrams = allgrams[allgrams.contains_skill == True]
#cleaning up
allgrams['allgrams'] = allgrams['allgrams'].astype(str)
allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')
allgrams['allgrams'] = allgrams['allgrams'].str.strip()

allgrams = allgrams.drop(columns=['contains_skill'])

#Some softskills match multiple times, therefore removing the duplicates
allgrams = allgrams.drop_duplicates().reset_index(drop=True)

#removing any nans
allgrams.dropna(subset=['allgrams'],inplace=True)

allgrams.to_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/n-grams/emscad_trigrams_ESCO.csv", quoting=csv.QUOTE_NONNUMERIC, index=False)


100%|██████████| 302284/302284 [01:00<00:00, 5021.55it/s]


12753751


100%|██████████| 12753751/12753751 [01:37<00:00, 131167.36it/s]
  allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')


In [4]:
#Creating all the possible grams for each sentence
allgrams = []

#This tokenizer immediately removes punctuation and special characters from the sentence
#okenizer = RegexpTokenizer()

#Run this code three times, once for the regular sentence, then the lemmatized sentence, finally the sentence without
#stopwords

#,sentence,sentence_lemmatized,sentence_no_stopwords

for sentence in tqdm(df_sentences['sentence_lemmatized']):
    tokenizedsentence = word_tokenize(str(sentence))
    
    #Getting all possible n grams of the sentence
    #for n in range(1,len(tokenizedsentence)+1):
    
    #getting up to tri grams for each sentence
    for n in range(1,4):
        grams = ngrams(tokenizedsentence,n)
        for gram in grams:
            allgrams.append(str(gram))

allgrams = pd.DataFrame(allgrams)
allgrams = allgrams.rename(columns={0:'allgrams'})
print(allgrams.shape[0])

#Initializing the keyword processor
keyword_processor = KeywordProcessor(case_sensitive=False)

#label,label_lemmatized,no_stopwords
#Adding all the skills to the processor
for skill in df_skills['label_lemmatized']:
    try:
        keyword_processor.add_keyword(skill)
    except:
        continue

def searcher(row):
    #check if the words are in the row and return a True or False instead of the actual word
    boolean = bool(keyword_processor.extract_keywords(row))
    return boolean

tqdm.pandas()
allgrams['contains_skill'] = allgrams['allgrams'].progress_apply(searcher)

#Only selecting the ngrams which contain a skill
allgrams = allgrams[allgrams.contains_skill == True]
#cleaning up
allgrams['allgrams'] = allgrams['allgrams'].astype(str)
allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')
allgrams['allgrams'] = allgrams['allgrams'].str.strip()

allgrams = allgrams.drop(columns=['contains_skill'])

#Some softskills match multiple times, therefore removing the duplicates
allgrams = allgrams.drop_duplicates().reset_index(drop=True)

#removing any nans
allgrams.dropna(subset=['allgrams'],inplace=True)

allgrams.to_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/n-grams/emscad_trigrams_lemmatized_ESCO.csv", quoting=csv.QUOTE_NONNUMERIC, index=False)

#-----------------------------------------------

100%|██████████| 302284/302284 [00:55<00:00, 5468.13it/s]


12757438


100%|██████████| 12757438/12757438 [01:36<00:00, 132421.06it/s]
  allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')


In [5]:

#Creating all the possible grams for each sentence
allgrams = []

#This tokenizer immediately removes punctuation and special characters from the sentence
#tokenizer = RegexpTokenizer()

#Run this code three times, once for the regular sentence, then the lemmatized sentence, finally the sentence without
#stopwords

#,sentence,sentence_lemmatized,sentence_no_stopwords

for sentence in tqdm(df_sentences['sentence_no_stopwords']):
    tokenizedsentence = word_tokenize(str(sentence))
    #tokenizedsentence = nltk.word_tokenize(str(sentence))
    
    #Getting all possible n grams of the sentence
    #for n in range(1,len(tokenizedsentence)+1):
    
    #getting up to tri grams for each sentence
    for n in range(1,4):
        grams = ngrams(tokenizedsentence,n)
        for gram in grams:
            allgrams.append(str(gram))

allgrams = pd.DataFrame(allgrams)
allgrams = allgrams.rename(columns={0:'allgrams'})
print(allgrams.shape[0])

#Initializing the keyword processor
keyword_processor = KeywordProcessor(case_sensitive=False)

#label,label_lemmatized,no_stopwords
#Adding all the skills to the processor
for skill in df_skills['no_stopwords']:
    try:
        keyword_processor.add_keyword(skill)
    except:
        continue

def searcher(row):
    #check if the words are in the row and return a True or False instead of the actual word
    boolean = bool(keyword_processor.extract_keywords(row))
    return boolean

tqdm.pandas()
allgrams['contains_skill'] = allgrams['allgrams'].progress_apply(searcher)

#Only selecting the ngrams which contain a skill
allgrams = allgrams[allgrams.contains_skill == True]
#cleaning up
allgrams['allgrams'] = allgrams['allgrams'].astype(str)
allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')
allgrams['allgrams'] = allgrams['allgrams'].str.strip()

allgrams = allgrams.drop(columns=['contains_skill'])

#Some softskills match multiple times, therefore removing the duplicates
allgrams = allgrams.drop_duplicates().reset_index(drop=True)

#removing any nans
allgrams.dropna(subset=['allgrams'],inplace=True)

allgrams.to_csv("/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/n-grams/emscad_trigrams_nostopwords_ESCO.csv", quoting=csv.QUOTE_NONNUMERIC, index=False)

100%|██████████| 302284/302284 [00:45<00:00, 6604.10it/s]


7607929


100%|██████████| 7607929/7607929 [01:01<00:00, 124701.36it/s]
  allgrams['allgrams'] = allgrams['allgrams'].str.replace(r"[(),.']", '')
