In [14]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords

## Get keywords from job postings

In [15]:
# Load the job postings table; the first CSV column becomes the row index.
data = pd.read_csv("job_description.csv", index_col=0)

In [23]:
# Take the first row of the table by position (not by index label).
# The tokenization step reads this row's 'Description' field.
dt = data.iloc[0]

### 1. Tokenization: Breaking the string into individual words or tokens.

In [24]:
# Split the description on whitespace, lowercase each token, and delete the
# punctuation characters ( ) , . : ? wherever they appear inside a token.
# A single translation table replaces one replace() pass per character.
punctuation_table = str.maketrans("", "", "(),.:?")
words = [
    token.lower().translate(punctuation_table)
    for token in dt['Description'].split()
]


### 2. Stopword Removal: Filtering out common words (stopwords) like "the," "and," "is," etc., which often don't carry significant meaning.

### 3. Normalization: Converting all words to lowercase to ensure consistency.

In [25]:
# Download the NLTK stopwords corpus if it is not already available locally
nltk.download('stopwords')

# Start from NLTK's English stop-word list
stop_words = set(stopwords.words('english'))

# Add extra tokens that carry no meaning as keywords for this task
# (bullet glyphs, dashes, ampersands and a few leftover function words)
stop_words.update(['and', 'but', 'or', 'so', 'for', '•', 'may', '-', 'are', 'them', '&', 'you', "we're"])

# Keep each remaining word once, in alphabetical order.
# `if word` also drops tokens that became empty after punctuation stripping
# (e.g. a stand-alone "." or ":"), which are never meaningful keywords.
filtered_words = sorted({word for word in words if word and word not in stop_words})

# `filtered_words` is already unique and sorted, so no second dedupe/sort is
# needed. `sets` is kept as an independent copy so that later in-place edits
# to `filtered_words` do not change this snapshot.
sets = list(filtered_words)

# Print the filtered list of words
print(sets)

['101', '2022', '30', 'academy', 'acceptance', 'accessing', 'achieve', 'acquired', 'acquiring', 'across', 'actively', 'addition', 'adds', 'afraid', 'agencies', 'ago', 'alongside', 'alten', 'analyse', 'analysis', 'analyst', 'analysts', 'analytical', 'apart', 'applicants', 'application', 'apply', 'applying', 'approach', 'approaches', 'appropriate', 'artefacts', 'assess', 'assignments', 'autonomy', 'away', 'ba', 'base', 'based', 'baseline', 'bcs', 'becoming', 'behaviours', 'believe', 'benefits', 'best', 'boundaries', 'buddying', 'building', 'business', 'capability', 'career', 'case', 'central', 'certification', 'certifications', 'change', 'clear', 'client', 'clients', 'collaboration', 'collaborative', 'collaboratively', 'colleague', 'colleagues', 'commitment', 'communication', 'communities', 'company', 'competencies', 'comprehension', 'concise', 'confirmed', 'considered', 'consultancies', 'consultancy', 'consulting', 'continued', 'contribute', 'contributing', 'contribution', 'core', 'crea

[nltk_data] Downloading package stopwords to /Users/raul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 4. Stemming or Lemmatization: Reducing words to their root form to capture variations of the same word (e.g., "running" to "run").

In [26]:
# Hand-curated normalisation for this job posting.
#
# Inflected or derived variants listed in VARIANTS_TO_DROP are removed from
# `filtered_words`; for three word families a canonical root from ROOTS_TO_ADD
# is appended instead. The lists are kept as data rather than one
# `list.remove()` call per word because `list.remove()` raises ValueError when
# the word is missing, which made this cell fail when re-run or when applied
# to a posting that lacks one of the words.
#
# NOTE(review): several families (e.g. create/creating, practice/practices,
# requirement/requirements, success/successfully) are dropped entirely with no
# root added back - confirm this is intended.
VARIANTS_TO_DROP = {
    'acquired', 'acquiring',
    'analyse', 'analysis', 'analytical',
    'analysts',
    'applicants', 'application', 'applying',
    'approaches',
    'certifications',
    'clients',
    'collaborative', 'collaboratively',
    'colleagues',
    'consultancies', 'consultancy', 'consulting',
    'contributing', 'contribution',
    'creating', 'create',
    'deliver', 'deliveries',
    'developing', 'development',
    'difference',
    'eliciting',
    'engagement',
    'exploring',
    'focused', 'focussed',
    'improved',
    'innovating',
    'learnt',
    'management', 'managing',
    'needed', 'needs',
    'offerings',
    'practice', 'practices',
    'project-related', 'projects',
    'requirement', 'requirements',
    'responsibilities',
    'sectors',
    'sets',
    'situations',
    'skills',
    'solving',
    'stakeholders',
    'success', 'successfully',
    'supported', 'supporting', 'supportive',
    'teams',
    'transformation',
    'understanding',
    'undertaking',
    'working',
    'written',
}

# Canonical roots that replace the acquire*/analy*/consult* families above.
ROOTS_TO_ADD = ('acquire', 'analyze', 'consult')

# Drop the variants while preserving the order of everything else.
kept_words = [word for word in filtered_words if word not in VARIANTS_TO_DROP]

# Append each root at the end, skipping any that is already present so that
# re-running the cell does not accumulate duplicates.
filtered_words = kept_words + [root for root in ROOTS_TO_ADD if root not in kept_words]

In [30]:
# Show the distinct remaining words in alphabetical order
unique_words = set(filtered_words)
print(sorted(unique_words))

['101', '2022', '30', 'academy', 'acceptance', 'accessing', 'achieve', 'acquire', 'across', 'actively', 'addition', 'adds', 'afraid', 'agencies', 'ago', 'alongside', 'alten', 'analyst', 'analyze', 'apart', 'apply', 'approach', 'appropriate', 'artefacts', 'assess', 'assignments', 'autonomy', 'away', 'ba', 'base', 'based', 'baseline', 'bcs', 'becoming', 'behaviours', 'believe', 'benefits', 'best', 'boundaries', 'buddying', 'building', 'business', 'capability', 'career', 'case', 'central', 'certification', 'change', 'clear', 'client', 'collaboration', 'colleague', 'commitment', 'communication', 'communities', 'company', 'competencies', 'comprehension', 'concise', 'confirmed', 'considered', 'consult', 'continued', 'contribute', 'core', 'creative', 'criteria', 'cycle', 'data', 'date', 'delivery', 'departments', 'description', 'designed', 'detailed', 'determined', 'develop', 'different', 'direction', 'drawn', 'early', 'efficient', 'elicit', 'end-to-end', 'engage', 'ensuring', 'entry', 'envir

### 5. Term Frequency-Inverse Document Frequency (TF-IDF): Calculating the importance of a word in a document relative to a corpus of documents.
-- **TODO:** remember to add the additional words from step 4.

### 6. Named Entity Recognition (NER): Identifying proper nouns, which can be important keywords.

### 7. Part-of-Speech (POS) Tagging: Identifying the grammatical parts of words to filter out irrelevant ones.

### 8. N-grams: Extracting sequences of words, which can capture phrases and context better than individual words alone.

### 9. Statistical Methods: Analyzing word frequencies, co-occurrences, or patterns to identify significant terms.

### 10. Machine Learning Algorithms: Training models to classify and extract important terms based on labeled data.