# Load data

In [1]:
import pandas as pd
import re
import string
# from nltk.corpus import stopwords

import spacy
import en_core_web_sm
from collections import Counter
from spacy.tokens import Doc

In [2]:
# Load the en_core_web_sm pipeline once; preprocess_text calls this shared
# object for every text it processes.
nlp = en_core_web_sm.load()

In [3]:
# Read the raw data file into a list of rows, one row per line.
# NOTE(review): a plain split(',') does not honour quoting, so any field that
# itself contains a comma would be split into extra columns — confirm the
# file has none.
file_path = "../data/indeed_data engineer_2021040519.txt"
job_data = []

with open(file_path, 'r') as handle:
    # Each line is stripped of surrounding whitespace, then cut on commas.
    job_data.extend(raw_line.strip().split(',') for raw_line in handle)

In [5]:
# The first parsed row supplies the column names; every later row is a record.
header_row = job_data[0]
record_rows = job_data[1:]
job_data = pd.DataFrame(record_rows, columns=header_row)

In [62]:
class CustomTokenizer:
    """Tokenizer that splits on single spaces, plus an n-gram helper."""

    def __init__(self, vocab):
        # Vocabulary passed to every Doc this tokenizer builds.
        self.vocab = vocab

    def __call__(self, text):
        """Split *text* on ' ' and wrap the resulting words in a Doc."""
        return Doc(self.vocab, words=text.split(" "))

    @staticmethod
    def generate_ngram(tokens, n):
        """
        build the space-joined n-grams of a token sequence
        :param tokens: a sliceable sequence of str
        :param n: size of each n-gram
        :return: a list of str, one per window of n consecutive tokens
                 (empty when there are fewer than n tokens or n <= 0)
        """
        # Zipping the sequence against its own 1..n-1 shifted copies yields
        # every window of n consecutive tokens.
        shifted_views = (tokens[offset:] for offset in range(n))
        return list(map(" ".join, zip(*shifted_views)))
        


In [74]:
# stop_words = stopwords.words['english']
# Extra lemmas to drop on top of the tokens already flagged by token.is_stop.
user_define_stop_words = ['experience', 'work', 'job']


def is_allowed_token(token):
    """
    decide whether a token should be kept
    :param token: object exposing .text, .is_stop and .lemma_
    :return: False for blank tokens, stop words and user-defined stop lemmas;
             True otherwise
    """
    if token.text.strip() == '':
        return False
    if token.is_stop:
        return False
    return token.lemma_ not in user_define_stop_words

def generate_chunk_token(doc):
    """
    merge every noun chunk of doc into a single token, in place
    :param doc: a spaCy Doc whose noun_chunks can be iterated
    :return: None; doc itself is retokenized
    """
    # Span.merge() was deprecated in spaCy 2.1 and removed in v3. The
    # retokenizer collects the merges and applies them together when the
    # with-block exits, so the chunks can be iterated directly.
    with doc.retokenize() as retokenizer:
        for noun_phrase in doc.noun_chunks:
            root = noun_phrase.root
            # The merged token inherits tag, lemma and entity type from the
            # chunk's root, as the original merge(tag, lemma, ent_type) did.
            retokenizer.merge(
                noun_phrase,
                attrs={
                    "TAG": root.tag,
                    "LEMMA": root.lemma,
                    "ENT_TYPE": root.ent_type,
                },
            )
 
def preprocess_text(text):
    """
    clean and tokenize text
    :param text: str
    :return: a list of tokens from nlp(text) that pass is_allowed_token
    """
    # collapse every run of whitespace into a single ' '
    # (raw string: '\s' in a normal literal is an invalid escape sequence)
    text = re.sub(r'\s+', ' ', text)

    # replace punctuation with ' '
    # re.escape is needed here: interpolated unescaped, the "\]" inside
    # string.punctuation is parsed as an escaped "]", so a literal backslash
    # was never part of the character class and survived the cleaning.
    text = re.sub(f'[{re.escape(string.punctuation)}]+', ' ', text)

    text = text.lower()

    doc = nlp(text)

    # Merging noun chunks with generate_chunk_token(doc) is deliberately left
    # out: some chunks come out messy, such as
    # "data integration metadata management and business intelligence performance".

    return [token for token in doc if is_allowed_token(token)]


In [69]:
# Try preprocess_text on one sample job posting pasted inline; the kept
# tokens are stored in `tokens`.
tokens = preprocess_text("""
Data Engineer,Schedule: Mon- Fri Overview: The data engineer structures data for analytical access performance and integration. The engineer must manage the balance of current and future needs in both design and content. This position designs and constructs enterprise-wide data and information architecture to support business intelligence and data integration management efforts. You will engineer and implement data warehouse and data mart concepts and design and oversee strategies for data integration metadata management and business intelligence performance. You will partner with the business and leadership to prioritize data and information requirements and will help determine strategic direction of data asset management efforts. Responsibilities: Works with analytics information manager to develop a practical roadmap for analytic resource(s) and reporting platform(s). Assist in defining the overall data architecture including ELT processes logical & physical data models and data mart designs.Partner with leadership to understand and prioritize data and information requirements and determine strategic direction for integration and data management of analytical data resources.Collaborate with key business users to identify needs and opportunities for improved delivery.Develops an understanding of the underlying corporate data sources and their relationship to support credit union data driven decision objectives.Maintain knowledge of software tools and programming languages that effectively support the environment.Lead continuous improvement efforts to enhance performance and provide increased functionality. Provide technical guidance to projects and teams ensuring that new initiatives enable effective analytic information management. 
Develop and oversee metadata repository business intelligence (BI) and data warehouse strategies and development.Engineer develop and implement data warehouse and data mart concepts.Provide technical knowledge and business support to BI teams.Lead data management efforts for enterprise projects through all phases of development life cycle.Be able to pilot and build POC’s to demonstrate and test business value Should have a strong understanding of relational and dimensional modeling to facilitate data wrangling code operationalization and exploration of data. Educate and provide data solutions to varied levels and use cases Develop presentations and trainings for business users to increase understanding of data resources and effective use casesContribute to and maintain business glossary and other resources that foster cross-functional collaboration on analytical data resources Cross-train support and work closely with team members in the following roles Data Scientist: data wrangling feature engineering model development management and operationalization. BI Analyst & Developer: visualizations report building analysis of data for consulting purposes and data mart development. 
Database Administration: database security redundancy and backup source system ETL Qualifications: QUALIFICATIONS: Must be able to manage multiple tasks simultaneously and react to problems quickly.Must have prior experience developing analytical solutions in large or midsize companies.Must be able to translate concepts and directions into practical solutions.Ability to handle confidential and sensitive data and meet critical deadlinesExcellent understanding of SQL PL/SQL and ELT implementation.Experience with multiple relational database platforms Oracle and SQL Server desired.Programming experience with 4th generation and 3rd generation computer languages such as Python and R desiredMust be able to develop maintain review and explain data models.Must have excellent verbal and written communication skills.Must be a team playerUnderstanding of the financial services industry desiredExperience with dashboard design and delivery desired EXPERIENCE: Four years of progressive work experience in business intelligence data warehousing analytics or data engineering.Experience working with end users to gather requirements and build technical solutions from concept to implementation. EDUCATION: Bachelor’s degree in information technology computer science or other business related field. Equivalent experience may substitute for education qualifications.

""")


['data engineer',
 'engineer schedule',
 'schedule mon',
 'mon fri',
 'fri overview']

In [9]:
def calculate_token_df(token, docs):
    """
    count the documents that contain a token (its Document Frequency)
    :param token: the token to look for
    :param docs: [doc1, doc2, ..., doct], where doct is a set of tokens
    :return: Document Frequency, i.e. the number of docs containing token
    """
    return sum(1 for token_set in docs if token in token_set)
            

In [75]:
# Run preprocess_text over every job description and store each result
# (a list of kept tokens) in a new "tokens" column.
job_data["tokens"] = job_data["job_description"].apply(preprocess_text)


In [89]:
# Build one set of bigrams per row of job_data, and the union of all of them.
docs = []
all_unique_tokens = set()
for tokens in job_data["tokens"].tolist():
    # Bigrams are formed from the text of the kept tokens, in order.
    words = [token.text for token in tokens]
    token_set = set(CustomTokenizer.generate_ngram(words, n=2))

    docs.append(token_set)
    all_unique_tokens |= token_set

In [90]:
# Map every bigram to the number of documents it appears in.
df_dict = {
    token: calculate_token_df(token, docs)
    for token in all_unique_tokens
}

In [91]:
# Show the 20 bigrams with the highest document frequency; the sort is stable,
# so ties keep df_dict's iteration order.
sorted(df_dict.items(), key=lambda entry: entry[1], reverse=True)[:20]


[('5 years', 9),
 ('remote interview', 6),
 ('interview process', 6),
 ('data pipelines', 6),
 ('fully remote', 6),
 ('big data', 5),
 ('monday friday', 5),
 ('computer science', 5),
 ('data engineer', 5),
 ('bachelor degree', 5),
 ('bachelor s', 5),
 ('data sources', 4),
 ('paid time', 4),
 ('sql server', 4),
 ('data engineering', 4),
 ('knowledge data', 4),
 ('python r', 4),
 ('business intelligence', 4),
 ('data warehousing', 4),
 ('years required', 4)]