In [1]:
import re
import string
from collections import Counter
import squarify
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
from spacy.tokenizer import Tokenizer

from bs4 import BeautifulSoup
import html as ihtml

import requests
import sqlite3

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Hackathon/master/data/techsearch.csv')
df = df.drop(df.columns[0], axis=1)

In [3]:
def clean_text(text):
    text = text.replace('\n', ' ')                # remove newline
    text = BeautifulSoup(text, "lxml").get_text() # remove html
    text = text.replace('/', ' ')                 # remove forward slashes
    text = re.sub(r'[^a-zA-Z ^0-9]', '', text)    # letters and numbers only
    text = text.lower()                           # lower case
    text = re.sub(r'(x.[0-9])', '', text)         # remove special characters
    return text

df['description'] = df.apply(lambda x: clean_text(x['description']), axis=1)

df.head()

Unnamed: 0,job_title,company,location,description,counts,city,job,low_salary,high_salary
0,Data Scientist,Pactera,,looking fora data scientis architect who has 8...,1319,San Jose,data scientist,,
1,Data scientist - Global Sales,PayPal,,responsibilities provide business requirement...,1319,San Jose,data scientist,,
2,Data Scientist,Palo Verde Consulting,"Campbell, CA 95008 (Central Campbell area)",job title data scientistlocation campbell ca 9...,1319,San Jose,data scientist,150000.0,210000.0
3,Data Scientist,Spry Health,"Palo Alto, CA",spry healths mission is to build the worlds la...,1319,San Jose,data scientist,100000.0,135000.0
4,Data Scientist (All Levels) - Santa Clara,LeanTaaS,"Santa Clara, CA 95050",help build technology that saves lives were a...,1319,San Jose,data scientist,,


In [4]:
df.shape

(4856, 9)

In [5]:
df['description'][0]

'looking fora data scientis architect who has 8 yrs of exp in data design data modelling data flow analytics in supply chain domain  heres the detail jd  expert programming skills in python r experience in writing code for various machine learning algorithms for classification clustering forecasting regression neural networks and deep learning handson experience with modern enterprise data architectures and data toolsets ex data warehouse data marts data lake 3nf and dimensional models modeling tools profiling tools strong knowledge of supply chain domain preferably in the hitech industry strong problem solving and abstract thinking skills knowledge of data architecture and design patterns and the ability to apply them ability to conceptualize and articulate ideas clearly and concisely excellent communication presentation and interpersonal skills'

In [6]:
nlp = spacy.load("en_core_web_lg")

tokenizer = Tokenizer(nlp.vocab)

In [7]:
STOP_WORDS = nlp.Defaults.stop_words.union(['year'])

In [8]:
# Tokenizer pipe removing stop words and blank words and lemmatizing
tokens = []

for doc in tokenizer.pipe(df['description'], batch_size=500):
    
    doc_tokens = []
    for token in doc:
        if (token.lemma_ not in STOP_WORDS) & (token.text != ' '):
            doc_tokens.append(token.lemma_)

    tokens.append(doc_tokens)

df['tokens'] = tokens
df['tokens'].head()

0    [look, forum, datum, scientis, architect, 8, y...
1    [responsibility, provide, business, requiremen...
2    [job, title, datum, scientistlocation, campbel...
3    [spry, healths, mission, build, world, large, ...
4    [help, build, technology, save, live, fast, gr...
Name: tokens, dtype: object

In [9]:
df['tokens'][0]

['look',
 'forum',
 'datum',
 'scientis',
 'architect',
 '8',
 'yr',
 'exp',
 'datum',
 'design',
 'datum',
 'model',
 'datum',
 'flow',
 'analytics',
 'supply',
 'chain',
 'domain',
 'heres',
 'detail',
 'jd',
 'expert',
 'programme',
 'skill',
 'python',
 'r',
 'experience',
 'write',
 'code',
 'machine',
 'learn',
 'algorithm',
 'classification',
 'cluster',
 'forecast',
 'regression',
 'neural',
 'network',
 'deep',
 'learn',
 'handson',
 'experience',
 'modern',
 'enterprise',
 'datum',
 'architecture',
 'datum',
 'toolset',
 'ex',
 'datum',
 'warehouse',
 'datum',
 'mart',
 'datum',
 'lake',
 '3nf',
 'dimensional',
 'model',
 'model',
 'tool',
 'profile',
 'tool',
 'strong',
 'knowledge',
 'supply',
 'chain',
 'domain',
 'preferably',
 'hitech',
 'industry',
 'strong',
 'problem',
 'solve',
 'abstract',
 'think',
 'skill',
 'knowledge',
 'datum',
 'architecture',
 'design',
 'pattern',
 'ability',
 'apply',
 'ability',
 'conceptualize',
 'articulate',
 'idea',
 'clearly',
 'conci

In [10]:
df.head()

Unnamed: 0,job_title,company,location,description,counts,city,job,low_salary,high_salary,tokens
0,Data Scientist,Pactera,,looking fora data scientis architect who has 8...,1319,San Jose,data scientist,,,"[look, forum, datum, scientis, architect, 8, y..."
1,Data scientist - Global Sales,PayPal,,responsibilities provide business requirement...,1319,San Jose,data scientist,,,"[responsibility, provide, business, requiremen..."
2,Data Scientist,Palo Verde Consulting,"Campbell, CA 95008 (Central Campbell area)",job title data scientistlocation campbell ca 9...,1319,San Jose,data scientist,150000.0,210000.0,"[job, title, datum, scientistlocation, campbel..."
3,Data Scientist,Spry Health,"Palo Alto, CA",spry healths mission is to build the worlds la...,1319,San Jose,data scientist,100000.0,135000.0,"[spry, healths, mission, build, world, large, ..."
4,Data Scientist (All Levels) - Santa Clara,LeanTaaS,"Santa Clara, CA 95050",help build technology that saves lives were a...,1319,San Jose,data scientist,,,"[help, build, technology, save, live, fast, gr..."


In [11]:
tech_terms = ['python', 'r', 'tableau']
set1 = set(df['tokens'][0])
set2 = set(tech_terms)

set3 = set1 & set2
print(set3)

{'r', 'python'}


In [12]:
df['tokens_filtered'] = df.apply(lambda x: list(set(x['tokens']) & set2), axis=1)

In [13]:
df.head()

Unnamed: 0,job_title,company,location,description,counts,city,job,low_salary,high_salary,tokens,tokens_filtered
0,Data Scientist,Pactera,,looking fora data scientis architect who has 8...,1319,San Jose,data scientist,,,"[look, forum, datum, scientis, architect, 8, y...","[r, python]"
1,Data scientist - Global Sales,PayPal,,responsibilities provide business requirement...,1319,San Jose,data scientist,,,"[responsibility, provide, business, requiremen...","[tableau, python]"
2,Data Scientist,Palo Verde Consulting,"Campbell, CA 95008 (Central Campbell area)",job title data scientistlocation campbell ca 9...,1319,San Jose,data scientist,150000.0,210000.0,"[job, title, datum, scientistlocation, campbel...",[python]
3,Data Scientist,Spry Health,"Palo Alto, CA",spry healths mission is to build the worlds la...,1319,San Jose,data scientist,100000.0,135000.0,"[spry, healths, mission, build, world, large, ...",[python]
4,Data Scientist (All Levels) - Santa Clara,LeanTaaS,"Santa Clara, CA 95050",help build technology that saves lives were a...,1319,San Jose,data scientist,,,"[help, build, technology, save, live, fast, gr...",[python]


In [14]:
df.tail()

Unnamed: 0,job_title,company,location,description,counts,city,job,low_salary,high_salary,tokens,tokens_filtered
4851,Senior Full Stack Developer,Revel Systems,"Atlanta, GA",company description revel was founded with th...,135,Atlanta,ios developer,,,"[company, description, revel, found, mission, ...",[python]
4852,Product Manager / Business Analyst,Worry Free Labs,"Atlanta, GA",key responsibilities meeting with business par...,135,Atlanta,ios developer,,,"[key, responsibility, meet, business, partner,...",[]
4853,Account Executive,MacStadium,"Atlanta, GA 30305 (Buckhead area)",overview macstadium is the leading managed mac...,135,Atlanta,ios developer,,,"[overview, macstadium, lead, manage, mac, host...",[]
4854,"Innovation Software Architect I, Full Stack",InComm,"Atlanta, GA 30303 (Five Points area)",overview leveraging deep integrations into ret...,135,Atlanta,ios developer,,,"[overview, leverage, deep, integration, retail...",[python]
4855,UX/UI Designer,OutSystems,"Atlanta, GA 30319",job description lowcode style as the 1 lowcode...,135,Atlanta,ios developer,,,"[job, description, lowcode, style, 1, lowcode,...",[]


In [15]:
# Create a count function
def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(temp, columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc.sort_values(by='rank')

In [16]:
# Create a word count dataframe
wc = count(df['tokens_filtered'])
wc.head(10)

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
1,python,1399,1399,1.0,0.575483,0.575483,0.288097
0,r,732,732,2.0,0.301111,0.876594,0.150741
2,tableau,300,300,3.0,0.123406,1.0,0.061779
