In [1]:
import re
import string
from collections import Counter
import squarify
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
from spacy.tokenizer import Tokenizer

from bs4 import BeautifulSoup
import html as ihtml

import requests
import sqlite3

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Hackathon/master/data/techsearch.csv')
df = df.drop(df.columns[0], axis=1)

In [3]:
def clean_text(text):
    text = text.replace('\n', ' ')                # remove newline
    text = BeautifulSoup(text, "lxml").get_text() # remove html
    text = text.replace('/', ' ')                 # remove forward slashes
    text = re.sub(r'[^a-zA-Z ^0-9]', '', text)    # letters and numbers only
    text = text.lower()                           # lower case
    text = re.sub(r'(x.[0-9])', '', text)         # remove special characters
    return text

df['description'] = df.apply(lambda x: clean_text(x['description']), axis=1)

In [4]:
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)
STOP_WORDS = nlp.Defaults.stop_words.union(['year'])

In [5]:
# Tokenizer pipe removing stop words and blank words and lemmatizing
tokens = []

for doc in tokenizer.pipe(df['description'], batch_size=500):
    
    doc_tokens = []
    for token in doc:
        if (token.lemma_ not in STOP_WORDS) & (token.text != ' '):
            doc_tokens.append(token.lemma_)

    tokens.append(doc_tokens)

df['tokens'] = tokens

In [6]:
df.head()

Unnamed: 0,job_title,company,location,description,counts,city,job,low_salary,high_salary,tokens
0,Data Scientist,Pactera,,looking fora data scientis architect who has 8...,1319,San Jose,data scientist,,,"[look, forum, datum, scientis, architect, 8, y..."
1,Data scientist - Global Sales,PayPal,,responsibilities provide business requirement...,1319,San Jose,data scientist,,,"[responsibility, provide, business, requiremen..."
2,Data Scientist,Palo Verde Consulting,"Campbell, CA 95008 (Central Campbell area)",job title data scientistlocation campbell ca 9...,1319,San Jose,data scientist,150000.0,210000.0,"[job, title, datum, scientistlocation, campbel..."
3,Data Scientist,Spry Health,"Palo Alto, CA",spry healths mission is to build the worlds la...,1319,San Jose,data scientist,100000.0,135000.0,"[spry, healths, mission, build, world, large, ..."
4,Data Scientist (All Levels) - Santa Clara,LeanTaaS,"Santa Clara, CA 95050",help build technology that saves lives were a...,1319,San Jose,data scientist,,,"[help, build, technology, save, live, fast, gr..."


In [7]:
tech_terms = ['python', 'r', 'sql', 'hadoop', 'spark', 'java', 'sas', 'tableau',
              'hive', 'scala', 'aws', 'c', 'c++', 'matlab', 'tensorflow', 'excel',
              'nosql', 'linux', 'azure', 'scikit', 'machine learning', 'statistic',
              'analysis', 'computer science', 'visual', 'ai', 'deep learning',
              'nlp', 'natural language processing', 'neural network', 'mathematic',
              'database', 'oop', 'blockchain',
              'html', 'css', 'javascript', 'jquery', 'git', 'photoshop', 'illustrator',
              'word press', 'seo', 'responsive design', 'php', 'mobile', 'design', 'react',
              'security', 'ruby', 'fireworks', 'json', 'node', 'express', 'redux', 'ajax',
              'java', 'api', 'state management',
              'wireframe', 'ui prototype', 'ux writing', 'interactive design',
              'metric', 'analytic', 'ux research', 'empathy', 'collaborate', 'mockup', 
              'prototype', 'test', 'ideate', 'usability', 'high-fidelity design',
              'framework',
              'swift', 'xcode', 'spatial reasoning', 'human interface', 'core data',
              'grand central', 'network', 'objective-c', 'foundation', 'uikit', 
              'cocoatouch', 'spritekit', 'scenekit', 'opengl', 'metal', 'api', 'iot',
              'karma']

In [8]:
df['tokens_filtered'] = df.apply(lambda x: list(set(x['tokens']) & set(tech_terms)), axis=1)

In [9]:
df.head()

Unnamed: 0,job_title,company,location,description,counts,city,job,low_salary,high_salary,tokens,tokens_filtered
0,Data Scientist,Pactera,,looking fora data scientis architect who has 8...,1319,San Jose,data scientist,,,"[look, forum, datum, scientis, architect, 8, y...","[r, network, design, python]"
1,Data scientist - Global Sales,PayPal,,responsibilities provide business requirement...,1319,San Jose,data scientist,,,"[responsibility, provide, business, requiremen...","[sql, analytic, tableau, design, analysis, pyt..."
2,Data Scientist,Palo Verde Consulting,"Campbell, CA 95008 (Central Campbell area)",job title data scientistlocation campbell ca 9...,1319,San Jose,data scientist,150000.0,210000.0,"[job, title, datum, scientistlocation, campbel...","[network, python]"
3,Data Scientist,Spry Health,"Palo Alto, CA",spry healths mission is to build the worlds la...,1319,San Jose,data scientist,100000.0,135000.0,"[spry, healths, mission, build, world, large, ...","[analytic, design, python, foundation, git, an..."
4,Data Scientist (All Levels) - Santa Clara,LeanTaaS,"Santa Clara, CA 95050",help build technology that saves lives were a...,1319,San Jose,data scientist,,,"[help, build, technology, save, live, fast, gr...","[test, analytic, design, python, analysis, sql]"


In [10]:
# Create a count function
def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(temp, columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc.sort_values(by='rank')

In [23]:
def populate_df(title, city):
    j_title = df['job'] == title
    j_city = df['city'] == city
    subset_df = df[j_title & j_city]
    subset_df = subset_df.reset_index()
    
    wc = count(subset_df['tokens_filtered'])
    skills = wc['word'][:10]
    
    data = {'job': title,
            'city': city,
            'counts': subset_df['counts'][0],
            'low_salary': 10000,
            'high_salary': 100000,
            'skills': list(skills)}
    
    return data
    

In [24]:
final_df = pd.DataFrame(columns=['job', 'city', 'counts', 'low_salary', 'high_salary', 'skills'])

test = populate_df('data scientist', 'San Jose')
print(test)
final_df = final_df.append(test, ignore_index=True)

{'job': 'data scientist', 'city': 'San Jose', 'counts': 1319, 'low_salary': 10000, 'high_salary': 100000, 'skills': ['python', 'analysis', 'statistic', 'design', 'r', 'test', 'analytic', 'sql', 'network', 'collaborate']}


In [25]:
test = populate_df('data scientist', 'San Francisco')
print(test)
final_df = final_df.append(test, ignore_index=True)

{'job': 'data scientist', 'city': 'San Francisco', 'counts': 937, 'low_salary': 10000, 'high_salary': 100000, 'skills': ['python', 'analysis', 'statistic', 'r', 'design', 'sql', 'test', 'collaborate', 'analytic', 'tableau']}


In [26]:
final_df.head()

Unnamed: 0,job,city,counts,low_salary,high_salary,skills
0,data scientist,San Jose,1319,10000,100000,"[python, analysis, statistic, design, r, test,..."
1,data scientist,San Francisco,937,10000,100000,"[python, analysis, statistic, r, design, sql, ..."
