In [1]:
%matplotlib inline
import string
import collections
import pandas as pd
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.graph_objects as go
from PIL import Image
import plotly.offline as pyo

import re
import nltk
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import urllib
import requests

from src.get_skills import *
from src.get_industries import *
import textwrap

## Data Loader

In [2]:
# Read the JobsPikr data-scientist job postings export into a DataFrame.
raw_data = pd.read_csv(
    filepath_or_buffer='data/data_scientist_united_states_job_postings_jobspikr.csv'
)

## Get the Skills from Job Descriptions

In [3]:
# Run get_skills (from src.get_skills) over every job description and store the
# result per posting in a new 'skills' column.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# column may not have been populated in the saved run.
job_skills = raw_data['job_description'].apply(get_skills)
raw_data.loc[:, 'skills'] = job_skills

KeyboardInterrupt: 

## Visualization of the most common Tools

In [None]:
# Flatten the per-posting skill collections into one long array of terms.
skills = np.concatenate(raw_data['skills'].values)

# Terms that are majors, keywords or degrees are counted in later sections;
# everything else is treated as a tool. The exclusion set is built once, rather
# than re-concatenating the three lists for every word.
# (assumes majors, keywords and degree hold hashable strings -- TODO confirm)
non_tool_terms = set(majors) | set(keywords) | set(degree)
tools = [word for word in skills if word not in non_tool_terms]

toolcount = Counter(tools)
toolcount.most_common(25)

In [None]:
# Tabulate the tool counts, most frequent first.
tools_df = pd.DataFrame(
    {'count': list(toolcount.values()), 'tool': list(toolcount.keys())}
).sort_values(by=['count'], ascending=False)

# Radar (polar) chart of the twelve most frequent tools.
radar_trace = go.Scatterpolar(
    r=tools_df.head(12)["count"],
    theta=tools_df.head(12)["tool"],
    fill='toself',
)
fig = go.Figure(data=radar_trace)

fig.update_layout(
    polar={'radialaxis': {'visible': True}},
    showlegend=False,
)

fig.show()

## Visualization of the most sought-after Majors

In [None]:
# Keep only the extracted terms that appear in `majors`, then tally them.
major = list(filter(lambda term: term in majors, skills))
majorcount = Counter(major)

In [None]:
# Display the tally of each major term found in the extracted skills.
majorcount

In [None]:
# Horizontal bar chart of how often each major is mentioned, most frequent first.
major_df = pd.DataFrame({'count': list(majorcount.values()), 'major': list(majorcount.keys())})
major_df = major_df.sort_values(by=['count'], ascending=False)
major_df['major'] = major_df['major'].apply(string.capwords)  # title-case the labels

sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(18, 8))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(x="count", y="major", data=major_df, color=sns.xkcd_rgb["windows blue"], ax=ax)

# The on/off flag is passed positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed, so `b=True` fails on current
# releases while the positional form works on old and new versions alike.
ax.grid(True, color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

# Write each bar's count just past the end of the bar.
for bar in ax.patches:
    ax.text(bar.get_width() + 5, bar.get_y() + 0.5, str(round(bar.get_width(), 2)),
            fontsize=15, color='grey')

ax.tick_params(labelsize=16)
ax.set_xlabel('Count', fontsize=20)
ax.set_ylabel('Major', fontsize=20);  # trailing ';' hides the Text(...) repr

## Visualization of the most sought-after Qualifications

In [None]:
# Keep only the extracted terms that appear in `degree`, then tally them.
qualification = list(filter(lambda term: term in degree, skills))
qualificationcount = Counter(qualification)

In [None]:
# Display the tally of each degree/qualification term found in the extracted skills.
qualificationcount

In [None]:
# Pie chart showing each qualification term's share of all qualification mentions.
fig1, ax1 = plt.subplots(figsize=(10, 12))

pie_options = dict(autopct='%1.1f%%', shadow=False, startangle=90)
wedges, texts, autotexts = ax1.pie(
    qualificationcount.values(),
    labels=qualificationcount.keys(),
    **pie_options
)

# Equal aspect ratio so the pie is drawn as a circle rather than an ellipse.
ax1.axis('equal')

# Enlarge the slice labels and the percentage annotations.
plt.setp(texts, size=20)
plt.setp(autotexts, size=30, weight="bold")
plt.show()

## Keywords most often seen in Job Descriptions

In [None]:
# Keep only the extracted terms that appear in `keywords`, then tally them.
keyword = [term for term in skills if term in keywords]
keywordcount = Counter()
keywordcount.update(keyword)

In [None]:
# Display the tally of each keyword term found in the extracted skills.
keywordcount

In [None]:
# Download the diamond-outline image that is used as the word-cloud mask.
mask_url = 'https://cdn0.iconfinder.com/data/icons/basic-shapes-outline-3/640/outline_diamond-512.png'
# timeout: do not let a stalled download hang the notebook.
# with-block: release the streamed connection once the image has been read.
with requests.get(mask_url, stream=True, timeout=30) as response:
    response.raise_for_status()  # fail here instead of handing an error page to PIL
    response.raw.decode_content = True  # undo gzip/deflate transfer encoding on the raw stream
    # np.array() forces the image to be fully read while the stream is still open.
    mask = np.array(Image.open(response.raw))

# Build the cloud from the keyword frequencies, limited to the 13 most frequent.
# NOTE(review): per the wordcloud documentation, width/height are ignored when a
# mask is supplied (the mask's shape is used); they are kept here unchanged.
wordcloud = WordCloud(
    width=1800,
    height=3000,
    max_words=13,
    max_font_size=50,
    min_font_size=10,
    background_color="white",
    mask=mask,
).generate_from_frequencies(keywordcount)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


## Industry Analysis

In [None]:

# Box-and-whisker plot of salaries by industry.
# Only 5% of jobs had a salary reported, so it is important to know what to
# expect when offered.


def _salary_midpoint_thousands(salary_text):
    """Turn a salary string such as '90-110' into one representative number.

    Only digits and '-' are kept; the remainder is split on '-' into bounds.
    A range wider than 400 (in the posting's own units, i.e. before the
    rescaling below) is treated as erroneous and collapsed to its lower bound.
    The bounds are then averaged, and a result above 10000 is divided by 1000
    so that figures quoted in dollars end up in thousands of dollars.

    Raises ValueError when a piece between dashes contains no digits.
    """
    kept = ''.join(ch for ch in salary_text if ch in '1234567890-')
    bounds = [float(part) for part in kept.split('-')]
    if len(bounds) == 1:
        # A single figure instead of a range: use it as both bounds. The
        # previous code indexed bounds[1] unconditionally and raised IndexError.
        bounds = bounds * 2
    if bounds[1] - bounds[0] > 400:  # remove erroneous salary ranges
        bounds[1] = bounds[0]
    midpoint = sum(bounds) / 2
    if midpoint > 10000:
        midpoint = midpoint / 1000
    return midpoint


# NOTE(review): this re-reads the CSV into `raw_data`, replacing any frame (and
# any 'skills' column) built by earlier cells.
raw_data = pd.read_csv('data/data_scientist_united_states_job_postings_jobspikr.csv')
raw_data['category'] = new_categories(raw_data)

# Keep postings with a salary string that is not one of the placeholder texts.
has_salary = raw_data['salary_offered'].notnull()
is_placeholder = raw_data['salary_offered'].isin(['Negotiable', 'Salary Range: Undisclosed'])
# .copy() gives an independent frame, so adding the 'salary' column below does
# not write into a slice of raw_data (avoids SettingWithCopyWarning).
data = raw_data[has_salary & ~is_placeholder].copy()
# Row label 2069 is removed by hand; presumably its salary text cannot be
# parsed by the helper above -- TODO confirm against the data.
data = data.drop([2069], axis=0)
data['salary'] = data['salary_offered'].map(_salary_midpoint_thousands)

# These two categories are left out of the chart.
data = data[~data['category'].isin(['education', 'arts & entertainment'])]

# Boxplot formatting
flierprops = dict(markerfacecolor='gold', marker='o')
boxprops = dict(linewidth=3, color='darkgoldenrod')
medianprops = dict(color='red', linewidth=2.5)
whiskerprops = dict(linewidth=3, color='firebrick')
boxplot = data.boxplot(column='salary', by='category', grid=False, fontsize=10, vert=False,
                       boxprops=boxprops, whiskerprops=whiskerprops, medianprops=medianprops,
                       capprops=whiskerprops, flierprops=flierprops,
                       patch_artist=True, return_type='dict')

# Recolour the drawn artists; these override the colours passed in the props above.
part_colors = {'boxes': 'tomato', 'medians': 'darkred', 'whiskers': 'k', 'caps': 'k'}
for key in boxplot.keys():
    for part, color in part_colors.items():
        for artist in boxplot[key][part]:
            artist.set_color(color)

plt.title('Expected salary by industry')
plt.suptitle('')  # clear the automatic "grouped by" super-title
plt.ylabel('Industry')
plt.xlabel('Salary (thousand dollars)')
plt.show()
#Save boxchart
#plt.savefig('myboxchart.png', dpi=1200,transparent=True, bbox_inches='tight') 

In [None]:
# Bubble plot of the industries: larger bubbles mean more jobs in that industry.
raw_data = pd.read_csv('data/data_scientist_united_states_job_postings_jobspikr.csv')
new_data_cat = new_categories(raw_data)
indus = (
    new_data_cat.groupby('category')['category']
    .count()
    .reset_index(name='Count')
)

# Hand-entered flag per category row (in groupby order): 80 marks the rows drawn
# in the darker colour. The list has 14 entries, so 14 categories are assumed.
indus['percent'] = [0, 0, 0, 80, 80, 80, 0, 0, 80, 80, 0, 0, 80, 80]

# Swap some rows for appearance.
for first, second in ((0, 9), (3, 7)):
    row_first, row_second = indus.iloc[first].copy(), indus.iloc[second].copy()
    indus.iloc[first], indus.iloc[second] = row_second, row_first

colors = ['plum' if flag == 0 else 'mediumvioletred' for flag in indus['percent']]

# Bubble centres: x is the row position, y is a hand-picked height per row.
myxaxis = pd.Series(list(range(indus['category'].size)))
myyaxis = pd.Series([1250, 600, 1600, 200, 750, 1700, 1250, 250, 900, 1500, 450, 1100, 1600, 800])

# Plot attributes
plt.figure(figsize=(8, 6))
plt.title('What industries do data scientists work in?', fontdict={'fontsize': 20})
plt.ylim(-350, 2100)
plt.xlim(-2, 15)
plt.scatter(myxaxis, myyaxis, s=indus.Count*7, c=colors, edgecolor='None')

# Label each bubble with its category name, wrapped at 15 characters per line.
for idx, category_name in enumerate(indus['category']):
    label = ''.join(line + '\n' for line in textwrap.wrap(category_name, width=15))
    plt.annotate(label.upper(), (myxaxis[idx], myyaxis[idx]), wrap=True, weight='bold', size=7,
                 horizontalalignment='center', verticalalignment='center',
                 fontstretch='semi-condensed', family='fantasy')

leg = [
    mpatches.Patch(color='mediumvioletred', label='Comprise 80% of Data Science Jobs'),
    mpatches.Patch(color='plum', label='Other 20% of Data Science Jobs'),
]
plt.legend(handles=leg, loc='lower left')
plt.axis('off')
plt.show()
#Save bubblechart
#plt.savefig('mybubblechart.png', dpi=1200,transparent=True) 

## Location Analysis

In [None]:
# Count postings per state (rows without a state are ignored), largest first.
data = pd.read_csv('data/data_scientist_united_states_job_postings_jobspikr.csv')
data_location = data[data['state'].notnull()]
state = (
    data_location.groupby(['state'])['job_title']
    .count()
    .reset_index()
    .sort_values(['job_title'], ascending=False)
)
print('Top ten states offering the most jobs', state.head(10))

In [None]:
# Choropleth of job counts for the ten states with the most postings.
# NOTE(review): the state codes and names below are hard-coded and are assumed
# to match, in order, the top-ten rows of `state` printed by the previous cell.
# Re-check them whenever the data changes.
top_state_codes = ["CA", "NY", "VA", "TX", "MA", "IL", "WA", "NJ", "PA", "MD"]
top_state_names = ['California', 'New York', 'Virginia', 'Texas', 'Massachusetts', 'Illinois',
                   'Washington', 'New Jersey', 'Pennsylvania', 'Maryland']

fig = go.Figure(data=go.Choropleth(
    locations=top_state_codes,
    # Only the first ten counts are paired with the ten locations above; this
    # used to pass the count for every state.
    z=state['job_title'].head(len(top_state_codes)),
    locationmode='USA-states',
    colorscale='rdbu',
    autocolorscale=False,
    text=top_state_names,
    marker_line_color='white',  # line markers between states
    colorbar_title="Number of Jobs"
))

fig.update_layout(
    title_text='Density of job openings in different states in USA',
    geo=dict(
        scope='usa',
        projection=go.layout.geo.Projection(type='albers usa'),
        showlakes=True,  # lakes
        lakecolor='rgb(255, 255, 255)'),
)

fig.show()
