In [1]:
import requests #allows programmer to send HTTP requests 
from bs4 import BeautifulSoup #imports python library that allows for scraping the web
import collections #collections provides solutions that would be tricky to implement
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer #imports Natural Language Processing functions
import pandas as pd #imports pandas, a Python library 
from operator import itemgetter #The operator modules exports a set of efficient functions corresponding to intrinsic 
#operators in python(i.e. +,-,*,etc.). Itemgetter allows programmer to retrieve something from a data structure much faster and easier
import numpy as np #imports numpy, a Python library that allows for simple and complex arithmetic calculations 
import nltk #imports natural language toolkit
from nltk.corpus import stopwords #imports stopwords function from natural language toolkit so that all stop words(i.e. and, is, but) can be filtered out
from nltk.tokenize import word_tokenize #imports word_tokenize function

In [2]:
page = requests.get("https://www.linkedin.com/jobs/view/791327763/") #gives you access to webpage

In [3]:
page

<Response [200]>

In [4]:
page.status_code #checks to see whether getting access to webpage was successful

200

In [5]:
page.content #shows content of page



In [6]:
soup = BeautifulSoup(page.content, "html.parser") #parsing webspage using HTML

In [7]:
print(soup.prettify()) #organizes content of webpage so that it is easier to read

<!DOCTYPE html>
<!--[if lt IE 7]> <html lang="en" class="ie ie6 lte9 lte8 lte7 os-other"> <![endif]-->
<!--[if IE 7]> <html lang="en" class="ie ie7 lte9 lte8 lte7 os-other"> <![endif]-->
<!--[if IE 8]> <html lang="en" class="ie ie8 lte9 lte8 os-other"> <![endif]-->
<!--[if IE 9]> <html lang="en" class="ie ie9 lte9 os-other"> <![endif]-->
<!--[if gt IE 9]> <html lang="en" class="os-other"> <![endif]-->
<!--[if !IE]><!-->
<html class="os-other" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <title>
   Electronic Arts hiring Data Scientist in Austin, TX, US | Linkedin Jobs
  </title>
  <meta content="origin" name="referrer"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="e7ccddc3-af83-4e0f-927f-ff1c5a142da7" name="pageImpressionID"/>
  <meta content="chrome" name="appName"/>
  <meta content="d_jobs_job_detail_premium_guest" name="pageKey"/>
  <meta content="coEaecmRVRXgN0zzKSsAAA==" name="treeID"/>
  <meta content="//www.linkedin.com/mob/track

In [8]:
soup.find_all('meta') #finds all text within web page with the meta tag

[<meta charset="utf-8"/>,
 <meta content="origin" name="referrer"/>,
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>,
 <meta content="e7ccddc3-af83-4e0f-927f-ff1c5a142da7" name="pageImpressionID"/>,
 <meta content="chrome" name="appName"/>,
 <meta content="d_jobs_job_detail_premium_guest" name="pageKey"/>,
 <meta content="coEaecmRVRXgN0zzKSsAAA==" name="treeID"/>,
 <meta content="//www.linkedin.com/mob/tracking" name="globalTrackingUrl"/>,
 <meta content="chrome" name="globalTrackingAppName"/>,
 <meta content="webTracking" name="globalTrackingAppId"/>,
 <meta content="https://static.licdn.com/scds/common/u/images/logos/linkedin/logo-in-win8-tile-144_v1.png" name="msapplication-TileImage">
 <meta content="#0077B5" name="msapplication-TileColor">
 <meta content="LinkedIn" name="application-name">
 <meta content="https://static.licdn.com/scds/concat/common/js?v=0.1.576" name="RemoteNavJSContentBaseURL">
 <script src="https://static.licdn.com:443/scds/common/u/lib/fizzy/fz-1.3.8-mi

In [9]:
len(soup.find_all('meta')) #calculates the number of tags with the meta tag

34

In [10]:
text = soup.find_all('meta')[22] #stores the contents of the 22nd tag with the meta tag

In [11]:
text #shows the content of text 

<meta content="Data Scientist, EA Customer Experience#WeAreEA and we exist to inspire the world to play. EA values creativity, pioneering, passion, determination, learning, and teamwork. We mean things like acting with curiosity, speaking up with original ideas, and committing to each other as one team.We’re looking for all the best kinds of people to make great experiences for our players. The best people want a job that inspires them, while giving them room to enjoy their lives. And we want to give them that. We celebrate diversity and inclusion by creating a place where you can come to work and be yourself.You’re a Data Scientist.You Understand That You Have 2 Basic JobsYou know how to pull all types of data from any source, be it Facebook posts, call center chats, video likes or survey results.Help our studios understand what problems players have with their games and provide insights that help optimize our call center. Both pursue the ultimate goal of providing the World’s Best Se

In [12]:
my_text = soup.find('meta', property='og:description') #finds section of text with meta tag and property = 'og:description'

In [13]:
mega_text = my_text.get("content") #gets the content of that meta tag and stores it in mega_text

In [14]:
mega_text #shows content of mega_text 

'Data Scientist, EA Customer Experience#WeAreEA and we exist to inspire the world to play. EA values creativity, pioneering, passion, determination, learning, and teamwork. We mean things like acting with curiosity, speaking up with original ideas, and committing to each other as one team.We’re looking for all the best kinds of people to make great experiences for our players. The best people want a job that inspires them, while giving them room to enjoy their lives. And we want to give them that. We celebrate diversity and inclusion by creating a place where you can come to work and be yourself.You’re a Data Scientist.You Understand That You Have 2 Basic JobsYou know how to pull all types of data from any source, be it Facebook posts, call center chats, video likes or survey results.Help our studios understand what problems players have with their games and provide insights that help optimize our call center. Both pursue the ultimate goal of providing the World’s Best Service for the 

### Creating empty dictionary so that the word and its word count in main body of job description can be stored in this dictionary. The problem with this approach is that there are way too many duplilcates in dictionary plus there are numerous stop words with the highest word counts in job description that will need to be removed 

In [15]:
word_dict = {} #initializes empty dictionary 
for i in mega_text.split(): #loops through each word in mega_text
    print(i, mega_text.count(i)) #prints word and its word count in mega_text 
    word_dict[i] = mega_text.count(i) #stores word and word count in word_dict dictionary 

Data 7
Scientist, 1
EA 8
Customer 6
Experience#WeAreEA 1
and 57
we 15
exist 1
to 68
inspire 2
the 58
world 3
to 68
play. 1
EA 8
values 1
creativity, 1
pioneering, 1
passion, 1
determination, 1
learning, 1
and 57
teamwork. 1
We 18
mean 1
things 3
like 4
acting 1
with 16
curiosity, 1
speaking 1
up 12
with 16
original 1
ideas, 1
and 57
committing 1
to 68
each 4
other 5
as 24
one 7
team.We’re 1
looking 3
for 14
all 11
the 58
best 3
kinds 1
of 27
people 7
to 68
make 7
great 4
experiences 4
for 14
our 24
players. 2
The 3
best 3
people 7
want 3
a 492
job 1
that 14
inspires 1
them, 1
while 2
giving 1
them 3
room 1
to 68
enjoy 1
their 6
lives. 1
And 2
we 15
want 3
to 68
give 1
them 3
that. 1
We 18
celebrate 1
diversity 1
and 57
inclusion 1
by 2
creating 1
a 492
place 1
where 2
you 33
can 8
come 2
to 68
work 12
and 57
be 13
yourself.You’re 1
a 492
Data 7
Scientist.You 1
Understand 1
That 1
You 14
Have 2
2 2
Basic 1
JobsYou 1
know 4
how 8
to 68
pull 1
all 11
types 1
of 27
data 3
from 5
any 11
sou

In [16]:
word_dict #prints contents of dictionary 

{'Data': 7,
 'Scientist,': 1,
 'EA': 8,
 'Customer': 6,
 'Experience#WeAreEA': 1,
 'and': 57,
 'we': 15,
 'exist': 1,
 'to': 68,
 'inspire': 2,
 'the': 58,
 'world': 3,
 'play.': 1,
 'values': 1,
 'creativity,': 1,
 'pioneering,': 1,
 'passion,': 1,
 'determination,': 1,
 'learning,': 1,
 'teamwork.': 1,
 'We': 18,
 'mean': 1,
 'things': 3,
 'like': 4,
 'acting': 1,
 'with': 16,
 'curiosity,': 1,
 'speaking': 1,
 'up': 12,
 'original': 1,
 'ideas,': 1,
 'committing': 1,
 'each': 4,
 'other': 5,
 'as': 24,
 'one': 7,
 'team.We’re': 1,
 'looking': 3,
 'for': 14,
 'all': 11,
 'best': 3,
 'kinds': 1,
 'of': 27,
 'people': 7,
 'make': 7,
 'great': 4,
 'experiences': 4,
 'our': 24,
 'players.': 2,
 'The': 3,
 'want': 3,
 'a': 492,
 'job': 1,
 'that': 14,
 'inspires': 1,
 'them,': 1,
 'while': 2,
 'giving': 1,
 'them': 3,
 'room': 1,
 'enjoy': 1,
 'their': 6,
 'lives.': 1,
 'And': 2,
 'give': 1,
 'that.': 1,
 'celebrate': 1,
 'diversity': 1,
 'inclusion': 1,
 'by': 2,
 'creating': 1,
 'place'

### In this approach, the Count Vectorizer(a Natural Language Processing technique) function is used to compute the word count of each word in main body of job description. 

In [17]:
cv = CountVectorizer(stop_words=['and', 'And', 'to', 'or', 'a'] ) #initializes CountVectorizer function with specific stop words

In [18]:
cv.fit(mega_text.split()) #fits all of the words inside mega_text with CountVectorizer function 

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['and', 'And', 'to', 'or', 'a'], strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [19]:
cv_all = cv.transform(mega_text.split()) #transforms words with CountVectorizer function 

In [20]:
cv_all #displays contents of cv_all, which is currently a sparse matrix 

<1171x533 sparse matrix of type '<class 'numpy.int64'>'
	with 1069 stored elements in Compressed Sparse Row format>

In [21]:
type(cv_all) #shows the data type of cv_all

scipy.sparse.csr.csr_matrix

In [22]:
cv_df = pd.DataFrame(cv_all.todense(), columns=cv.get_feature_names()) #converts the sparse matrix cv_all into Data Frame cv_df

In [23]:
cv_df.shape #shows the number of rows and columns in new dataframe

(1171, 533)

In [24]:
j = 0 #initializes j to 0
for i in mega_text.split(): #loops through words in mega_text
    j+=1 #increments j

print(j) #prints j, or the number of words that are in mega_text 

1171


In [25]:
cv_df.columns #shows the columns in cv_df dataframe 

Index(['24hrs', '401k', 'about', 'acting', 'active', 'add', 'advanced',
       'advisor', 'advisors', 'advocate',
       ...
       'write', 'writing', 'wrote', 'www', 'yeah', 'years', 'you', 'your',
       'yourself', 'zepplin'],
      dtype='object', length=533)

In [26]:
w_dict = {} #initializes empty dictionary 
for i in cv_df.columns: #loops through columns in cv_df dataframe
    w_dict[i] = cv_df[i].sum() #stores the word as key and its word count as a value in w_dict dictionary 
    
print(w_dict) #prints the contents of dictionary 

{'24hrs': 1, '401k': 1, 'about': 9, 'acting': 1, 'active': 1, 'add': 1, 'advanced': 1, 'advisor': 1, 'advisors': 1, 'advocate': 1, 'after': 1, 'algorithm': 1, 'all': 6, 'allows': 1, 'also': 1, 'always': 1, 'an': 6, 'analyses': 1, 'analysis': 1, 'analyst': 1, 'analysts': 1, 'analytics': 1, 'answersnlp': 1, 'any': 7, 'anything': 1, 'apart': 1, 'application': 1, 'approachable': 1, 'are': 6, 'around': 1, 'arts': 1, 'as': 4, 'asked': 1, 'at': 6, 'audience': 2, 'austinelectronic': 1, 'available': 1, 'avoid': 1, 'awesome': 1, 'aws': 1, 'bachelor': 1, 'back': 1, 'based': 1, 'basic': 2, 'be': 6, 'because': 1, 'benefit': 1, 'best': 5, 'better': 1, 'bi': 1, 'big': 2, 'blogs': 1, 'both': 2, 'brag': 1, 'break': 1, 'brings': 1, 'business': 1, 'but': 1, 'by': 2, 'call': 3, 'can': 6, 'candisplay': 1, 'care': 3, 'careers': 2, 'catch': 1, 'celebrate': 1, 'center': 3, 'change': 2, 'changes': 1, 'changetailor': 1, 'chat': 2, 'chats': 1, 'check': 1, 'class': 2, 'code': 5, 'coding': 1, 'coffee': 1, 'collabo

In [27]:
sorted(w_dict.items(), key=lambda x:x[1], reverse=True) #takes the word count of every word in mega_text and sorts it from highest 
#word count to lowest word count 

[('the', 38),
 ('you', 35),
 ('we', 21),
 ('of', 20),
 ('re', 17),
 ('with', 16),
 ('on', 15),
 ('that', 15),
 ('in', 14),
 ('for', 12),
 ('our', 10),
 ('about', 9),
 ('data', 9),
 ('experience', 9),
 ('ea', 8),
 ('is', 8),
 ('work', 8),
 ('your', 8),
 ('any', 7),
 ('customer', 7),
 ('games', 7),
 ('make', 7),
 ('people', 7),
 ('players', 7),
 ('team', 7),
 ('up', 7),
 ('all', 6),
 ('an', 6),
 ('are', 6),
 ('at', 6),
 ('be', 6),
 ('can', 6),
 ('how', 6),
 ('their', 6),
 ('best', 5),
 ('code', 5),
 ('experiences', 5),
 ('from', 5),
 ('help', 5),
 ('it', 5),
 ('last', 5),
 ('other', 5),
 ('right', 5),
 ('they', 5),
 ('understand', 5),
 ('what', 5),
 ('world', 5),
 ('as', 4),
 ('expert', 4),
 ('great', 4),
 ('language', 4),
 ('problems', 4),
 ('science', 4),
 ('time', 4),
 ('who', 4),
 ('call', 3),
 ('care', 3),
 ('center', 3),
 ('day', 3),
 ('each', 3),
 ('have', 3),
 ('insights', 3),
 ('into', 3),
 ('know', 3),
 ('like', 3),
 ('looking', 3),
 ('need', 3),
 ('next', 3),
 ('one', 3),
 ('p

In [28]:
#Uses OrderedDict function in order to sort the word count of each word from highest to lowest 
d = collections.OrderedDict(sorted(w_dict.items(), key=itemgetter(1), reverse=True)) 

In [29]:
d #generates content of d 

OrderedDict([('the', 38),
             ('you', 35),
             ('we', 21),
             ('of', 20),
             ('re', 17),
             ('with', 16),
             ('on', 15),
             ('that', 15),
             ('in', 14),
             ('for', 12),
             ('our', 10),
             ('about', 9),
             ('data', 9),
             ('experience', 9),
             ('ea', 8),
             ('is', 8),
             ('work', 8),
             ('your', 8),
             ('any', 7),
             ('customer', 7),
             ('games', 7),
             ('make', 7),
             ('people', 7),
             ('players', 7),
             ('team', 7),
             ('up', 7),
             ('all', 6),
             ('an', 6),
             ('are', 6),
             ('at', 6),
             ('be', 6),
             ('can', 6),
             ('how', 6),
             ('their', 6),
             ('best', 5),
             ('code', 5),
             ('experiences', 5),
             ('from', 5),
       

### Using Natural Language Toolkit(NLTK) to determine word count so that all of the stop words can be filtered with just a couple of lines of code. Provides a faster and more efficient way of determining the word count of only words that matter in a job description 

In [30]:
nltk.__version__ #gets current version of nltk


'3.3'

In [31]:
stop_words =set(stopwords.words('English')) #finds all of the stop words in the English language

In [32]:
real_words = [i for i in mega_text.split() if i not in stop_words] #gathers a list of non-stopwords from the meta tag job description

In [33]:
real_words #shows all of the real words(non-stopwords )

['Data',
 'Scientist,',
 'EA',
 'Customer',
 'Experience#WeAreEA',
 'exist',
 'inspire',
 'world',
 'play.',
 'EA',
 'values',
 'creativity,',
 'pioneering,',
 'passion,',
 'determination,',
 'learning,',
 'teamwork.',
 'We',
 'mean',
 'things',
 'like',
 'acting',
 'curiosity,',
 'speaking',
 'original',
 'ideas,',
 'committing',
 'one',
 'team.We’re',
 'looking',
 'best',
 'kinds',
 'people',
 'make',
 'great',
 'experiences',
 'players.',
 'The',
 'best',
 'people',
 'want',
 'job',
 'inspires',
 'them,',
 'giving',
 'room',
 'enjoy',
 'lives.',
 'And',
 'want',
 'give',
 'that.',
 'We',
 'celebrate',
 'diversity',
 'inclusion',
 'creating',
 'place',
 'come',
 'work',
 'yourself.You’re',
 'Data',
 'Scientist.You',
 'Understand',
 'That',
 'You',
 'Have',
 '2',
 'Basic',
 'JobsYou',
 'know',
 'pull',
 'types',
 'data',
 'source,',
 'Facebook',
 'posts,',
 'call',
 'center',
 'chats,',
 'video',
 'likes',
 'survey',
 'results.Help',
 'studios',
 'understand',
 'problems',
 'players',

In [34]:
word_dict1 = {} #creates empty dictionary 
for i in real_words: #loops through all of the non-stopwords in job description 
    print(i, real_words.count(i)) #prints the non-stopwords along with its word count 
    word_dict1[i] = mega_text.count(i) #stores non-stopword as a key and its word count as a value in dictionary 

Data 7
Scientist, 1
EA 4
Customer 5
Experience#WeAreEA 1
exist 1
inspire 1
world 1
play. 1
EA 4
values 1
creativity, 1
pioneering, 1
passion, 1
determination, 1
learning, 1
teamwork. 1
We 5
mean 1
things 3
like 3
acting 1
curiosity, 1
speaking 1
original 1
ideas, 1
committing 1
one 3
team.We’re 1
looking 3
best 3
kinds 1
people 6
make 7
great 4
experiences 2
players. 2
The 2
best 3
people 6
want 3
job 1
inspires 1
them, 1
giving 1
room 1
enjoy 1
lives. 1
And 2
want 3
give 1
that. 1
We 5
celebrate 1
diversity 1
inclusion 1
creating 1
place 1
come 1
work 6
yourself.You’re 1
Data 7
Scientist.You 1
Understand 1
That 1
You 5
Have 1
2 1
Basic 1
JobsYou 1
know 3
pull 1
types 1
data 2
source, 1
Facebook 1
posts, 1
call 2
center 1
chats, 1
video 1
likes 1
survey 1
results.Help 1
studios 1
understand 4
problems 3
players 4
games 4
provide 2
insights 2
help 4
optimize 1
call 2
center. 1
Both 1
pursue 1
ultimate 1
goal 1
providing 2
World’s 2
Best 2
Service 1
World’s 2
Best 2
Games.You 1
use 3
res

In [35]:
word_dict1 #generates contents of dictionary 

{'Data': 7,
 'Scientist,': 1,
 'EA': 8,
 'Customer': 6,
 'Experience#WeAreEA': 1,
 'exist': 1,
 'inspire': 2,
 'world': 3,
 'play.': 1,
 'values': 1,
 'creativity,': 1,
 'pioneering,': 1,
 'passion,': 1,
 'determination,': 1,
 'learning,': 1,
 'teamwork.': 1,
 'We': 18,
 'mean': 1,
 'things': 3,
 'like': 4,
 'acting': 1,
 'curiosity,': 1,
 'speaking': 1,
 'original': 1,
 'ideas,': 1,
 'committing': 1,
 'one': 7,
 'team.We’re': 1,
 'looking': 3,
 'best': 3,
 'kinds': 1,
 'people': 7,
 'make': 7,
 'great': 4,
 'experiences': 4,
 'players.': 2,
 'The': 3,
 'want': 3,
 'job': 1,
 'inspires': 1,
 'them,': 1,
 'giving': 1,
 'room': 1,
 'enjoy': 1,
 'lives.': 1,
 'And': 2,
 'give': 1,
 'that.': 1,
 'celebrate': 1,
 'diversity': 1,
 'inclusion': 1,
 'creating': 1,
 'place': 1,
 'come': 2,
 'work': 12,
 'yourself.You’re': 1,
 'Scientist.You': 1,
 'Understand': 1,
 'That': 1,
 'You': 14,
 'Have': 2,
 '2': 2,
 'Basic': 1,
 'JobsYou': 1,
 'know': 4,
 'pull': 1,
 'types': 1,
 'data': 3,
 'source,':

In [36]:
sorted(word_dict1.items(), key=itemgetter(1), reverse=True) #takes the word count of each non-stopword and stores it in a list

[('We', 18),
 ('You', 14),
 ('play', 14),
 ('work', 12),
 ('–', 12),
 ('player', 10),
 ('team', 10),
 ('experience', 9),
 ('EA', 8),
 ('Data', 7),
 ('one', 7),
 ('people', 7),
 ('make', 7),
 ('players', 7),
 ('Experience', 7),
 ('Customer', 6),
 ('understand', 6),
 ('games', 6),
 ('take', 6),
 ('R', 6),
 ('You’re', 5),
 ('care', 5),
 ('code', 5),
 ('change', 5),
 ('last', 5),
 ('you’re', 5),
 ('right', 5),
 ('model', 5),
 ('like', 4),
 ('great', 4),
 ('experiences', 4),
 ('know', 4),
 ('problems', 4),
 ('help', 4),
 ('use', 4),
 ('person', 4),
 ('next', 4),
 ('time', 4),
 ('part', 4),
 ('We’re', 4),
 ('Science', 4),
 ('day', 4),
 ('project', 4),
 ('side', 4),
 ('plan', 4),
 ('NLP', 4),
 ('test', 4),
 ('live', 4),
 ('world', 3),
 ('things', 3),
 ('looking', 3),
 ('best', 3),
 ('The', 3),
 ('want', 3),
 ('data', 3),
 ('solve', 3),
 ('new', 3),
 ('language', 3),
 ('expert', 3),
 ('organization', 3),
 ('need', 3),
 ('tech', 3),
 ('chat', 3),
 ('results', 3),
 ('find', 3),
 ('win', 3),
 ('p