In [None]:
import requests #allows programmer to send HTTP requests 
from bs4 import BeautifulSoup #imports python library that allows for scraping the web
import collections #collections provides solutions that would be tricky to implement
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer #imports Natural Language Processing functions
import pandas as pd #imports pandas, a Python library 
from operator import itemgetter #The operator modules exports a set of efficient functions corresponding to intrinsic 
#operators in python(i.e. +,-,*,etc.). Itemgetter allows programmer to retrieve something from a data structure much faster and easier
import numpy as np #imports numpy, a Python library that allows for simple and complex arithmetic calculations 
import nltk #imports natural language toolkit
from nltk.corpus import stopwords #imports stopwords function from natural language toolkit so that all stop words(i.e. and, is, but) can be filtered out
from nltk.tokenize import word_tokenize #imports word_tokenize function

In [None]:
# Fetch the job posting page. A timeout is passed because requests otherwise waits
# indefinitely, which would hang the kernel if the server never responds.
page = requests.get("https://www.linkedin.com/jobs/view/791327763/", timeout=30)

In [None]:
page  # display the Response object returned by the request above

In [None]:
page.status_code  # HTTP status code of the response; 200 indicates the request succeeded

In [None]:
page.content  # raw response body as bytes; this is what gets parsed as HTML below

In [None]:
soup = BeautifulSoup(page.content, "html.parser")  # parse the response body with the "html.parser" backend

In [None]:
print(soup.prettify())  # print the parsed document re-indented so the tag structure is easier to read

In [None]:
soup.find_all('meta')  # list every <meta> tag in the parsed page

In [None]:
len(soup.find_all('meta'))  # number of <meta> tags found in the page

In [None]:
# Select the <meta> tag at index 22 (zero-based, i.e. the 23rd tag on the page).
# NOTE(review): this is position-dependent; it raises IndexError when the page has
# fewer than 23 <meta> tags and picks a different tag if the page layout changes.
text = soup.find_all('meta')[22]

In [None]:
text  # display the <meta> tag selected by index above

In [None]:
# First <meta> tag whose property attribute equals 'og:description'.
# NOTE(review): find() yields None when no tag matches — confirm the tag is present
# in the fetched page before relying on my_text.
my_text = soup.find('meta', property='og:description')

In [None]:
# Read the tag's "content" attribute into mega_text.
# NOTE(review): raises AttributeError if my_text is None (tag not found), and the
# result is None if the tag has no content attribute — verify before splitting it.
mega_text = my_text.get("content")

In [None]:
mega_text  # display the text taken from the og:description tag

### Creating an empty dictionary so that each word in the main body of the job description can be stored together with its word count. The problem with this approach is that there are far too many duplicate entries, and numerous stop words have the highest word counts in the job description and will need to be removed.

In [None]:
# Tally whole whitespace-separated tokens. Counter is used instead of str.count()
# because str.count() matches substrings (e.g. "a" inside "data"), which inflates
# the counts of short words, and rescans the whole text for every token.
# Keys keep first-occurrence order.
word_dict = dict(collections.Counter(mega_text.split()))
for word, count in word_dict.items():  # one output line per distinct word
    print(word, count)

In [None]:
word_dict  # display the word -> count dictionary

### In this approach, the CountVectorizer class (a Natural Language Processing tool from scikit-learn) is used to compute the count of each word in the main body of the job description.

In [None]:
# Build the vectorizer with a small hand-picked list of words to ignore.
custom_stop_words = ['and', 'And', 'to', 'or', 'a']
cv = CountVectorizer(stop_words=custom_stop_words)

In [None]:
cv.fit(mega_text.split())  # learn the vocabulary; each whitespace-separated token is passed as its own document

In [None]:
cv_all = cv.transform(mega_text.split())  # count matrix: one row per input token, one column per vocabulary term

In [None]:
cv_all  # display cv_all; CountVectorizer.transform returns a SciPy sparse matrix

In [None]:
type(cv_all)  # show the concrete type of cv_all

In [None]:
# Column labels are the vocabulary terms learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# toarray() yields a plain ndarray rather than the np.matrix returned by todense().
cv_df = pd.DataFrame(cv_all.toarray(), columns=feature_names)

In [None]:
cv_df.shape  # (number of rows, number of columns) of cv_df

In [None]:
# Number of whitespace-separated tokens in mega_text; len() replaces a manual
# counter loop that did the same work one increment at a time.
j = len(mega_text.split())

print(j)

In [None]:
cv_df.columns  # column labels of cv_df

In [None]:
# Map every column label of cv_df to the sum of that column, i.e. the total
# count recorded for that term across all rows.
w_dict = {term: cv_df[term].sum() for term in cv_df.columns}

print(w_dict)

In [None]:
# Rank the (word, count) pairs of w_dict by count, largest first, and display the list.
sorted(w_dict.items(), key=itemgetter(1), reverse=True)

In [None]:
# Order the (word, count) pairs by descending count, then load them into an
# OrderedDict so that iteration follows that ranking.
ranked_pairs = sorted(w_dict.items(), key=itemgetter(1), reverse=True)
d = collections.OrderedDict(ranked_pairs)

In [None]:
d  # display the OrderedDict of word counts, highest count first

### Using the Natural Language Toolkit (NLTK) to determine word counts so that all of the stop words can be filtered out with just a couple of lines of code. This provides a faster and more efficient way of counting only the words that matter in a job description.

In [None]:
nltk.__version__  # version string of the installed NLTK package


In [None]:
# Load NLTK's English stop-word list as a set for fast membership tests.
# The corpus file id is lowercase ('english'); 'English' fails to resolve on
# case-sensitive file systems. The corpus must already be installed, e.g. via
# nltk.download('stopwords').
stop_words = set(stopwords.words('english'))

In [None]:
# Keep the tokens of mega_text that are not stop words. The comparison is done on
# the lowercased token because the entries in stop_words are lowercase; without it,
# capitalised stop words such as "The" or "And" slip through. Original casing is kept
# in the result.
real_words = [i for i in mega_text.split() if i.lower() not in stop_words]

In [None]:
real_words  # display the tokens that survived the stop-word filter

In [None]:
# Tally each non-stopword as a whole token of real_words, so the printed and the
# stored counts come from the same source. Counting with mega_text.count() instead
# would match substrings of the full text and disagree with real_words.count().
# Keys keep first-occurrence order.
word_dict1 = dict(collections.Counter(real_words))
for word, count in word_dict1.items():  # one output line per distinct word
    print(word, count)

In [None]:
word_dict1  # display the non-stopword -> count dictionary

In [None]:
# Display the (word, count) pairs of word_dict1 as a list ordered by count, largest first.
sorted(word_dict1.items(), key=lambda pair: pair[1], reverse=True)