In [2]:
import re
import string

# !pip install -U nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import re
from bs4 import BeautifulSoup
from urllib.request import urlopen

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\EDC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\EDC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Raise pandas' display limits so that long/wide frames are shown in full
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# 1) (optional) Scrape 100 Job Listings that contain the title "Data Scientist" from indeed.com

At a minimum your final dataframe of job listings should contain
- Job Title
- Job Description

In [4]:
# Raw text of every line of the remote CSV, header line included
job_listings = []

target_url = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-4-Sprint-2-NLP/master/module2-Bag-of-Words/job_listings.csv'

# Open the response in a `with` block so the connection is always closed,
# even if parsing one of the lines raises.
with urlopen(target_url) as html_page:
    # Iterating the response yields one raw line at a time
    for html_doc in html_page:
        # get_text() keeps only the text content of the line
        bs = BeautifulSoup(html_doc, 'html.parser')
        job_listings.append(bs.get_text())

In [None]:
# Here's another way to load the data -- it's simpler, though a few of the strings render slightly differently

# url = "https://raw.githubusercontent.com/LambdaSchool/DS-Unit-4-Sprint-2-NLP/master/module2-Bag-of-Words/job_listings.csv"
# df = pd.read_csv(url, index_col=0)
# df.head()
# df.description[0]

In [8]:
job_listings[0]

'0,"b""Job Requirements:\\nConceptual understanding in Machine Learning models like Nai\\xc2\\xa8ve Bayes, K-Means, SVM, Apriori, Linear/ Logistic Regression, Neural, Random Forests, Decision Trees, K-NN along with hands-on experience in at least 2 of them\\nIntermediate to expert level coding skills in Python/R. (Ability to write functions, clean and efficient data manipulation are mandatory for this role)\\nExposure to packages like NumPy, SciPy, Pandas, Matplotlib etc in Python or GGPlot2, dplyr, tidyR in R\\nAbility to communicate Model findings to both Technical and Non-Technical stake holders\\nHands on experience in SQL/Hive or similar programming language\\nMust show past work via GitHub, Kaggle or any other published article\\nMaster\'s degree in Statistics/Mathematics/Computer Science or any other quant specific field.\\nApply Now""",Data scientistÂ\xa0\n'

In [6]:
# Delete the CSV header row (the first item in the list).
# The pop is guarded so that re-running this cell cannot silently throw away
# a real job posting once the header is already gone.
HEADER_ROW = ',description,title'
header = (
    job_listings.pop(0)
    if job_listings and job_listings[0].startswith(HEADER_ROW)
    else None
)
# Show what was removed (nothing is displayed when there was no header left)
header

',description,title\n'

In [None]:
# Dictionary of job descriptions and job titles.
# Each posting is one raw CSV line of the form `<index>,"<description>",<title>\n`.
field_break = re.compile(r'",|\',')

post_dict = {'description': [], 'title': []}
for posting in job_listings:
    # The title is the LAST column, so cut the line at the last quote+comma.
    # Cutting at the first one would truncate any description that itself
    # contains `",` or `',` (e.g. "Bachelor's, Master's") and would put a
    # fragment of the description into the title column.
    breaks = list(field_break.finditer(posting))
    if not breaks:
        raise ValueError(
            'Cannot separate description from title in posting: {!r}'.format(posting)
        )
    last_break = breaks[-1]

    # Converting the literal two-character `\n` sequences and `/` into spaces
    description = posting[:last_break.start()]
    description = description.replace('\\n', ' ').replace('/', ' ')
    post_dict['description'].append(description)

    # Everything after the delimiter is the title; drop the line's trailing newline
    title = posting[last_break.end():].rstrip('\n')
    post_dict['title'].append(title)

In [None]:
# One row per posting, with the two columns collected in `post_dict`
df = pd.DataFrame(data=post_dict, columns=['description', 'title'])

In [None]:
df.head()

In [None]:
df.description[0]

# 2) Use NLTK to tokenize / clean the listings 

In [None]:
# Strip the leading row index and its comma (e.g. `0,`) from each description.
# lstrip() removes any run of these characters from the left edge only.
index_prefix_chars = '1234567890,'
df['description'] = df['description'].str.lstrip(index_prefix_chars)

In [None]:
# Removing the `b` marker and quote marks wrapped around the description text.
# Anchored patterns are used instead of str.strip('b"\''): strip() treats its
# argument as a set of characters, so it would also eat a genuine leading or
# trailing "b" of the text itself (e.g. `"b""big data ... web""` -> `ig data ... we`).
df.description = (
    df.description
    # leading quotes, then an optional `b` that is followed by at least one quote
    .str.replace(r'^["\']*(?:b["\']+)?', '', regex=True)
    # trailing quotes
    .str.replace(r'["\']+$', '', regex=True)
)

In [None]:
df.description.head()

In [None]:
df.description[0]

### Tokenizing Function

In [None]:
# Translation table for str.translate: every character of
# string.punctuation is mapped to None, which makes translate()
# delete that character. This is the same table that
# str.maketrans('', '', string.punctuation) builds.
translator = {ord(mark): None for mark in string.punctuation}

# English stopwords from nltk, kept as a set for membership tests
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)


# Word-level tokenizer used for both descriptions and titles
def tokenize(string):
    """Return the cleaned word tokens of `string` as a list.

    The text is split with nltk's `word_tokenize`. Each token is lowercased
    and has the characters listed in `translator` deleted. Tokens that are
    then not purely alphabetic, or that are in `stop_words`, are dropped.
    """
    # Lowercase first, then delete punctuation inside each token
    normalized = (
        token.lower().translate(translator)
        for token in word_tokenize(string)
    )

    # Keep alphabetic, non-stopword tokens in their original order
    return [
        word
        for word in normalized
        if word.isalpha() and word not in stop_words
    ]

In [None]:
sample_title = df.title[2]
tokenize(sample_title)

In [None]:
# Add a `tokenized_<column>` token-list column for each raw text column
for text_column in ('description', 'title'):
    df['tokenized_' + text_column] = df[text_column].apply(tokenize)

df.head()

# 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [None]:
# CountVectorizer takes each document as one "word word word" string,
# NOT as separate "word", "word", "word" tokens
def word_join(input_string, output_lst):
    """Append the items of `input_string`, joined by single spaces, to `output_lst`.

    `input_string` is an iterable of strings (a token list in this notebook).
    `output_lst` is modified in place; nothing is returned.
    """
    output_lst.append(" ".join(input_string))
    
# Build one space-joined document string per listing.
# Plain loops are used instead of Series.apply: apply is meant for
# transforming values, and using it only for the append side effect also
# makes the cell display a meaningless Series of None.
description_for_countvectorizer = []
for tokens in df.tokenized_description:
    word_join(tokens, description_for_countvectorizer)

title_for_countvectorizer = []
for tokens in df.tokenized_title:
    word_join(tokens, title_for_countvectorizer)

In [None]:
title_for_countvectorizer

In [None]:
def get_vectorize_df(string):
    """Return a bag-of-words count table for a collection of documents.

    `string` is an iterable of document strings (despite its name). The
    result is a DataFrame with one row per document and one column per
    vocabulary term found by a default `CountVectorizer`; each cell is the
    number of times that term occurs in that document.
    """
    # Instantiate vectorizer object
    vectorizer = CountVectorizer()

    # Create a vocabulary and get word counts per document
    bag_of_words = vectorizer.fit_transform(string)

    # Get feature names to use as df column headers.
    # get_feature_names() was removed in scikit-learn 1.2, so prefer its
    # replacement and fall back only on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return pd.DataFrame(bag_of_words.toarray(),
                        columns=feature_names)

In [None]:
# Word-count table for the job descriptions
df_description_vectorized = get_vectorize_df(description_for_countvectorizer)

# Word-count table for the job titles
df_title_vectorized = get_vectorize_df(title_for_countvectorizer)

In [None]:
df_description_vectorized.head()

In [None]:
df_title_vectorized.head()

# 4) Visualize the most common word counts

In [None]:
# Flatten many token lists into 1 shared list
def data_for_viz(string, output_lst):
    """Append every item of `string` to `output_lst`, in order.

    `string` is any iterable (a token list in this notebook).
    `output_lst` is modified in place; nothing is returned.
    """
    output_lst.extend(string)
        
# Collect every token of every listing into one flat list per column.
# Plain loops are used instead of Series.apply: apply was only being used
# for its append side effect and made the cell display a Series of None.
desc_for_viz = []
for tokens in df['tokenized_description']:
    data_for_viz(tokens, desc_for_viz)

title_for_viz = []
for tokens in df['tokenized_title']:
    data_for_viz(tokens, title_for_viz)

In [None]:
# Visualizing most common words
def viz_common_words(string, count):
    """Print and plot the `count` most frequent items of `string`.

    `string` is an iterable of tokens (a flat word list in this notebook).
    A `FreqDist` is built from it; its summary and its `count` most common
    (item, frequency) pairs are printed, then the same `count` items are
    drawn as a non-cumulative frequency plot.
    """
    word_freqs = FreqDist(string)

    # Summary line first, then the most common pairs on the next line
    print(word_freqs, word_freqs.most_common(count), sep='\n')

    word_freqs.plot(count, cumulative=False)
    plt.show()

In [None]:
viz_common_words(desc_for_viz, 25)

In [None]:
viz_common_words(title_for_viz, 25)

# 5) Use Scikit-Learn's TfidfVectorizer to get a TF-IDF feature matrix

## `TF-IDF` for `Descriptions`

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# List of document strings as text
text = description_for_countvectorizer

# Instantiate vectorizer object (single words only, vocabulary capped at 20 terms)
tf_idf = TfidfVectorizer(ngram_range=(1,1), max_features=20)

# Create a vocabulary and get the TF-IDF weights per doc
feature_matrix = tf_idf.fit_transform(text)

# Get feature names to use as df column headers.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on older installations.
if hasattr(tf_idf, 'get_feature_names_out'):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()

# toarray()'s first argument is the memory order ('C' or 'F'), so the stray
# 0 that used to be passed here is not a valid value; use the default.
desc_tf_idf = pd.DataFrame(feature_matrix.toarray(),
                 columns=feature_names)

In [None]:
# `df_tf_idf` is never defined in this notebook (NameError);
# the frame built for the descriptions is `desc_tf_idf`.
print(desc_tf_idf.shape)
desc_tf_idf.head()

## `TF-IDF` for `Titles`

In [None]:

# List of document strings as text
text_title = title_for_countvectorizer

# Instantiate vectorizer object (single words only, vocabulary capped at 20 terms)
tf_idf = TfidfVectorizer(ngram_range=(1,1), max_features=20)

# Create a vocabulary and get the TF-IDF weights per doc
feature_matrix = tf_idf.fit_transform(text_title)

# Get feature names to use as df column headers.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on older installations.
if hasattr(tf_idf, 'get_feature_names_out'):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()

# toarray()'s first argument is the memory order ('C' or 'F'), so the stray
# 0 that used to be passed here is not a valid value; use the default.
title_tf_idf = pd.DataFrame(feature_matrix.toarray(),
                 columns=feature_names)

In [None]:
# `df_tf_idf` is never defined in this notebook (NameError);
# the frame built for the titles is `title_tf_idf`.
print(title_tf_idf.shape)
title_tf_idf.head()

## Stretch Goals

 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings?
 - Try to identify requirements for experience with specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high dimensional data.