In [1]:
! pwd

/Users/imanithompson/Documents/Flatiron/Project/Job-Posting-Analysis/Notebooks


In [11]:
! ls ../../../../../Downloads/Data/fake_job_postings.csv

fake_job_postings.csv     training_set_features.csv training_set_labels.csv


#### Setting up notebook

In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve

In [6]:
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
import string
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,\
HashingVectorizer

In [7]:
# Use this to download the stopwords if you haven't already - only ever needs to be run once
nltk.download("stopwords")

# Same with wordnet!
nltk.download('wordnet')

# And with parts of speech tagging!
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/imanithompson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/imanithompson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/imanithompson/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/imanithompson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
data = pd.read_csv('../../../../../Downloads/Data/fake_job_postings.csv')

In [9]:
data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [10]:
data.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [11]:
data.shape #rows,columns

(17880, 18)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15185 non-null  object
 8   benefits             10670 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [13]:
data.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

## Salary range

In [14]:
list(data['salary_range'].value_counts().index)

['0-0',
 '40000-50000',
 '30000-40000',
 '45000-67000',
 '25000-30000',
 '30000-50000',
 '80000-100000',
 '35000-45000',
 '70000-90000',
 '50000-80000',
 '55000-75000',
 '50000-70000',
 '60000-80000',
 '30000-35000',
 '40000-45000',
 '40000-60000',
 '25000-35000',
 '50000-60000',
 '45000-50000',
 '40000-70000',
 '35000-40000',
 '100000-120000',
 '80000-120000',
 '18000-20000',
 '35000-50000',
 '80000-110000',
 '60000-75000',
 '30000-45000',
 '20000-25000',
 '20000-30000',
 '4500-4500',
 '140000-150000',
 '50000-65000',
 '65000-80000',
 '45000-60000',
 '55000-65000',
 '100000-150000',
 '70000-80000',
 '45000-55000',
 '90000-110000',
 '90000-120000',
 '60000-90000',
 '28000-32000',
 '20000-24000',
 '60000-70000',
 '25000-40000',
 '40000-100000',
 '60000-100000',
 '70000-100000',
 '45000-65000',
 '16000-18500',
 '35000-65000',
 '50000-55000',
 '40000-55000',
 '7200-1380000',
 '75000-100000',
 '100000-130000',
 '65000-75000',
 '20000-40000',
 '55000-70000',
 '65000-85000',
 '23000-28000',


In [15]:
data['salary_range'].value_counts()[0:57]

0-0              142
40000-50000       66
30000-40000       55
45000-67000       37
25000-30000       37
30000-50000       32
80000-100000      30
35000-45000       30
70000-90000       30
50000-80000       29
55000-75000       28
50000-70000       28
60000-80000       28
30000-35000       27
40000-45000       27
40000-60000       27
25000-35000       26
50000-60000       24
45000-50000       22
40000-70000       21
35000-40000       20
100000-120000     20
80000-120000      19
18000-20000       18
35000-50000       18
80000-110000      17
60000-75000       17
30000-45000       17
20000-25000       17
20000-30000       16
4500-4500         16
140000-150000     16
50000-65000       16
65000-80000       15
45000-60000       15
55000-65000       14
100000-150000     14
70000-80000       14
45000-55000       14
90000-110000      14
90000-120000      13
60000-90000       13
28000-32000       12
20000-24000       12
60000-70000       12
25000-40000       12
40000-100000      12
60000-100000 

In [16]:
data['salary_range'].value_counts().values[0:57]

array([142,  66,  55,  37,  37,  32,  30,  30,  30,  29,  28,  28,  28,
        27,  27,  27,  26,  24,  22,  21,  20,  20,  19,  18,  18,  17,
        17,  17,  17,  16,  16,  16,  16,  15,  15,  14,  14,  14,  14,
        14,  13,  13,  12,  12,  12,  12,  12,  11,  11,  11,  11,  11,
        10,  10,  10,  10,  10])

## Required experience

In [17]:
data['required_experience'].value_counts()

Mid-Senior level    3809
Entry level         2697
Associate           2297
Not Applicable      1116
Director             389
Internship           381
Executive            141
Name: required_experience, dtype: int64

## Department

In [18]:
list(data['department'].value_counts().index)

['Sales',
 'Engineering',
 'Marketing',
 'Operations',
 'IT',
 'Development',
 'Product',
 'Information Technology',
 'Technology',
 'Design',
 'Customer Service',
 'Finance',
 'HR',
 'tech',
 'R&D',
 'Creative',
 'Client Services',
 'Retail',
 'Product Development',
 'Business Development',
 'Oil and Gas',
 'Production',
 'CSD Relay',
 'Administrative',
 'Maintenance',
 'Human Resources',
 'Tech',
 'Accounting',
 'Technical',
 'Administration',
 'Clerical',
 'Editorial',
 'Oil & Energy',
 'Legal',
 'IT Services',
 'Department',
 'Performance Marketing',
 'Content',
 'Project Management',
 'Squiz ',
 'Admin',
 'Commercial',
 'QA',
 'Creative Services',
 'Engineering ',
 'Support',
 'International Growth',
 'Didactics',
 'All',
 'Customer Success',
 'Customer Support',
 'Management',
 'Marketing ',
 'CS',
 'Information Technology ',
 'Warehouse',
 'Product Team',
 'Engagement',
 'Education',
 'Product Innovation',
 'Sales ',
 'Permanent',
 'Account Management',
 'customer service',
 'Bu

In [19]:
# data['department'] = data['department'].replace('IT', "Technology").replace(
# 'Information Technology','Technology').replace('tech', 'Technology').replace(
# 'Technical', 'Technology').replace('Tech', 'Technology').replace(
# 'IT Services', 'Technology').replace('Information Technology ', 'Technology').replace(
# 'Technical Support', 'Technology')


# Had an good idea until I realized how time consuming this is 

In [20]:
data.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [21]:
list(data['industry'].value_counts().index)
# maybe drop departments all togehter BECAUSE OF ALL THE NULLS and have industry
# area in which your company is working

['Information Technology and Services',
 'Computer Software',
 'Internet',
 'Marketing and Advertising',
 'Education Management',
 'Financial Services',
 'Hospital & Health Care',
 'Consumer Services',
 'Telecommunications',
 'Oil & Energy',
 'Retail',
 'Real Estate',
 'Accounting',
 'Construction',
 'E-Learning',
 'Management Consulting',
 'Design',
 'Staffing and Recruiting',
 'Health, Wellness and Fitness',
 'Insurance',
 'Automotive',
 'Logistics and Supply Chain',
 'Human Resources',
 'Online Media',
 'Apparel & Fashion',
 'Legal Services',
 'Facilities Services',
 'Hospitality',
 'Computer Games',
 'Banking',
 'Building Materials',
 'Leisure, Travel & Tourism',
 'Nonprofit Organization Management',
 'Entertainment',
 'Electrical/Electronic Manufacturing',
 'Food & Beverages',
 'Cosmetics',
 'Airlines/Aviation',
 'Consumer Goods',
 'Consumer Electronics',
 'Medical Practice',
 'Public Relations and Communications',
 'Civic & Social Organization',
 'Market Research',
 'Transportati

In [22]:
data['function'].value_counts()
# but whats the difference between this and industry
# "your scope of work" within the company.

Information Technology    1749
Sales                     1468
Engineering               1348
Customer Service          1229
Marketing                  830
Administrative             630
Design                     340
Health Care Provider       338
Other                      325
Education                  325
Management                 317
Business Development       228
Accounting/Auditing        212
Human Resources            205
Project Management         183
Finance                    172
Consulting                 144
Writing/Editing            132
Art/Creative               132
Production                 116
Product Management         114
Quality Assurance          111
Advertising                 90
Business Analyst            84
Data Analyst                82
Public Relations            76
Manufacturing               74
General Business            68
Research                    50
Legal                       47
Strategy/Planning           46
Training                    38
Supply C

# As of now think of drop columns department and salary ranges and maybe droping all other null values

In [23]:
data.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [26]:
data['description'][0]

'Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systemsResearching blogs and websites for the Provisions by Food52 Affiliate ProgramAssisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiriesSupporting with PR &amp; Events when neededHelping with office administrative work, such as filing, mailing, and preparing for meetingsWorking with developers to document bugs and suggest improvements to the siteSupporting the marketing and executive staff'

In [27]:
data['requirements'][0]

'Experience with content management systems a major plus (any blogging counts!)Familiar with the Food52 editorial voice and aestheticLoves food, appreciates the importance of home cooking and cooking with the seasonsMeticulous editor, perfectionist, obsessive attention to detail, maddened by typos and broken links, delighted by finding and fixing themCheerful under pressureExcellent communication skillsA+ multi-tasker and juggler of responsibilities big and smallInterested in and engaged with social media like Twitter, Facebook, and PinterestLoves problem-solving and collaborating to drive Food52 forwardThinks big picture but pitches in on the nitty gritty of running a small company (dishes, shopping, administrative support)Comfortable with the realities of working for a startup: being on call on evenings and weekends, and working long hours'

### Nulls in description n requirements

In [48]:
data[['description', 'requirements']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   description   17879 non-null  object
 1   requirements  15185 non-null  object
dtypes: object(2)
memory usage: 279.5+ KB


In [51]:
data = data.dropna(subset=['description', 'requirements'])

In [52]:
data.isna().sum()

job_id                     0
title                      0
location                 170
department              9469
salary_range           12573
company_profile         2554
description                0
requirements               0
benefits                4786
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         2509
required_experience     5151
required_education      5971
industry                3526
function                4607
fraudulent                 0
words                      0
dtype: int64

### combining description and requirements

In [56]:
data['words'] = data['description'] + ' ' + data['requirements']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['words'] = data['description'] + ' ' + data['requirements']


In [57]:
data.head(1)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,words
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,"Food52, a fast-growing, James Beard Award-winn..."


In [58]:
data['words'][0]

'Food52, a fast-growing, James Beard Award-winning online food community and crowd-sourced and curated recipe hub, is currently interviewing full- and part-time unpaid interns to work in a small team of editors, executives, and developers in its New York City headquarters.Reproducing and/or repackaging existing Food52 content for a number of partner sites, such as Huffington Post, Yahoo, Buzzfeed, and more in their various content management systemsResearching blogs and websites for the Provisions by Food52 Affiliate ProgramAssisting in day-to-day affiliate program support, such as screening affiliates and assisting in any affiliate inquiriesSupporting with PR &amp; Events when neededHelping with office administrative work, such as filing, mailing, and preparing for meetingsWorking with developers to document bugs and suggest improvements to the siteSupporting the marketing and executive staff Experience with content management systems a major plus (any blogging counts!)Familiar with t

### start NLP vectorizing

In [59]:
sw = stopwords.words('english')

In [60]:
data.words

0        Food52, a fast-growing, James Beard Award-winn...
1        Organised - Focused - Vibrant - Awesome!Do you...
2        Our client, located in Houston, is actively se...
3        THE COMPANY: ESRI – Environmental Systems Rese...
4        JOB TITLE: Itemization Review ManagerLOCATION:...
                               ...                        
17875    Just in case this is the first time you’ve vis...
17876    The Payroll Accountant will focus primarily on...
17877    Experienced Project Cost Control Staff Enginee...
17878    Nemsia Studios is looking for an experienced v...
17879    Who are we?Vend is an award winning web based ...
Name: words, Length: 15185, dtype: object

In [83]:
data['words'][45]

'As a Sales Representative, you will provide assistance to our customers as they purchase the materials and tools they need for a wide variety of roofing, siding, and window/door replacement projects. From the moment you greet customers until their sales have been finalized, you will provide them with the best in customer service and exterior building supply expertise.Your specific duties as a Sales Representative may include:Determining customers’ needs and recommending appropriate products and solutionsFollowing ABC’s product/supply checklist for each customer’s specific job and upselling additional products and supplies for that jobAnswering customer questions and offering product adviceOrdering products from other ABC Supply branches when necessaryAccepting payment and applying it to the appropriate customer accountArranging with the warehouse for customer product pickupFollowing-up on each delivery to ensure that shipment arrived on time with all items accounted forBalancing out c

### Adding spaces before capital letters - Need to make a function out if it 

In [115]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer



[NLP Source](https://my.learn.co/courses/468/assignments/25568?module_item_id=61343)

In [129]:
# Create an intance of the RegexpTokenizer with the variable name `tokenizer`
# The regex pattern should select all words with three or more characters
pattern = r"(?u)\w{3,}"
tokenizer = RegexpTokenizer(pattern)

# Create a list of stopwords in English
stopwords_list = stopwords.words('english')

# Create an instance of nltk's PorterStemmer with the variable name `stemmer`
stemmer = nltk.stem.PorterStemmer()

def capital_words_spaces(str1, tokenizer, stopwords_list, stemmer):
    space_words= re.sub(r"(\w)([A-Z])", r"\1 \2", str1)
    # Standardize case (lowercase the text)
    lowered = space_words.lower()

    # Tokenize text using `tokenizer`
    tokens = tokenizer.tokenize(lowered)

    # Remove stopwords using `stopwords_list`
    stopped_tokens = [word for word in tokens if word not in stopwords_list]

    # Stem the tokenized text using `stemmer`
    stems = [stemmer.stem(token) for token in stopped_tokens]

    # Return the preprocessed text
    return stems

In [130]:
data['words'][2]

['client',
 'locat',
 'houston',
 'activ',
 'seek',
 'experienc',
 'commiss',
 'machineri',
 'assist',
 'possess',
 'strong',
 'supervisori',
 'skill',
 'attent',
 'detail',
 'strong',
 'dedic',
 'safeti',
 'must',
 'ideal',
 'candid',
 'execut',
 'activ',
 'compli',
 'qualiti',
 'requir',
 'health',
 'environment',
 'safeti',
 'regul',
 'implement',
 'pre',
 'commiss',
 'commiss',
 'procedur',
 'rotari',
 'equip',
 'execut',
 'activ',
 'subcontractor',
 'assign',
 'crew',
 'pertain',
 'disciplin',
 'ensur',
 'effect',
 'util',
 'commiss',
 'manpow',
 'consum',
 'ensur',
 'execut',
 'vendor',
 'specialist',
 'field',
 'activ',
 'assign',
 'resourc',
 'sub',
 'contractor',
 'per',
 'vendor',
 'repres',
 'plan',
 'carri',
 'equip',
 'inspect',
 'client',
 'repres',
 'ensur',
 'proper',
 'certif',
 'produc',
 'prepar',
 'form',
 'pend',
 'test',
 'submit',
 'sign',
 'certif',
 'final',
 'hand',
 'certif',
 'engin',
 'coordin',
 'field',
 'vendor',
 'repres',
 'keep',
 'record',
 'activ',


In [143]:
data.reset_index(drop=True)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,words
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,"[food52, fast, grow, jame, beard, award, win, ..."
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,"[organis, focus, vibrant, awesom, passion, cus..."
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,"[client, locat, houston, activ, seek, experien..."
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,"[environment, system, research, institut, pass..."
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,"[item, review, manag, fort, worth, item, revie..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15180,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0,Just in case this is the first time you’ve vis...
15181,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,The Payroll Accountant will focus primarily on...
15182,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0,Experienced Project Cost Control Staff Enginee...
15183,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,Nemsia Studios is looking for an experienced v...


In [131]:
for n,row in enumerate(data['words']):
    data['words'][n]=capital_words_spaces(row, tokenizer, stopwords_list, stemmer)

TypeError: expected string or bytes-like object

In [138]:
data['words'].head(20)

0     [food52, fast, grow, jame, beard, award, win, ...
1     [organis, focus, vibrant, awesom, passion, cus...
2     [client, locat, houston, activ, seek, experien...
3     [environment, system, research, institut, pass...
4     [item, review, manag, fort, worth, item, revie...
6     Your Responsibilities: Manage the English-spea...
7     Who is Airenvy?Hey there! We are seasoned entr...
8     Implementation/Configuration/Testing/Training ...
9     The Customer Service Associate will be based i...
10    Position : #URL_86fd830a95a64e2b30ceed829e63fd...
11    TransferWise is the clever new way to move mon...
12    The Applications Developer, Digital will devel...
13    Event Industry Installers Needed!! (Orlando, F...
14    Are you interested in a satisfying and financi...
15    About Vault Dragon Vault Dragon is Dropbox for...
16    We are looking for a Hands-On QA Leader for ou...
17    Government funding is only available for 16-18...
19    Experienced Process Controls Engineer is r

In [116]:
text = 'As a Sales Representative, you will provide assistance to our customers as they purchase the materials and tools they need for a wide variety of roofing, siding, and window/door replacement projects. From the moment you greet customers until their sales have been finalized, you will provide them with the best in customer service and exterior building supply expertise.Your specific duties as a Sales Representative may include:Determining customers’ needs and recommending appropriate products and solutionsFollowing ABC’s product/supply checklist for each customer’s specific job and upselling additional products and supplies for that jobAnswering customer questions and offering product adviceOrdering products from other ABC Supply branches when necessaryAccepting payment and applying it to the appropriate customer accountArranging with the warehouse for customer product pickupFollowing-up on each delivery to ensure that shipment arrived on time with all items accounted forBalancing out cash drawers and preparing bank deposits at the end of each dayArranging product displays and layouts to maximize effectivenessReordering products to keep the store and warehouse shelves well stockedGiving out comment cards to customersAddressing and resolving customer complaints when necessary As a Sales Representative, you must have the ability to provide superior customer service and to go the extra mile to determine and provide exactly what each customer needs. It helps if you have experience with exterior building supplies so that you are familiar with the various products and associated contractor terminology. As a Sales Representative, you will also need to be self-motivated with a strong work ethic and excellent attention to detail.Specific qualifications for the Sales Representative position include:1-2 years retail/customer service experience; experience with residential roofing and siding and windows (retail supply sales or site work with a crew) preferredStrong retail merchandising skills (including shelving and layout)Excellent verbal and written communication and interpersonal skillsSolid time management and prioritization skillsBasic computer skillsAbility to lift 50-75 pounds'
capital_words_spaces(text,tokenizer, stopwords_list, stemmer)

['sale',
 'repres',
 'provid',
 'assist',
 'custom',
 'purchas',
 'materi',
 'tool',
 'need',
 'wide',
 'varieti',
 'roof',
 'side',
 'window',
 'door',
 'replac',
 'project',
 'moment',
 'greet',
 'custom',
 'sale',
 'final',
 'provid',
 'best',
 'custom',
 'servic',
 'exterior',
 'build',
 'suppli',
 'expertis',
 'specif',
 'duti',
 'sale',
 'repres',
 'may',
 'includ',
 'determin',
 'custom',
 'need',
 'recommend',
 'appropri',
 'product',
 'solut',
 'follow',
 'product',
 'suppli',
 'checklist',
 'custom',
 'specif',
 'job',
 'upsel',
 'addit',
 'product',
 'suppli',
 'job',
 'answer',
 'custom',
 'question',
 'offer',
 'product',
 'advic',
 'order',
 'product',
 'suppli',
 'branch',
 'necessari',
 'accept',
 'payment',
 'appli',
 'appropri',
 'custom',
 'account',
 'arrang',
 'warehous',
 'custom',
 'product',
 'pickup',
 'follow',
 'deliveri',
 'ensur',
 'shipment',
 'arriv',
 'time',
 'item',
 'account',
 'balanc',
 'cash',
 'drawer',
 'prepar',
 'bank',
 'deposit',
 'end',
 'da

In [121]:
data.words[0]

['food52',
 'fast',
 'grow',
 'jame',
 'beard',
 'award',
 'win',
 'onlin',
 'food',
 'commun',
 'crowd',
 'sourc',
 'curat',
 'recip',
 'hub',
 'current',
 'interview',
 'full',
 'part',
 'time',
 'unpaid',
 'intern',
 'work',
 'small',
 'team',
 'editor',
 'execut',
 'develop',
 'new',
 'york',
 'citi',
 'headquart',
 'reproduc',
 'repackag',
 'exist',
 'food52',
 'content',
 'number',
 'partner',
 'site',
 'huffington',
 'post',
 'yahoo',
 'buzzfe',
 'variou',
 'content',
 'manag',
 'system',
 'research',
 'blog',
 'websit',
 'provis',
 'food52',
 'affili',
 'program',
 'assist',
 'day',
 'day',
 'affili',
 'program',
 'support',
 'screen',
 'affili',
 'assist',
 'affili',
 'inquiri',
 'support',
 'amp',
 'event',
 'need',
 'help',
 'offic',
 'administr',
 'work',
 'file',
 'mail',
 'prepar',
 'meet',
 'work',
 'develop',
 'document',
 'bug',
 'suggest',
 'improv',
 'site',
 'support',
 'market',
 'execut',
 'staff',
 'experi',
 'content',
 'manag',
 'system',
 'major',
 'plu',
 'bl

In [122]:
data['words']

0        [food52, fast, grow, jame, beard, award, win, ...
1        [organis, focus, vibrant, awesom, passion, cus...
2        [client, locat, houston, activ, seek, experien...
3        [environment, system, research, institut, pass...
4        [item, review, manag, fort, worth, item, revie...
                               ...                        
17875    Just in case this is the first time you’ve vis...
17876    The Payroll Accountant will focus primarily on...
17877    Experienced Project Cost Control Staff Enginee...
17878    Nemsia Studios is looking for an experienced v...
17879    Who are we?Vend is an award winning web based ...
Name: words, Length: 15185, dtype: object

In [114]:
capital_words_spaces('As a Sales Representative, you will provide assistance to our customers as they purchase the materials and tools they need for a wide variety of roofing, siding, and window/door replacement projects. From the moment you greet customers until their sales have been finalized, you will provide them with the best in customer service and exterior building supply expertise.Your specific duties as a Sales Representative may include:Determining customers’ needs and recommending appropriate products and solutionsFollowing ABC’s product/supply checklist for each customer’s specific job and upselling additional products and supplies for that jobAnswering customer questions and offering product adviceOrdering products from other ABC Supply branches when necessaryAccepting payment and applying it to the appropriate customer accountArranging with the warehouse for customer product pickupFollowing-up on each delivery to ensure that shipment arrived on time with all items accounted forBalancing out cash drawers and preparing bank deposits at the end of each dayArranging product displays and layouts to maximize effectivenessReordering products to keep the store and warehouse shelves well stockedGiving out comment cards to customersAddressing and resolving customer complaints when necessary As a Sales Representative, you must have the ability to provide superior customer service and to go the extra mile to determine and provide exactly what each customer needs. It helps if you have experience with exterior building supplies so that you are familiar with the various products and associated contractor terminology. As a Sales Representative, you will also need to be self-motivated with a strong work ethic and excellent attention to detail.Specific qualifications for the Sales Representative position include:1-2 years retail/customer service experience; experience with residential roofing and siding and windows (retail supply sales or site work with a crew) preferredStrong retail merchandising skills (including shelving and layout)Excellent verbal and written communication and interpersonal skillsSolid time management and prioritization skillsBasic computer skillsAbility to lift 50-75 pounds')


TypeError: capital_words_spaces() missing 3 required positional arguments: 'tokenizer', 'stopwords_list', and 'stemmer'

In [75]:
tf_vec1 = TfidfVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?,  r' \1', str)",
                         stop_words=sw)
X_vec1 = tf_vec1.fit_transform(data.words)  

df = pd.DataFrame(X_vec1.toarray(), columns=tf_vec1.get_feature_names())
df.head()

# Lindsey recommended the Count Vectorizer 

In [87]:
vec1 = CountVectorizer(token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)", 
                      stop_words=sw)

X_vec1 = vec1.fit_transform(data.words)

df1 = pd.DataFrame(X_vec1.toarray(), columns=vec1.get_feature_names())
df1.head()

Unnamed: 0,aa,aaa,aaab,aaabe,aaacdddaae,aaadd,aaae,aaaf,aab,aabc,...,zuora,zur,zurb,zurich,zus,zusammen,zusammenarbeitest,zweig,zyfax,zynga
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
data.loc[45].words

'As a Sales Representative, you will provide assistance to our customers as they purchase the materials and tools they need for a wide variety of roofing, siding, and window/door replacement projects. From the moment you greet customers until their sales have been finalized, you will provide them with the best in customer service and exterior building supply expertise.Your specific duties as a Sales Representative may include:Determining customers’ needs and recommending appropriate products and solutions Following A BC’s product/supply checklist for each customer’s specific job and upselling additional products and supplies for that job Answering customer questions and offering product advice Ordering products from other A BC Supply branches when necessary Accepting payment and applying it to the appropriate customer account Arranging with the warehouse for customer product pickup Following-up on each delivery to ensure that shipment arrived on time with all items accounted for Balanc

In [92]:
df1.loc[45].sort_values(ascending=False)

customer                         5
work                             5
service                          4
experience                       4
year                             3
                                ..
preferredproven                  0
preferredpublic                  0
preferredqualificationsproven    0
preferredquickbooks              0
aa                               0
Name: 45, Length: 80739, dtype: int64