In [1]:
#reference: https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/

import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Directory containing the sample CSV files. File names are appended to this
# string directly, so the trailing slash is required.
sample_data_path = 'data/sample_data/'

# Load the three sample tables: users, job postings and user history.
user = pd.read_csv(f'{sample_data_path}users_sample.csv')
jobs = pd.read_csv(f'{sample_data_path}jobs_sample_cleared.csv')
user_history = pd.read_csv(f'{sample_data_path}user_history_sample.csv')

In [2]:
# Preview the first rows of the job postings table to check which columns are available.
jobs.head()

Unnamed: 0.1,Unnamed: 0,JobID,Title,Description,Requirements,State,City
0,0,1,Security Engineer/Technical Lead,Security Clearance Required: Top Secret Job Nu...,SKILL SETNetwork Security tools:Webdefend Web ...,DC,Washington
1,1,4,SAP Business Analyst / WM,NO Corp. to Corp resumesare being considered f...,WHAT YOU NEED: Four year college degreeMinimum...,NC,Charlotte
2,2,7,P/T HUMAN RESOURCES ASSISTANT,P/T HUMAN RESOURCES ASSISTANT —— 1-2 y...,Please refer to the Job Description to view th...,FL,Winter Park
3,3,8,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,FL,Orlando
4,4,9,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,FL,Orlando


## Preprocessing: combine the columns you need into a single text document per job

In [3]:
# Build one text document per job posting by joining the columns of interest
# with single spaces (author's note: adding city and state increases the
# vocabulary size to about 180,000 words).
text_columns = ['Title', 'Description', 'Requirements', 'State', 'City']

# Rows are read positionally, so this does not depend on `jobs` having a
# default 0..n-1 index. Missing values become '' so that the literal token
# "nan" does not leak into the vocabulary.
documents = [
    ' '.join('' if pd.isna(value) else str(value) for value in row)
    for row in jobs[text_columns].itertuples(index=False, name=None)
]

# Preview only a few documents; displaying the full list floods the notebook output.
documents[:3]

['Security Engineer/Technical Lead Security Clearance Required: Top Secret Job Number: TMR-447Location of Job: Washington, DCTMR, Inc. is an Equal Employment Opportunity CompanyFor more job opportunities with TMR, visit our website www.tmrhq.comSend Resumes to HR@tmrhq2.comJOB SUMMARY:Leads the customers overall Cyber Security strategy, formalizes service offerings consisted with ITIL best practices, and provides design and architecture support.    Provide security design / architecture support for OJPs IT Security Division (ITSD)     Leads the SECOPS team in the day to day OJP Security Operations support     Provides direction when needed in a security incident or technical issues     Works in concert with network operations on design /integration for best security posture    Supports business development functions including Capture Management, Proposal Development and responses, and other initiatives to include conferences, trade shows, webinars, developing white papers and the like.

# Guidelines (TfidfTransformer vs TfidfVectorizer)
#### 1. If you need to compute tf-idf scores on documents outside your “training” dataset, use either one; both will work.
#### 2. If you need the term frequency (term count) vectors for different tasks, use TfidfTransformer.
#### 3. If you need to compute tf-idf scores on documents within your “training” dataset, use TfidfVectorizer.

# First Method: TfidfTransformer
### 1. Initialize CountVectorizer

In [4]:
# Create a CountVectorizer with default settings, then learn the vocabulary
# from the documents and count how often each word occurs in each of them.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(raw_documents=documents)

In [5]:
# Dimensions of the count matrix: (number of documents, vocabulary size).
word_count_vector.shape

(65568, 168142)

### 2. Compute the IDF values

In [6]:
# Learn the IDF weight of every vocabulary term from the word-count matrix.
# The fitted transformer is the cell's last expression, so its settings are displayed.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

#####  The lower the IDF value of a word, the less unique it is to any particular document.

In [7]:
# Tabulate the learned IDF weight of every vocabulary term.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary_terms = cv.get_feature_names_out()
else:
    vocabulary_terms = cv.get_feature_names()

df = pd.DataFrame(tfidf_transformer.idf_, index=vocabulary_terms, columns=["idf_weights"])

# Sort ascending so the terms with the lowest IDF (the most common ones) come first.
df.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
to,1.023330
and,1.029971
the,1.048016
in,1.056404
for,1.061938
...,...
inbilling,11.397711
inbatch,11.397711
inbaltimore,11.397711
inburlington,11.397711


### 3. Compute the TFIDF score for your documents

In [8]:
# Step 1: turn the documents into term counts using the already-fitted vocabulary.
# NOTE: these are the same documents the vectorizer was fitted on; to score
# unseen documents, pass those here instead.
count_vector = cv.transform(raw_documents=documents)

# Step 2: re-weight the counts with the learned IDF values to obtain TF-IDF scores.
tf_idf_vector = tfidf_transformer.transform(X=count_vector)

#### The more common a word is across documents, the lower its score; the more unique a word is to our first document, the higher its score.

In [9]:
# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# TF-IDF row of the first document.
first_document_vector = tf_idf_vector[0]

# One row per vocabulary term. `.toarray()` returns a plain ndarray, whereas
# `.todense()` returns the discouraged `np.matrix` type.
tfdf_1 = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=["tfidf"])

# Highest-scoring (most distinctive) terms first.
sorted_tfdf_1 = tfdf_1.sort_values(by=["tfidf"], ascending=False)
sorted_tfdf_1

Unnamed: 0,tfidf
security,0.324590
tmr,0.301844
firewall,0.241845
vulnerability,0.195960
scanner,0.188402
...,...
documentationhandle,0.000000
documentationhas,0.000000
documentationheavy,0.000000
documentationhelp,0.000000


In [10]:
# Keep only the terms with a non-zero TF-IDF score, i.e. the words that
# actually occur in the first document.
has_score = sorted_tfdf_1['tfidf'] > 0
valid_tfdf_1 = sorted_tfdf_1.loc[has_score]
valid_tfdf_1

Unnamed: 0,tfidf
security,0.324590
tmr,0.301844
firewall,0.241845
vulnerability,0.195960
scanner,0.188402
...,...
skills,0.013206
an,0.012293
on,0.012077
is,0.010634


In [11]:
# Optional export (TfidfTransformer method): write the first document's
# non-zero TF-IDF scores to a CSV file in the sample data directory.
valid_tfdf_1.to_csv(f'{sample_data_path}tfidf1_jd.csv')

# Second Method: TfidfVectorizer

### 1. Initialize TfidfVectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Any settings you would pass to CountVectorizer can be passed here as well.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit on all documents and compute their TF-IDF scores in a single step.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(raw_documents=documents)

### 2. Print the TF-IDF scores

In [13]:
# TF-IDF row of the first document.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()

# One row per vocabulary term. `.toarray()` returns a plain ndarray, whereas
# `.todense()` returns the discouraged `np.matrix` type.
tfdf_2 = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=tfidf_terms, columns=["tfidf"])

# Highest-scoring (most distinctive) terms first.
sorted_tfdf_2 = tfdf_2.sort_values(by=["tfidf"], ascending=False)
sorted_tfdf_2

Unnamed: 0,tfidf
security,0.324590
tmr,0.301844
firewall,0.241845
vulnerability,0.195960
scanner,0.188402
...,...
documentationhandle,0.000000
documentationhas,0.000000
documentationheavy,0.000000
documentationhelp,0.000000


In [14]:
# Keep only the terms with a non-zero TF-IDF score, i.e. the words that
# actually occur in the first document.
has_score = sorted_tfdf_2['tfidf'] > 0
valid_tfdf_2 = sorted_tfdf_2.loc[has_score]
valid_tfdf_2

Unnamed: 0,tfidf
security,0.324590
tmr,0.301844
firewall,0.241845
vulnerability,0.195960
scanner,0.188402
...,...
skills,0.013206
an,0.012293
on,0.012077
is,0.010634


### Removing stopwords

We want to remove stopwords from our dataset, so we can focus on words with meaning. 

In [15]:
# Same vectorizer as before, now dropping scikit-learn's built-in English stop words.
# NOTE: this rebinds `tfidf_vectorizer` and `tfidf_vectorizer_vectors` from the
# previous section, so later cells see the stop-word-filtered versions.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)

# Fit on all documents and compute their TF-IDF scores in a single step.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(raw_documents=documents)

In [16]:
# TF-IDF row of the first document (stop words removed).
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()

# One row per vocabulary term. `.toarray()` returns a plain ndarray, whereas
# `.todense()` returns the discouraged `np.matrix` type.
tfdf_2 = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=tfidf_terms, columns=["tfidf"])

# Highest-scoring (most distinctive) terms first.
sorted_tfdf_2 = tfdf_2.sort_values(by=["tfidf"], ascending=False)
sorted_tfdf_2

Unnamed: 0,tfidf
security,0.329906
tmr,0.306787
firewall,0.245806
vulnerability,0.199170
scanner,0.191488
...,...
documentationcode,0.000000
documentationcollaborate,0.000000
documentationcommunicate,0.000000
documentationcompany,0.000000


In [17]:
# Keep only the terms with a non-zero TF-IDF score; with stop words removed,
# these are the remaining words that occur in the first document.
has_score = sorted_tfdf_2['tfidf'] > 0
meaningful_tfdf_2 = sorted_tfdf_2.loc[has_score]
meaningful_tfdf_2

Unnamed: 0,tfidf
security,0.329906
tmr,0.306787
firewall,0.245806
vulnerability,0.199170
scanner,0.191488
...,...
opportunity,0.015548
required,0.015281
team,0.014954
skills,0.013422


In [18]:
# Optional export (TfidfVectorizer method, stop words removed): write the first
# document's non-zero TF-IDF scores to a CSV file in the sample data directory.
meaningful_tfdf_2.to_csv(f'{sample_data_path}tfidf2_jd.csv')