In [None]:
#reference: https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# folder that holds the sample CSV files (trailing slash is relied on below)
sample_data_path = 'data/sample_data/'

# user data
user = pd.read_csv(f"{sample_data_path}users_sample.csv")
# jobs data
jobs = pd.read_csv(f"{sample_data_path}jobs_sample_cleared.csv")
# user history data
user_history = pd.read_csv(f"{sample_data_path}user_history_sample.csv")

In [None]:
# Preview the first rows of the jobs table.
jobs.head()

## Preprocess: combine the columns you need into a single text document per job

In [None]:
# Combine the text columns into one document per job.
# (Adding City and State increases the vocabulary size to about 180,000 words.)
text_columns = ['Title', 'Description', 'Requirements', 'State', 'City']

# Missing values are replaced with '' first; formatting a NaN directly would
# inject the literal token "nan" into the document and into the vocabulary.
job_text = jobs[text_columns].fillna('').astype(str)

# Iterate positionally over the rows so the result does not depend on `jobs`
# having a default 0..n-1 index (label lookups with .loc[i, ...] would).
documents = [' '.join(fields) for fields in job_text.itertuples(index=False, name=None)]

# Preview only: displaying every document would flood the notebook output.
documents[:5]

# Guideline (TfidfTransformer vs TfidfVectorizer)
#### 1. If you need to compute tf-idf scores on documents outside your “training” dataset, use either one; both will work.
#### 2. If you need the term frequency (term count) vectors for other tasks, use TfidfTransformer.
#### 3. If you need to compute tf-idf scores on documents within your “training” dataset, use TfidfVectorizer.

# First Method: TfidfTransformer
### 1. Initialize CountVectorizer

In [None]:
# Instantiate CountVectorizer() with its default settings
cv = CountVectorizer()
 
# Fit the vectorizer on the documents and return the per-document word counts
# in a single step
word_count_vector = cv.fit_transform(documents)

In [None]:
# Dimensions of the count matrix: (number of documents, number of vocabulary terms)
word_count_vector.shape

### 2. Compute the IDF values

In [None]:
# Build the transformer with IDF weighting and IDF smoothing switched on explicitly,
# then fit it on the count matrix so the IDF weights are available as `idf_`.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

#####  The lower the IDF value of a word, the less unique it is to any particular document.

In [None]:
# Tabulate the learned IDF weight of every vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back only on older releases.
vocabulary_terms = (getattr(cv, "get_feature_names_out", None) or cv.get_feature_names)()
df = pd.DataFrame(tfidf_transformer.idf_, index=vocabulary_terms, columns=["idf_weights"])

# sort ascending, so the terms with the lowest IDF weight come first
df.sort_values(by=['idf_weights'])

### 3. Compute the TFIDF score for your documents

In [None]:
# count matrix for the documents to score
# (here these are the same documents the vectorizer was fitted on)
count_vector = cv.transform(documents)
 
# tf-idf scores: apply the fitted transformer to the counts
tf_idf_vector = tfidf_transformer.transform(count_vector)

#### The more common a word is across documents, the lower its score; the more unique a word is to our first document, the higher its score.

In [None]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back only on
# older releases; list() keeps `feature_names` a plain list as before.
feature_names = list((getattr(cv, "get_feature_names_out", None) or cv.get_feature_names)())

# tf-idf vector (a single sparse row) for the first document
first_document_vector = tf_idf_vector[0]

# One row per term with its score in the first document, highest score first.
# toarray() yields a regular ndarray; todense() would yield the discouraged np.matrix.
tfdf_1 = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=["tfidf"])
sorted_tfdf_1 = tfdf_1.sort_values(by=["tfidf"], ascending=False)
sorted_tfdf_1

In [None]:
# keep only the terms with a non-zero tf-idf score for the first document
has_score = sorted_tfdf_1['tfidf'].gt(0)
valid_tfdf_1 = sorted_tfdf_1.loc[has_score]
valid_tfdf_1

In [None]:
# Optional export: write the TfidfTransformer scores of the first document to CSV
valid_tfdf_1.to_csv(f"{sample_data_path}tfidf1_jd.csv")

# Second Method: TfidfVectorizer

### 1. Initialize

In [None]:
# Baseline TfidfVectorizer run (TfidfVectorizer is imported in the first cell).
# No stop-word filtering here: this keeps the run comparable with the default
# CountVectorizer used in the first method, and leaves something for the
# "Removing stopwords" section further down to actually change.
# Any settings you would pass to CountVectorizer go here.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# fit on all documents and compute their tf-idf vectors in one step
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(documents)

### 2. Print the TF-IDF scores

In [None]:
# tf-idf vector (a single sparse row) for the first document
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back only on
# older releases.
vectorizer_terms = (
    getattr(tfidf_vectorizer, "get_feature_names_out", None) or tfidf_vectorizer.get_feature_names
)()

# One row per term with its score in the first document, highest score first.
# toarray() yields a regular ndarray; todense() would yield the discouraged np.matrix.
tfdf_2 = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=vectorizer_terms, columns=["tfidf"])
sorted_tfdf_2 = tfdf_2.sort_values(by=["tfidf"], ascending=False)
sorted_tfdf_2

In [None]:
# keep only the terms with a non-zero tf-idf score for the first document
valid_tfdf_2 = sorted_tfdf_2.loc[lambda scores: scores['tfidf'] > 0]
valid_tfdf_2

### Removing stopwords

We want to remove stopwords from our dataset, so we can focus on words with meaning. 

In [None]:
# Settings you would pass to CountVectorizer go here.
# stop_words='english' turns on scikit-learn's built-in English stop-word list.
# Note: this cell rebinds `tfidf_vectorizer`, replacing the earlier instance.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
 
# Fit on all documents and compute their tf-idf vectors in one step
# (rebinds `tfidf_vectorizer_vectors` as well).
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(documents)

In [None]:
# tf-idf vector (a single sparse row) for the first document
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back only on
# older releases.
vectorizer_terms = (
    getattr(tfidf_vectorizer, "get_feature_names_out", None) or tfidf_vectorizer.get_feature_names
)()

# One row per term with its score in the first document, highest score first.
# toarray() yields a regular ndarray; todense() would yield the discouraged np.matrix.
tfdf_2 = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=vectorizer_terms, columns=["tfidf"])
sorted_tfdf_2 = tfdf_2.sort_values(by=["tfidf"], ascending=False)
sorted_tfdf_2

In [None]:
# keep only the terms with a non-zero tf-idf score for the first document
nonzero_score = sorted_tfdf_2['tfidf'].gt(0)
meaningful_tfdf_2 = sorted_tfdf_2.loc[nonzero_score]
meaningful_tfdf_2

In [None]:
# Optional export: write the TfidfVectorizer scores of the first document to CSV
meaningful_tfdf_2.to_csv(f"{sample_data_path}tfidf2_jd.csv")