In [18]:
from dotenv import load_dotenv
from os import getenv, walk
from sklearn.datasets import load_files
from sklearn.utils import Bunch

# Pull settings from a .env file (if any) into the process environment.
load_dotenv()

# Root directory of the bootstrap dataset, taken from the environment.
DATASET_BOOTSTRAP = getenv("DATASET_BOOTSTRAP")
if not DATASET_BOOTSTRAP:
    # getenv() returns None when the variable is missing; passing that on to
    # load_files would only surface later as an opaque path-handling error.
    raise RuntimeError(
        "DATASET_BOOTSTRAP is not set; define it in the environment or in a .env file"
    )

# Build a sklearn Bunch from every file in the bootstrap dataset: file paths in
# bunch.filenames, decoded text in bunch.data. Undecodable bytes are replaced
# rather than raising, and shuffle=False keeps the documents in the order
# load_files reads them instead of randomizing it.
bunch = load_files(DATASET_BOOTSTRAP, encoding="utf-8", decode_error="replace", shuffle=False)



## TF-IDF
Reference: https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/
### TfidfTransformer

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Text content of the loaded dataset files, used as the corpus below.
filesdata = bunch.data



`CountVectorizer` counts how often each word occurs in each document (term frequency); it can also limit the vocabulary size, apply stop words, etc.

In [20]:
# Learn the vocabulary from the corpus and build the sparse document-term
# count matrix in a single pass.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(filesdata)

# (number of documents, vocabulary size)
word_count_vector.shape

(95, 21630)

In [21]:
# Fit the IDF weights on the raw term counts.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

# One IDF weight per vocabulary term, labelled by the term itself.
df_idf = pd.DataFrame(
    {"idf_weights": tfidf_transformer.idf_},
    index=cv.get_feature_names_out(),
)

# Show the terms ordered from lowest to highest IDF weight.
df_idf.sort_values(by="idf_weights")

Unnamed: 0,idf_weights
qu,1.000000
pour,1.000000
je,1.000000
par,1.000000
ou,1.000000
...,...
grandement,4.871201
grandie,4.871201
grandiloquents,4.871201
goncourt,4.871201


In [22]:
# Sparse word-count matrix for the documents. word_count_vector from the
# fitting step could be reused here, but in practice TF-IDF scores are often
# computed on new, unseen documents, so the counts are produced with
# transform() on the already-fitted vectorizer.
count_vector = cv.transform(filesdata)

# Turn the counts into TF-IDF scores using the fitted IDF weights.
tf_idf_vector = tfidf_transformer.transform(count_vector)
feature_names = cv.get_feature_names_out()

# TF-IDF row of the first document.
first_document_vector = tf_idf_vector[0]

# One row per vocabulary term, highest-scoring terms first.
df = pd.DataFrame(
    first_document_vector.T.todense(),
    index=feature_names,
    columns=["tfidf"],
)
df.sort_values(by="tfidf", ascending=False)

Unnamed: 0,tfidf
de,0.337614
madelin,0.274098
un,0.187563
evoquant,0.182732
le,0.168807
...,...
dérive,0.000000
dérisoires,0.000000
dérisoirement,0.000000
dérisoire,0.000000


### TfidfVectorizer
Same results as the `CountVectorizer` + `TfidfTransformer` combination, but with less code.

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Any settings that would be given to CountVectorizer are passed to this
# constructor instead.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit on, and transform, the whole corpus in one call.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(filesdata)

# TF-IDF row of the first document.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# One row per vocabulary term, highest-scoring terms first.
df = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=tfidf_vectorizer.get_feature_names_out(),
    columns=["tfidf"],
)
df.sort_values(by="tfidf", ascending=False)

Unnamed: 0,tfidf
de,0.337614
madelin,0.274098
un,0.187563
evoquant,0.182732
le,0.168807
...,...
dérive,0.000000
dérisoires,0.000000
dérisoirement,0.000000
dérisoire,0.000000
