# TF-IDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def tfidf(sentences, verbose=1):
    """
    Convert a list of sentences into a TF-IDF representation.

    The vectorizer is fitted exactly once; the dense matrix, the feature
    names and the optional report are all derived from that single fit.

    Parameters:
    sentences (list of str): A list of sentences (documents) to vectorize using TF-IDF.
    verbose (int): If 1, prints the number of documents and words, and displays
        the TF-IDF matrix as a DataFrame. Any other value (e.g. 0) is silent.

    Returns:
    tfidf_vectorizer (vectorizer): The scikit-learn vectorizer fitted on the input sentences.
    tfidf_vect_sentences (ndarray): A 2D array where each row represents a document
        and each column a feature, holding the TF-IDF weights.
    tfidf_feature_names (ndarray): An array of feature names (words), in the same
        order as the columns of tfidf_vect_sentences.
    """
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform in a single pass; fitting a second time on the same
    # input would only repeat the work and yield the same matrix.
    tfidf_vect_sentences = tfidf_vectorizer.fit_transform(sentences).toarray()
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

    if verbose == 1:
        # Rows are documents, columns are vocabulary words.
        num_documents, num_words = tfidf_vect_sentences.shape
        df = pd.DataFrame(tfidf_vect_sentences, columns=tfidf_feature_names)
        print(f"Num of Documents: {num_documents}")
        print(f"Num of Words: {num_words}")
        print()
        print(df)

    return tfidf_vectorizer, tfidf_vect_sentences, tfidf_feature_names

# Small demo corpus: each string is treated as one document.
sentences = [
    'I love my dog.',
    'I love my cat.',
    'I love my dog and love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Vectorize the corpus and print the summary plus the TF-IDF table.
(
    tfidf_vectorizer,
    tfidf_vect_sentences,
    tfidf_feature_names,
) = tfidf(sentences, verbose=1)

Num of Documents: 5
Num of Words: 10

    amazing       and       cat        do       dog        is      love  \
0  0.000000  0.000000  0.000000  0.000000  0.606856  0.000000  0.606856   
1  0.000000  0.000000  0.737922  0.000000  0.000000  0.000000  0.515290   
2  0.000000  0.491109  0.396224  0.000000  0.276682  0.000000  0.553364   
3  0.000000  0.000000  0.000000  0.000000  0.458054  0.000000  0.458054   
4  0.438724  0.000000  0.000000  0.438724  0.247170  0.438724  0.000000   

         my     think       you  
0  0.513275  0.000000  0.000000  
1  0.435829  0.000000  0.000000  
2  0.468032  0.000000  0.000000  
3  0.387419  0.000000  0.655957  
4  0.209054  0.438724  0.353960  
