In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents: a small corpus of four short English sentences.
# Each string is treated as one document (one matrix row) by the
# vectorizer functions this corpus is passed to.
documents = [
    "The sky is blue.",
    "The sun is bright.",
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun."
]

In [2]:
# Bare last expression: the notebook displays the corpus as the cell output.
documents

['The sky is blue.',
 'The sun is bright.',
 'The sun in the sky is bright.',
 'We can see the shining sun, the bright sun.']

In [3]:
# Function to generate Bag of Words (BoW) representation
def bag_of_words(documents):
    """Build a Bag of Words (token count) representation of ``documents``.

    A new ``CountVectorizer`` with default settings is fitted on the given
    documents. The dense count matrix and the learned vocabulary are printed
    before both are returned.

    Args:
        documents: Collection of text documents to vectorize.

    Returns:
        A ``(matrix, feature_names)`` pair: the document-term matrix returned
        by ``fit_transform`` (one row per document, one column per vocabulary
        word) and the words returned by ``get_feature_names_out``, in column
        order.
    """
    count_model = CountVectorizer()

    # A single call both learns the vocabulary and counts the tokens.
    doc_term_counts = count_model.fit_transform(documents)

    # Column i of the matrix corresponds to vocabulary[i].
    vocabulary = count_model.get_feature_names_out()

    # Print the array form of the matrix so the individual counts are shown.
    print("Bag of Words Representation:\n", doc_term_counts.toarray())
    print("Feature Names:\n", vocabulary)

    return doc_term_counts, vocabulary


In [4]:
# Run the Bag of Words function on the sample documents.
# The call prints the count matrix and the vocabulary, and returns both so
# they stay available as bow_matrix and bow_feature_names.
bow_matrix, bow_feature_names = bag_of_words(documents)

Bag of Words Representation:
 [[1 0 0 0 1 0 0 1 0 1 0]
 [0 1 0 0 1 0 0 0 1 1 0]
 [0 1 0 1 1 0 0 1 1 2 0]
 [0 1 1 0 0 1 1 0 2 2 1]]
Feature Names:
 ['blue' 'bright' 'can' 'in' 'is' 'see' 'shining' 'sky' 'sun' 'the' 'we']


In [5]:
# Function to generate TF-IDF representation
def tf_idf(documents):
    """Build a TF-IDF representation of ``documents``.

    A new ``TfidfVectorizer`` with default settings is fitted on the given
    documents. The dense TF-IDF matrix and the learned vocabulary are printed
    before both are returned.

    Args:
        documents: Collection of text documents to vectorize.

    Returns:
        A ``(matrix, feature_names)`` pair: the document-term matrix of
        TF-IDF values returned by ``fit_transform`` (one row per document,
        one column per vocabulary word) and the words returned by
        ``get_feature_names_out``, in column order.
    """
    tfidf_model = TfidfVectorizer()

    # A single call both learns the vocabulary and computes the weights.
    doc_term_weights = tfidf_model.fit_transform(documents)

    # Column i of the matrix corresponds to vocabulary[i].
    vocabulary = tfidf_model.get_feature_names_out()

    # Print the array form of the matrix so the individual weights are shown.
    print("TF-IDF Representation:\n", doc_term_weights.toarray())
    print("Feature Names:\n", vocabulary)

    return doc_term_weights, vocabulary

In [6]:
# Run the TF-IDF function on the sample documents.
# The call prints the TF-IDF matrix and the vocabulary, and returns both so
# they stay available as tfidf_matrix and tfidf_feature_names.
tfidf_matrix, tfidf_feature_names = tf_idf(documents)

TF-IDF Representation:
 [[0.65919112 0.         0.         0.         0.42075315 0.
  0.         0.51971385 0.         0.34399327 0.        ]
 [0.         0.52210862 0.         0.         0.52210862 0.
  0.         0.         0.52210862 0.42685801 0.        ]
 [0.         0.3218464  0.         0.50423458 0.3218464  0.
  0.         0.39754433 0.3218464  0.52626104 0.        ]
 [0.         0.23910199 0.37459947 0.         0.         0.37459947
  0.37459947 0.         0.47820398 0.39096309 0.37459947]]
Feature Names:
 ['blue' 'bright' 'can' 'in' 'is' 'see' 'shining' 'sky' 'sun' 'the' 'we']
