In [30]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as sk_text

# CountVectorizer implements both tokenization and occurrence counting in a single class.
vectorizer = sk_text.CountVectorizer(min_df=1)

corpus = [
    'This is the first document.',
    'this is the second second document.',
    'And the third one.',
    'Is this the first first first document?',
]

# Learn the vocabulary from the corpus and return the (sparse) document-term count matrix.
matrix = vectorizer.fit_transform(corpus)

# Convert to a dense numpy array. The result is bound to a name because a bare
# `matrix.toarray()` in the middle of a cell is computed and then thrown away.
count_array = matrix.toarray()

# View all unique tokens found across the corpus.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a numpy array, so
# `.tolist()` keeps the printed output a plain Python list of strings.
print(vectorizer.get_feature_names_out().tolist())

# TfidfVectorizer combines all the options of CountVectorizer and TfidfTransformer in a single model.
# It transforms text into a sparse matrix where rows are texts, columns are words,
# and values are the tf-idf weights.
vectorizer = sk_text.TfidfVectorizer(max_features=1000, min_df=1)
matrix = vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`.
# `.tolist()` converts the returned numpy array so a plain list of strings is printed.
print(vectorizer.get_feature_names_out().tolist())


# An integer min_df is an absolute document count: a term is kept only if it occurs
# in at least 4 documents. The corpus above has 4 documents, so only terms present
# in every document survive. An integer max_df=1000 is likewise an absolute count
# and filters nothing on a corpus this small.
vectorizer = sk_text.TfidfVectorizer(min_df=4, max_df=1000)
vectorizer.fit_transform(corpus)

# `get_feature_names()` was removed in scikit-learn 1.2; use `get_feature_names_out()`.
# `.tolist()` converts the returned numpy array so a plain list of strings is printed.
print(vectorizer.get_feature_names_out().tolist())




['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
['the']


In [37]:
# EXAMPLE 2
# Tf-idf features for a small corpus of news-style sentences, with English stop words removed.

document_0 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_1 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_2 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_3 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_4 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"


corpus = [document_0, document_1, document_2, document_3, document_4]

# max_features caps the vocabulary at the 100 most frequent terms;
# min_df=1 keeps every term that occurs in at least one document.
vectorizer = sk_text.TfidfVectorizer(
    stop_words='english',
    max_features=100,
    min_df=1,
)

# Sparse matrix: one row per document, one column per vocabulary term.
matrix = vectorizer.fit_transform(corpus)

tfidf_data = matrix.toarray()     # convert it to a dense numpy array

# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# replaces it. Fetch the vocabulary once and reuse it; `.tolist()` turns the
# returned numpy array into a plain list of strings for printing.
feature_names = vectorizer.get_feature_names_out().tolist()

print(tfidf_data)
print(tfidf_data.shape)
print(feature_names)
print(len(feature_names))

[[0.23684538 0.         0.         0.         0.29356375 0.
  0.         0.         0.         0.         0.29356375 0.
  0.         0.         0.23684538 0.         0.29356375 0.
  0.         0.         0.29356375 0.29356375 0.         0.29356375
  0.29356375 0.         0.         0.         0.         0.
  0.         0.23684538 0.         0.         0.29356375 0.
  0.29356375 0.         0.         0.23684538]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.29954105
  0.         0.37127341 0.         0.37127341 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.29954105 0.         0.29954105 0.37127341 0.
  0.         0.         0.         0.37127341 0.         0.
  0.         0.         0.29954105 0.29954105]
 [0.36252618 0.44934185 0.         0.44934185 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.36252618 0.         0.  