In [1]:
##### manipulating text data

In [2]:
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Toy corpus (from BB): four short documents about houses and dogs,
# small enough that the resulting term matrices can be checked by eye.
corpus = [
    'This is a brown house. This house is big. The street number is 1.',
    'This is a small house. This house has 1 bedroom. The street number is 12.',
    'This dog is brown. This dog likes to play.',
    'The dog is in the bedroom.',
]
corpus

['This is a brown house. This house is big. The street number is 1.',
 'This is a small house. This house has 1 bedroom. The street number is 12.',
 'This dog is brown. This dog likes to play.',
 'The dog is in the bedroom.']

In [9]:
# Word Occurrence (raw term counts per document)
#
# CountVectorizer parameters used below:
# - max_df (default 1.0):
#            When building the vocabulary, ignore terms that have a document frequency
#            strictly higher than the given threshold (corpus-specific stop words).
#            A float is a proportion of documents; an integer is an absolute count.
# - min_df (default 1):
#            When building the vocabulary, ignore terms that have a document frequency
#            strictly lower than the given threshold.
# - max_features (default None):
#            If not None, build a vocabulary that only considers the top max_features
#            terms ordered by term frequency across the corpus.
# - token_pattern (default r"(?u)\b\w\w+\b"):
#            Regular expression denoting what constitutes a "token"; only used if
#            analyzer == 'word' (the default). The default regexp selects tokens of
#            2 or more alphanumeric characters, which is why the single-character
#            tokens "a" and "1" never appear in the vocabulary.
#            Note: punctuation is completely ignored and always treated as a token separator.
# - ngram_range (default (1, 1)):
#            The lower and upper boundary of the range of n-values for the n-grams to be
#            extracted. All values of n such that min_n <= n <= max_n will be used.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    # An integer max_df is an absolute document count; len(corpus)+1 can never be
    # exceeded, so no term is dropped for being too frequent.
    min_df=1, max_df=len(corpus)+1, max_features=None,
    ngram_range=(1, 1))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement and yields the same column labels.
df = pd.DataFrame(cv.fit_transform(corpus).toarray(), columns=cv.get_feature_names_out())

print(cv.vocabulary_)   # term -> column index
print(cv.stop_words_)   # terms dropped by min_df / max_df / max_features (none here)
df

{'brown': 3, 'house': 5, 'big': 2, 'street': 10, 'number': 7, 'small': 9, 'bedroom': 1, '12': 0, 'dog': 4, 'likes': 6, 'play': 8}
set()


Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0,0,1,1,0,2,0,1,0,0,1
1,1,1,0,0,0,2,0,1,0,1,1
2,0,0,0,1,2,0,1,0,1,0,0
3,0,1,0,0,1,0,0,0,0,0,0


In [10]:
# Normalized Word Occurrence
#
# With use_idf=False no inverse-document-frequency weighting is applied, so each
# row is just the term-count vector of the document scaled to unit L2 length.

from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(
    use_idf=False, norm='l2',
    lowercase=True,
    stop_words='english',
    # Integer max_df = absolute document count; len(corpus)+1 disables the filter.
    min_df=1, max_df=len(corpus)+1, max_features=None,
    ngram_range=(1, 1))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement and yields the same column labels.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())
df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,0.353553,0.353553,0.0,0.707107,0.0,0.353553,0.0,0.0,0.353553
1,0.333333,0.333333,0.0,0.0,0.0,0.666667,0.0,0.333333,0.0,0.333333,0.333333
2,0.0,0.0,0.0,0.377964,0.755929,0.0,0.377964,0.0,0.377964,0.0,0.0
3,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Tf-Idf
#
# NOTE(review): this cell is headed "Tf-Idf" but passes use_idf=False, so no IDF
# weighting is applied and the result is plain L2-normalized term frequency.
# Set use_idf=True to get actual tf-idf weights; the stored output below this
# cell reflects use_idf=False.

from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(
    use_idf=False, norm='l2',
    lowercase=True,
    stop_words='english',
    # Integer max_df = absolute document count; len(corpus)+1 disables the filter.
    min_df=1, max_df=len(corpus)+1, max_features=None,
    ngram_range=(1, 1))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement and yields the same column labels.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())
df

Unnamed: 0,12,bedroom,big,brown,dog,house,likes,number,play,small,street
0,0.0,0.0,0.353553,0.353553,0.0,0.707107,0.0,0.353553,0.0,0.0,0.353553
1,0.333333,0.333333,0.0,0.0,0.0,0.666667,0.0,0.333333,0.0,0.333333,0.333333
2,0.0,0.0,0.0,0.377964,0.755929,0.0,0.377964,0.0,0.377964,0.0,0.0
3,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Second corpus (from BB): three passages that all mention "Python" but in
# unrelated senses. The texts are kept verbatim, including line breaks and
# the bracketed citation markers.
corpus = [
    # Python, the serpent of Greek mythology
    """In Greek mythology, Python (Greek: Πύθων, gen.: Πύθωνος) was the earth-dragon of 
Delphi, always represented in Greek sculpture and vase-paintings as a serpent. He presided at the 
Delphic oracle, which existed in the cult center for his mother, Gaia, "Earth," Pytho being the 
place name that was substituted for the earlier Krisa.[1] Hellenes considered the site to be the 
center of the earth, represented by a stone, the omphalos or navel, which Python guarded.""",

    # Monty Python, the comedy group
    """Monty Python (sometimes known as The Pythons)[2][3] were a British surreal comedy 
group who created the sketch comedy show Monty Python's Flying Circus, that first aired on the BBC on 
October 5, 1969. Forty-five episodes were made over four series. The Python phenomenon developed from 
the television series into something larger in scope and impact, spawning touring stage shows, films, 
numerous albums, several books, and a stage musical. The group's influence on comedy has been compared 
to The Beatles' influence on music.""",

    # Python, the programming language
    """Python is a widely used general-purpose, high-level programming language.[19][20] 
Its design philosophy emphasizes code readability, and its syntax allows programmers to express 
concepts in fewer lines of code than would be possible in languages such as C++ or Java.[21][22] 
The language provides constructs intended to enable clear programs on both a small and large scale.""",
]

# Keep the individual documents addressable by name as well.
document1, document2, document3 = corpus


# Tf-idf weights for the current corpus: term frequency scaled by (smoothed)
# inverse document frequency, with every document row normalized to unit L2 length.
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(
    use_idf=True, smooth_idf=True, norm='l2',
    lowercase=True,
    stop_words='english',
    # Integer max_df = absolute document count; len(corpus)+1 disables the filter.
    min_df=1, max_df=len(corpus)+1, max_features=None,
    ngram_range=(1, 1))

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is the replacement and yields the same column labels.
df = pd.DataFrame(tv.fit_transform(corpus).toarray(), columns=tv.get_feature_names_out())
df

Unnamed: 0,19,1969,20,21,22,aired,albums,allows,bbc,beatles,...,substituted,surreal,syntax,television,touring,used,vase,widely,πύθων,πύθωνος
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.133161,0.0,0.0,0.0,0.0,0.0,0.133161,0.0,0.133161,0.133161
1,0.0,0.126858,0.0,0.0,0.0,0.126858,0.126858,0.0,0.126858,0.126858,...,0.0,0.126858,0.0,0.126858,0.126858,0.0,0.0,0.0,0.0,0.0
2,0.153667,0.0,0.153667,0.153667,0.153667,0.0,0.0,0.153667,0.0,0.0,...,0.0,0.0,0.153667,0.0,0.0,0.153667,0.0,0.153667,0.0,0.0


In [45]:
# List the highest-weighted terms of every document (one row of `df` each).
# Iterating over df.index instead of hard-coding rows 0..2 keeps this cell
# correct when the corpus grows or shrinks.
TOP_N = 5  # number of terms shown per document

for doc_id in df.index:
    print(f"\ndocument {doc_id}:")
    # Sort the row's weights in descending order and keep the first TOP_N.
    print(df.loc[doc_id].sort_values(ascending=False).iloc[:TOP_N])


document 0:
greek          0.399484
earth          0.399484
represented    0.266323
center         0.266323
python         0.157295
Name: 0, dtype: float64

document 1:
comedy       0.380573
group        0.253715
stage        0.253715
series       0.253715
influence    0.253715
Name: 1, dtype: float64

document 2:
language       0.307333
code           0.307333
general        0.153667
programming    0.153667
programmers    0.153667
Name: 2, dtype: float64
