Reading Text Files:

In [10]:
#Read the whole of One.txt into a single string; the with-block closes the file afterwards
with open('One.txt') as story_file:
    a = story_file.read()

In [6]:
a

'This is a story about dogs\nour canine pets\nDogs are furry animals\n'

In [7]:
print(a)

This is a story about dogs
our canine pets
Dogs are furry animals



In [8]:
a.lower().split() #Lowercase the text, then split it on whitespace to get the individual words from One.txt

['this',
 'is',
 'a',
 'story',
 'about',
 'dogs',
 'our',
 'canine',
 'pets',
 'dogs',
 'are',
 'furry',
 'animals']

Obtaining Vocabularies From Text Files Manually:

In [None]:
#Read One.txt as a list of lowercase words; the file is closed as soon as the with-block ends
with open('One.txt') as mytext:
    words_one = mytext.read().lower().split()

#set() keeps exactly one copy of each word. The result is a set (not a list), and sets have
#no guaranteed order - when the notebook displays a set alphabetically, that sorting is done
#by the display layer, not by the set itself
uni_words_one = set(words_one)

In [13]:
uni_words_one

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'dogs',
 'furry',
 'is',
 'our',
 'pets',
 'story',
 'this'}

In [16]:
#Read Two.txt as a list of lowercase words, then reduce it to its unique words
with open('Two.txt') as second_file:
    words_two = second_file.read().lower().split()
uni_words_two = set(words_two)

In [17]:
uni_words_two

{'a',
 'about',
 'catching',
 'fun',
 'is',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [None]:
#Start from an empty set and merge both per-file vocabularies into it
#set.update modifies the set in place; it accepts any number of iterables (such as sets or lists)
#and adds their elements, ignoring duplicates
all_uni_words = set()
all_uni_words.update(uni_words_one, uni_words_two)

In [19]:
all_uni_words

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'catching',
 'dogs',
 'fun',
 'furry',
 'is',
 'our',
 'pets',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [20]:
#Assigning an integer index to every unique word, stored as a {word: index} dictionary
#enumerate supplies the running index, so no manual counter variable is needed
full_vocab = {word: i for i, word in enumerate(all_uni_words)}

In [None]:
full_vocab #Note: Sets are unordered, so the cell above visits the words in an arbitrary order and the index each word receives may differ from run to run

{'our': 0,
 'furry': 1,
 'fun': 2,
 'pets': 3,
 'a': 4,
 'animals': 5,
 'dogs': 6,
 'this': 7,
 'sport': 8,
 'water': 9,
 'popular': 10,
 'about': 11,
 'canine': 12,
 'waves': 13,
 'are': 14,
 'is': 15,
 'story': 16,
 'catching': 17,
 'surfing': 18}

Calculating Word Frequencies In Each File:

In [22]:
#One zero-filled counter slot per vocabulary word for each file, plus one empty label slot per word
vocab_size = len(full_vocab)
one_freq = [0 for _ in range(vocab_size)]
two_freq = [0 for _ in range(vocab_size)]
all_words = ['' for _ in range(vocab_size)]

In [23]:
one_freq

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [24]:
two_freq

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [25]:
#Load One.txt again, this time keeping the full (non-unique) list of lowercase words for counting
with open('One.txt') as f:
    one_raw = f.read()
one_text = one_raw.lower().split()

In [26]:
#Tally every word of One.txt in the slot that full_vocab assigns to it
#Note: the counts accumulate in place, so re-running this cell without resetting one_freq counts the words again
for word in one_text:
    one_freq[full_vocab[word]] += 1

In [30]:
one_freq

[1, 1, 0, 1, 1, 1, 2, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0]

In [27]:
#Load Two.txt, keeping the full (non-unique) list of lowercase words for counting
with open('Two.txt') as f:
    two_raw = f.read()
two_text = two_raw.lower().split()

In [28]:
two_text

['this',
 'story',
 'is',
 'about',
 'surfing',
 'catching',
 'waves',
 'is',
 'fun',
 'surfing',
 'is',
 'a',
 'popular',
 'water',
 'sport']

In [29]:
#Tally every word of Two.txt in the slot that full_vocab assigns to it
#Note: the counts accumulate in place, so re-running this cell without resetting two_freq counts the words again
for word in two_text:
    two_freq[full_vocab[word]] += 1

In [31]:
two_freq

[0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 3, 1, 1, 2]

In [32]:
all_words

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']

In [34]:
#Invert the vocabulary: place each word at the list position given by its index in full_vocab
for word, word_ind in full_vocab.items():
    all_words[word_ind] = word

In [35]:
all_words

['our',
 'furry',
 'fun',
 'pets',
 'a',
 'animals',
 'dogs',
 'this',
 'sport',
 'water',
 'popular',
 'about',
 'canine',
 'waves',
 'are',
 'is',
 'story',
 'catching',
 'surfing']

In [None]:
import pandas as pd
#Build the complete 'bag of words' as a dataframe: one row per document, one column per
#vocabulary word, each cell holding how often that word occurs in that document
word_counts = [one_freq, two_freq]
bow = pd.DataFrame(word_counts, columns=all_words)

In [38]:
bow

Unnamed: 0,our,furry,fun,pets,a,animals,dogs,this,sport,water,popular,about,canine,waves,are,is,story,catching,surfing
0,1,1,0,1,1,1,2,1,0,0,0,1,1,0,1,1,1,0,0
1,0,0,1,0,1,0,0,1,1,1,1,1,0,1,0,3,1,1,2


Extracting Features From Text Using Scikit-Learn:

In [39]:
#Three short sample documents used to demonstrate the scikit-learn vectorizers
text = [
    'This is a line',
    'This is another line',
    'Completely different line',
]

Counting Word Frequency With CountVectorizer:

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [42]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts.
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  For an efficiency comparison of the different feature extractors, see
 |  :ref:`sphx_glr_au

In [43]:
cv = CountVectorizer() #Creating an instance of the CountVectorizer object

In [None]:
cv.fit_transform(text) #Fit and transform in one call, the same pattern used with StandardScaler
#Each entry of the list is treated as an individual document
#Notice that the result is stored as a sparse matrix; in its shape, the first number, 3, is the number of documents (rows)
#The second number, 6, is the number of unique words (columns) whose frequencies were counted
#With the default token_pattern, one-character tokens such as 'a' are not counted as words

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10 stored elements and shape (3, 6)>

In [48]:
sparse_matrix = cv.fit_transform(text)

In [None]:
sparse_matrix.todense() #todense writes out every entry, zeros included, as a full 3 x 6 matrix; each column corresponds to the word with that index in cv.vocabulary_

matrix([[0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 1, 1],
        [0, 1, 1, 0, 1, 0]])

In [None]:
cv.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}

In [None]:
cv = CountVectorizer(stop_words = 'english')
#stop_words = 'english' removes very common English words, such as 'the', 'this', 'is', etc., using scikit-learn's built-in list
#You can also pass in a custom list of stop words to exclude

In [52]:
cv.fit_transform(text)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 5 stored elements and shape (3, 3)>

In [53]:
sparse_matrix = cv.fit_transform(text)

In [54]:
sparse_matrix.todense()

matrix([[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1]])

In [55]:
cv.vocabulary_

{'line': 2, 'completely': 0, 'different': 1}

The Term Frequency - Inverse Document Frequency Transformer:

In [58]:
cv = CountVectorizer() #Reverting the CountVectorizer to all words
sparse_matrix = cv.fit_transform(text)

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf = TfidfTransformer() #This converts a matrix of raw word counts (a 'bag of words') into TF-IDF weights

In [59]:
results = tfidf.fit_transform(sparse_matrix)

In [60]:
results

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (3, 6)>

In [61]:
results.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

All In One Step With TfidfVectorizer:

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
tv = TfidfVectorizer()

In [64]:
tv_results = tv.fit_transform(text)

In [65]:
tv_results

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (3, 6)>

In [66]:
tv_results.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])