## Representing documents as a bag of words

In [None]:
# usual imports
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# Fetch the Punkt sentence-tokenizer data used by the loader below.
nltk.download('punkt')
# Load the English Punkt model once; divide_into_sentences_nltk() reuses this object.
# NOTE(review): this resource is a pickle file, and newer NLTK releases reportedly
# distribute Punkt as 'punkt_tab' instead — confirm this path still loads on the
# NLTK version in use.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# helper functions
def read_text_file(filename):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # Use a context manager so the handle is closed as soon as the read
    # finishes, even if read() raises.
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()

def preprocess_text(text):
    """Flatten text onto one line by turning every newline into a space."""
    return text.replace("\n", " ")

def divide_into_sentences_nltk(text):
    """Split text into sentences using the module-level Punkt tokenizer."""
    return tokenizer.tokenize(text)

In [None]:
# extract the sentences from the sample text
def get_sentences(filename):
    """Read a text file and return its contents split into sentences.

    The file is read, its newlines are replaced by spaces, and the
    flattened text is handed to the NLTK sentence splitter.
    """
    flattened = preprocess_text(read_text_file(filename))
    return divide_into_sentences_nltk(flattened)

In [None]:
# Upload NLP_syllabus_blog_post.txt to Colab (this cell only works inside Colab).
# NOTE(review): the recorded output shows a repeat upload being saved as
# "NLP_syllabus_blog_post (1).txt", while the next cell reads
# "NLP_syllabus_blog_post.txt" — confirm the intended copy is the one being read.
from google.colab import files
uploaded = files.upload()

Saving NLP_syllabus_blog_post.txt to NLP_syllabus_blog_post (1).txt


In [None]:
# Split the uploaded post into sentences; the bare name on the last line displays the result.
sentences = get_sentences("NLP_syllabus_blog_post.txt")
sentences

['I am currently starting work on developing an undergraduate module in Natural Language Processing (level 6, 3rd year).',
 'Although I have been involved in the field of NLP for many years, recent times have witnessed a transformation of the field, not just in terms of its academic foundations, but also its practical application in industry and its attractiveness as a fulfilling and rewarding career choice.',
 'My sense is that some of the topics which I originally studied for my doctorate retain their appeal since the key ideas remain relevant despite radical changes in the implementation.',
 'However, others are more hostage to the technological fortunes of deep learning and other neural/distributional approaches.',
 'My view is that field benefits by being informed by more than one perspective: computer/data science may be a given, but cognitive science, information science and linguistics all have their contributions to make.',
 'Clearly, it is a tricky task to pack all this into 

In [None]:
# Create a CountVectorizer and fit it to the data.
# fit_transform() learns the vocabulary from the sentences and returns the
# sentence-by-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)

In [None]:
# Print the sparse matrix: each line is (docID, termID) followed by the count,
# e.g. "(0, 6)  1" means term 6 appears once in doc 0. Zero counts are not listed.
print(X)

  (0, 6)	1
  (0, 30)	1
  (0, 97)	1
  (0, 118)	1
  (0, 77)	1
  (0, 34)	1
  (0, 7)	1
  (0, 114)	1
  (0, 69)	1
  (0, 51)	1
  (0, 72)	1
  (0, 62)	1
  (0, 85)	1
  (0, 64)	1
  (0, 1)	1
  (0, 119)	1
  (1, 51)	3
  (1, 5)	1
  (1, 45)	2
  (1, 16)	1
  (1, 56)	1
  (1, 104)	2
  (1, 38)	2
  (1, 76)	3
  (1, 74)	1
  :	:
  (5, 84)	1
  (5, 8)	2
  (5, 57)	1
  (5, 111)	1
  (5, 110)	2
  (5, 83)	1
  (5, 3)	1
  (5, 25)	1
  (5, 58)	1
  (5, 113)	1
  (5, 99)	1
  (5, 82)	1
  (5, 108)	1
  (5, 55)	1
  (5, 0)	1
  (5, 36)	1
  (5, 95)	1
  (5, 42)	1
  (5, 19)	1
  (5, 106)	1
  (6, 71)	1
  (6, 57)	1
  (6, 46)	1
  (6, 29)	1
  (6, 107)	1


In [None]:
# fit_transform() returned a SciPy sparse matrix rather than a NumPy array
type(X)

scipy.sparse.csr.csr_matrix

In [None]:
# Convert to a dense matrix so that every count, including the zeros, is stored explicitly.
denseX = X.todense()

In [None]:
# check which type todense() produced
type(denseX)

numpy.matrix

In [None]:
# one row per sentence, one column per vocabulary term
denseX

matrix([[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
        [0, 0, 1, 0, 1, 1, 0, 0, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
         0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1,
         0, 1, 0, 2, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2,
         0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
         0, 0, 0, 0,

In [None]:
# (number of sentences, vocabulary size)
denseX.shape

(7, 121)

In [None]:
# List the vocabulary; a term's position in this list is its termID (column index).
# get_feature_names() was removed in scikit-learn 1.2, so use get_feature_names_out();
# .tolist() turns the returned array into a plain list of strings for printing.
print(vectorizer.get_feature_names_out().tolist())

['10', '3rd', 'academic', 'all', 'also', 'although', 'am', 'an', 'and', 'appeal', 'application', 'approaches', 'are', 'as', 'attractiveness', 'be', 'been', 'being', 'benefits', 'both', 'but', 'by', 'career', 'changes', 'choice', 'clearly', 'cognitive', 'computer', 'contributions', 'current', 'currently', 'data', 'deep', 'despite', 'developing', 'distributional', 'do', 'doctorate', 'field', 'for', 'fortunes', 'foundations', 'from', 'fulfilling', 'given', 'have', 'here', 'hostage', 'however', 'ideas', 'implementation', 'in', 'industry', 'information', 'informed', 'into', 'involved', 'is', 'it', 'its', 'just', 'key', 'language', 'learning', 'level', 'linguistics', 'make', 'many', 'may', 'module', 'more', 'my', 'natural', 'neural', 'nlp', 'not', 'of', 'on', 'one', 'originally', 'other', 'others', 'pack', 'perspective', 'practical', 'processing', 'radical', 'recent', 'relevant', 'remain', 'retain', 'rewarding', 'science', 'sense', 'since', 'so', 'some', 'starting', 'studied', 'task', 'techn



In [None]:
# Create a new sentence and vectorize it with the already-fitted vectorizer.
# transform() takes an iterable of documents, hence the one-element list.
new_sentence = "Now is the winter of our discontent."
new_sentence_vector = vectorizer.transform([new_sentence])

In [None]:
# Only 3 of these words are in the vocabulary (is, the, of); the remaining
# words get no entry in the vector.
print(new_sentence_vector)
print(new_sentence_vector.todense())

  (0, 57)	1
  (0, 76)	1
  (0, 104)	1
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [None]:
# Create a CountVectorizer that ignores stop words (scikit-learn's built-in English list).
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(sentences)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it, and .tolist() prints the vocabulary as a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

['10', '3rd', 'academic', 'appeal', 'application', 'approaches', 'attractiveness', 'benefits', 'career', 'changes', 'choice', 'clearly', 'cognitive', 'computer', 'contributions', 'current', 'currently', 'data', 'deep', 'despite', 'developing', 'distributional', 'doctorate', 'field', 'fortunes', 'foundations', 'fulfilling', 'given', 'hostage', 'ideas', 'implementation', 'industry', 'information', 'informed', 'involved', 'just', 'key', 'language', 'learning', 'level', 'linguistics', 'make', 'module', 'natural', 'neural', 'nlp', 'originally', 'pack', 'perspective', 'practical', 'processing', 'radical', 'recent', 'relevant', 'remain', 'retain', 'rewarding', 'science', 'sense', 'starting', 'studied', 'task', 'technological', 'terms', 'theoretical', 'thinking', 'times', 'topics', 'transformation', 'tricky', 'undergraduate', 'view', 'witnessed', 'work', 'year', 'years']


In [None]:
# the column count is the vocabulary size after dropping stop words
X.shape

(7, 76)

In [None]:
# Or we could limit the vocabulary using a document-frequency cutoff.
# max_df=1 is an integer, so it is an absolute count: any term that occurs in
# more than one sentence is dropped. (A float max_df=1.0 would instead mean
# "100% of documents" and filter nothing.)
vectorizer = CountVectorizer(max_df=1)
X = vectorizer.fit_transform(sentences)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it, and .tolist() prints the vocabulary as a plain list of strings.
print(vectorizer.get_feature_names_out().tolist())

['10', '3rd', 'academic', 'also', 'although', 'am', 'an', 'appeal', 'application', 'approaches', 'are', 'as', 'attractiveness', 'be', 'been', 'being', 'benefits', 'both', 'by', 'career', 'changes', 'choice', 'clearly', 'cognitive', 'computer', 'contributions', 'current', 'currently', 'data', 'deep', 'despite', 'developing', 'distributional', 'do', 'doctorate', 'fortunes', 'foundations', 'from', 'fulfilling', 'given', 'here', 'hostage', 'however', 'ideas', 'implementation', 'industry', 'information', 'informed', 'into', 'involved', 'it', 'its', 'key', 'language', 'learning', 'level', 'linguistics', 'make', 'many', 'may', 'module', 'natural', 'neural', 'nlp', 'not', 'on', 'one', 'originally', 'other', 'others', 'pack', 'processing', 'radical', 'recent', 'relevant', 'remain', 'retain', 'rewarding', 'science', 'sense', 'since', 'so', 'some', 'starting', 'studied', 'task', 'technological', 'terms', 'than', 'theoretical', 'thinking', 'this', 'times', 'transformation', 'tricky', 'undergraduat

In [None]:
# the column count is the vocabulary size after the max_df cutoff
X.shape

(7, 102)