# CountVectorizer

`CountVectorizer` is a class that builds a dictionary (vocabulary) of key words, assigns each word an integer label, and counts word frequencies. The `.fit_transform(...)` method builds the dictionary and returns the frequencies; the `.transform(...)` method checks which words from the already-built dictionary are present in a given string and how often they occur.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 

In [2]:
# Toy corpus: five short sentences, one document per list element.
docs = [
    "the house had a tiny little mouse mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

In [3]:
# Bag-of-words counter: English stop words are dropped and the vocabulary
# is capped at 16 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=16,
)

### let's create a sparse matrix which represents the frequencies of words in the lines/elements of docs:

### cv.fit_transform(docs)

In [4]:
# Learn the vocabulary from `docs` and return the per-document word counts
# as a sparse matrix (one row per document, one column per vocabulary term).
word = cv.fit_transform(docs)

### the dictionary of important/unique words is formed (English stop words such as 'the' are dropped); each word is given a unique label, which is its column index in the matrix:

In [5]:
# Mapping from each kept term to its integer label (column index).
cv.vocabulary_

{'house': 5,
 'tiny': 11,
 'little': 6,
 'mouse': 7,
 'cat': 2,
 'saw': 9,
 'ran': 8,
 'away': 1,
 'finally': 4,
 'ate': 0,
 'end': 3,
 'story': 10}

### `word` is a sparse matrix:

In [6]:
# Check the container type returned by fit_transform.
type(word)

scipy.sparse.csr.csr_matrix

In [7]:
# Printing a sparse matrix lists only its non-zero entries,
# one per line, as "(row, column)  count".
print(word)

  (0, 5)	1
  (0, 11)	1
  (0, 6)	1
  (0, 7)	2
  (1, 7)	1
  (1, 2)	1
  (1, 9)	1
  (2, 5)	1
  (2, 7)	1
  (2, 8)	1
  (2, 1)	1
  (3, 7)	1
  (3, 2)	1
  (3, 4)	1
  (3, 0)	1
  (4, 7)	1
  (4, 3)	1
  (4, 10)	1


'(0, 5)	1' means that in docs[0] which is "the house had a tiny little mouse mouse" the word with label 5 ('house') appears 1 time

'(0, 7)	2' means that in docs[0] which is "the house had a tiny little mouse mouse" the word with label 7 ('mouse') appears 2 times

### cv.transform(...)

In [8]:
# docs[:1] is a one-element list, so transform() receives a list of
# documents and returns a one-row count matrix for the first sentence.
print(docs[:1])
print(cv.transform(docs[:1]))

['the house had a tiny little mouse mouse']
  (0, 5)	1
  (0, 6)	1
  (0, 7)	2
  (0, 11)	1


In [9]:
# Vocabulary terms in alphabetical order.
sorted(cv.vocabulary_.keys())

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'house',
 'little',
 'mouse',
 'ran',
 'saw',
 'story',
 'tiny']

### let's create a frequency table of the key words in each line:

In [10]:
# Column j of `word` holds the counts for the term whose label is j, so the
# column names must be listed in label order. cv.vocabulary_.keys() is in
# first-seen order (house, tiny, little, ...) and would mislabel the columns.
# Sorting the terms by their label keeps the names aligned with the matrix
# without assuming that alphabetical order equals label order.
pd.DataFrame.sparse.from_spmatrix(word, columns=sorted(cv.vocabulary_, key=cv.vocabulary_.get))

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
0,0,0,0,0,0,1,1,2,0,0,0,1
1,0,0,1,0,0,0,0,1,0,1,0,0
2,0,1,0,0,0,1,0,1,1,0,0,0
3,1,0,1,0,1,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,0,1,0


In [11]:
# (number of documents, number of vocabulary terms)
word.shape

(5, 12)

In [12]:
# Transform a new string with the already-fitted vocabulary: each known
# word is counted as many times as it occurs in the string.
print(cv.transform(["house house house mouse"]))

  (0, 5)	3
  (0, 7)	1


# TFiDF

tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general.

tf = number of times that the term occurs in document / total number of words in document

idf = log (total number of documents / number of documents that contain the term)

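tf-idf = tf * idf

Note: scikit-learn's `TfidfVectorizer` (with its default settings) uses a slightly different variant of these formulas: tf is the raw count of the term in the document, idf is smoothed as ln((1 + total number of documents) / (1 + number of documents that contain the term)) + 1, and each document's vector is then scaled to unit Euclidean (L2) length. This is why 'mouse', which occurs in every document, still receives a non-zero weight in the output below.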

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# tf-idf vectorizer configured like the CountVectorizer above:
# English stop words removed, vocabulary capped at 16 terms.
tf = TfidfVectorizer(
    max_features=16,
    stop_words='english',
)

In [15]:
# fit_transform(docs) is the one-step equivalent of calling tf.fit(docs)
# followed by word = tf.transform(docs).
# NOTE: this rebinds `word`, which until now held the CountVectorizer counts;
# re-running earlier cells after this one will show tf-idf weights instead.
word = tf.fit_transform(docs)

In [16]:
# Term -> column index mapping learned by the TfidfVectorizer.
tf.vocabulary_

{'house': 5,
 'tiny': 11,
 'little': 6,
 'mouse': 7,
 'cat': 2,
 'saw': 9,
 'ran': 8,
 'away': 1,
 'finally': 4,
 'ate': 0,
 'end': 3,
 'story': 10}

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the vocabulary
# terms as an array ordered by column index.
tf.get_feature_names_out()

In [17]:
# Vocabulary terms in alphabetical order.
sorted(tf.vocabulary_.keys())

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'house',
 'little',
 'mouse',
 'ran',
 'saw',
 'story',
 'tiny']

In [18]:
# `word` now holds the tf-idf matrix; printing it lists the non-zero
# weights, one per line, as "(row, column)  weight".
print(word)

  (0, 7)	0.5051552983522339
  (0, 6)	0.5300625584039018
  (0, 11)	0.5300625584039018
  (0, 5)	0.4276511345750014
  (1, 9)	0.7297183669435993
  (1, 2)	0.5887321837696324
  (1, 7)	0.3477147117091919
  (2, 1)	0.5894630806320427
  (2, 8)	0.5894630806320427
  (2, 7)	0.2808823162882302
  (2, 5)	0.47557510189256375
  (3, 0)	0.5894630806320427
  (3, 4)	0.5894630806320427
  (3, 2)	0.47557510189256375
  (3, 7)	0.2808823162882302
  (4, 10)	0.6700917930430479
  (4, 3)	0.6700917930430479
  (4, 7)	0.3193023297639811


In [19]:
# Name the columns by sorting the terms on their column index, so the labels
# stay aligned with the matrix without assuming that alphabetical order
# equals index order.
pd.DataFrame.sparse.from_spmatrix(word, columns=sorted(tf.vocabulary_, key=tf.vocabulary_.get))

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
0,0.0,0.0,0.0,0.0,0.0,0.427651,0.530063,0.505155,0.0,0.0,0.0,0.530063
1,0.0,0.0,0.588732,0.0,0.0,0.0,0.0,0.347715,0.0,0.729718,0.0,0.0
2,0.0,0.589463,0.0,0.0,0.0,0.475575,0.0,0.280882,0.589463,0.0,0.0,0.0
3,0.589463,0.0,0.475575,0.0,0.589463,0.0,0.0,0.280882,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.670092,0.0,0.0,0.0,0.319302,0.0,0.0,0.670092,0.0
