#### Let's take some sample sentences and try converting them to vectors

In [20]:
# Toy corpus: five short sentences that share words (train, bus, Mary,
# Samantha, ...) so the resulting vectors overlap in visible ways.
sample_sentences = [
    "Joe waited for the train",
    "The train was late.",
    "Mary and Samantha took the bus.",
    "I looked for Mary and Samantha at the bus station.",
    "Mary and Samantha arrived at the bus station early but waited until noon for the bus.",
]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: learn the vocabulary from the corpus first, then
# encode each sentence as a row of per-term counts.
vec = CountVectorizer()
vec.fit(sample_sentences)
X = vec.transform(sample_sentences)

In [22]:
# Convert the document-term matrix to a dense array for display:
# one row per sentence, one column per vocabulary term.
X.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 1, 1, 0]],
      dtype=int64)

#### A table view helps to understand the result better: the columns are the vocabulary words and each row holds the word counts for one sentence

In [23]:
import pandas as pd

# Look up the term behind each column. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# Label the dense count matrix with those terms so each column is readable.
pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,and,arrived,at,bus,but,early,for,joe,late,looked,mary,noon,samantha,station,the,took,train,until,waited,was
0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,1,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1
2,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0
3,1,0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,0,0,0,0
4,1,1,1,2,1,1,1,0,0,0,1,1,1,1,2,0,0,1,1,0


#### Using the term frequency-inverse document frequency (TF-IDF) vectorizer, which weights how often a word appears in a document by how rare that word is across all documents

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-encode the same corpus with TF-IDF weights instead of raw counts.
# Note: this rebinds `vec` and `X` from the earlier count-vectorizer cell.
vec = TfidfVectorizer()
X = vec.fit_transform(sample_sentences)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available, else fall back.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# One row per sentence, one weighted column per vocabulary term.
pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,and,arrived,at,bus,but,early,for,joe,late,looked,mary,noon,samantha,station,the,took,train,until,waited,was
0,0.0,0.0,0.0,0.0,0.0,0.0,0.388123,0.579537,0.0,0.0,0.0,0.0,0.0,0.0,0.276152,0.0,0.467567,0.0,0.467567,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.589463,0.0,0.0,0.0,0.0,0.0,0.280882,0.0,0.475575,0.0,0.0,0.589463
2,0.385305,0.0,0.0,0.385305,0.0,0.0,0.0,0.0,0.0,0.0,0.385305,0.0,0.385305,0.0,0.274148,0.575329,0.0,0.0,0.0,0.0
3,0.306593,0.0,0.369349,0.306593,0.0,0.0,0.306593,0.0,0.0,0.457799,0.306593,0.0,0.306593,0.369349,0.218143,0.0,0.0,0.0,0.0,0.0
4,0.197926,0.295539,0.238439,0.395852,0.295539,0.295539,0.197926,0.0,0.0,0.0,0.197926,0.295539,0.197926,0.238439,0.281652,0.0,0.0,0.295539,0.238439,0.0


#### Remove stop words by passing the `stop_words` parameter

In [25]:
# Same TF-IDF encoding, but with the built-in English stop-word list so that
# common words such as 'and', 'for' and 'the' no longer get their own column.
# Note: this rebinds `vec` and `X` again.
vec = TfidfVectorizer(stop_words='english')
X = vec.fit_transform(sample_sentences)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available, else fall back.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,arrived,bus,early,joe,late,looked,mary,noon,samantha,station,took,train,waited
0,0.0,0.0,0.0,0.659118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531772,0.531772
1,0.0,0.0,0.0,0.0,0.778283,0.0,0.0,0.0,0.0,0.0,0.0,0.627914,0.0
2,0.0,0.437287,0.0,0.0,0.0,0.0,0.437287,0.0,0.437287,0.0,0.652948,0.0,0.0
3,0.0,0.386887,0.0,0.0,0.0,0.577691,0.386887,0.0,0.386887,0.466078,0.0,0.0,0.0
4,0.378156,0.506511,0.378156,0.0,0.0,0.0,0.253255,0.378156,0.253255,0.305094,0.0,0.0,0.305094


In [26]:
# The analyzer is the callable the vectorizer applies to each raw string
# before weighting; running it on one sentence shows the tokens that
# survive (lower-cased, with the stop words dropped).
analyze = vec.build_analyzer()
joe_tokens = analyze('Joe waited for the train')
joe_tokens

[u'joe', u'waited', u'train']

In [27]:
# Column index assigned to the token 'train' in the fitted vocabulary.
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(vec.vocabulary_.get('train'))

11
