### Feature Extraction

In [None]:
from sklearn.feature_extraction import DictVectorizer

In [8]:
# Sample records mixing a categorical field ('city') with a numeric one
# ('temperature'); this is the input handed to DictVectorizer.
# NOTE: 'San Fransisco' is a misspelling of 'San Francisco'. It is kept
# verbatim because the recorded feature names in this notebook use the
# same spelling.
measurements = [
    {'city': 'Dubai', 'temperature': 33.0},
    {'city': 'London', 'temperature': 12.0},
    {'city': 'San Fransisco', 'temperature': 18.0},
    {'city': 'New York', 'temperature': 10.0},
]

In [13]:
# Fit the vectorizer on the list of dicts and display the result as a dense
# array. Each distinct string value of 'city' becomes its own indicator
# column, while the numeric 'temperature' value is carried over as a single
# column (see the recorded output and feature names below).
vec = DictVectorizer()
vec.fit_transform(measurements).toarray()

array([[  1.,   0.,   0.,   0.,  33.],
       [  0.,   1.,   0.,   0.,  12.],
       [  0.,   0.,   0.,   1.,  18.],
       [  0.,   0.,   1.,   0.,  10.]])

In [12]:
# Column labels of the matrix produced by `vec`, in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# wrap it in list() to keep the displayed value a plain list of strings.
list(vec.get_feature_names_out())

['city=Dubai',
 'city=London',
 'city=New York',
 'city=San Fransisco',
 'temperature']

In [14]:
# Toy corpus of four short documents; every text vectorizer in the cells
# below is fitted on this list.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
# Bag-of-words counts for the corpus: one row per document, one column per
# vocabulary word. min_df=1 keeps every term that appears in at least one
# document.
vectorizer = CountVectorizer(min_df=1)
# Convert the fitted result to a dense array so it can be displayed and
# indexed directly.
X = vectorizer.fit_transform(corpus).toarray()
X

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [29]:
# 1-grams: the vocabulary words, listed in the column order of X.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# wrap it in list() to keep the displayed value a plain list of strings.
list(vectorizer.get_feature_names_out())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [35]:
# Count bigrams only: ngram_range=(2, 2) restricts the features to sequences
# of exactly two tokens. The custom token_pattern (\b\w+\b) matches words of
# one or more characters, so single-character words would be kept as tokens.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', min_df=1)
# Dense document-by-bigram count matrix.
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2

array([[0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]], dtype=int64)

In [36]:
# 2-grams: the bigram vocabulary, listed in the column order of X_2.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# wrap it in list() to keep the displayed value a plain list of strings.
list(bigram_vectorizer.get_feature_names_out())

['and the',
 'first document',
 'is the',
 'is this',
 'second document',
 'second second',
 'the first',
 'the second',
 'the third',
 'third one',
 'this is',
 'this the']

In [38]:
# Look up the column that counts the bigram 'second document' and display
# that column: one count per document in the corpus.
# Index the vocabulary directly rather than with .get(): .get() returns None
# for an unknown bigram, and X_2[:, None] would then silently insert a new
# axis instead of failing. Direct indexing raises KeyError on a bad key.
feature_index = bigram_vectorizer.vocabulary_['second document']
X_2[:, feature_index]

array([0, 1, 0, 0], dtype=int64)

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer

In [40]:
# Tf-idf re-weighting for a precomputed count matrix. smooth_idf=False turns
# off scikit-learn's idf smoothing (see the TfidfTransformer documentation
# for the exact formula used in each mode).
transformer = TfidfTransformer(smooth_idf=False)

In [41]:
# Hand-written term-count matrix fed to the TfidfTransformer:
# six rows (documents) by three columns (terms).
counts = [
    [3, 0, 1],
    [2, 0, 0],
    [3, 0, 0],
    [4, 0, 0],
    [3, 2, 0],
    [3, 0, 2],
]

In [44]:
# Learn the idf weights from `counts` and apply them in one step, then
# convert the result to a dense array for display. In the recorded output
# below, every row has unit Euclidean length.
tfidf = transformer.fit_transform(counts).toarray()
tfidf

array([[ 0.81940995,  0.        ,  0.57320793],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.47330339,  0.88089948,  0.        ],
       [ 0.58149261,  0.        ,  0.81355169]])

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
# TfidfVectorizer is fitted directly on the raw text corpus here, rather than
# on a precomputed count matrix.
# NOTE(review): this rebinds `vectorizer`, replacing the CountVectorizer
# created in an earlier cell; cells that rely on the old object must be re-run
# in order.
vectorizer = TfidfVectorizer(min_df=1)
# NOTE(review): the dense tf-idf matrix computed on the next line is neither
# assigned nor displayed; the call is kept only for its fitting side effect.
vectorizer.fit_transform(corpus).toarray()
# The cell's displayed value is the estimator itself (its parameter repr).
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

### Feature Selection