In [104]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def wm2df(wm, feat_names):
    """Convert a sparse document-term matrix into a labelled DataFrame.

    Parameters
    ----------
    wm : scipy sparse matrix
        Document-term matrix with one row per document; it must expose
        ``shape`` and ``toarray()``.
    feat_names : sequence of str
        Column labels, one per column of ``wm``.

    Returns
    -------
    pandas.DataFrame
        Dense copy of ``wm`` indexed by ``Doc0``, ``Doc1``, ... with
        ``feat_names`` as its columns.
    """
    # Take the row count from the shape: iterating a sparse matrix slices
    # out a new one-row matrix per document only to throw it away.
    doc_names = ['Doc{:d}'.format(idx) for idx in range(wm.shape[0])]
    return pd.DataFrame(data=wm.toarray(), index=doc_names, columns=feat_names)

# set of documents; each one ends with an HTML character reference
# (&#x0002E; / &#x00021;) that is left escaped here
corpora = [
    'The quick brown fox&#x0002E;',
    'jumped over the lazy dog&#x00021;'
]
# instantiate the vectorizer object with its default settings
cvec = CountVectorizer()
# learn the vocabulary and convert the documents into a sparse
# document-term matrix
wm = cvec.fit_transform(corpora)
# retrieve the terms found in the corpora
# NOTE(review): newer scikit-learn releases replace get_feature_names()
# with get_feature_names_out() - confirm which version this notebook targets
tokens = cvec.get_feature_names()
# build a dataframe from the dense matrix and display it; the escaped
# references show up as the terms x0002e and x00021 in the output
pd.DataFrame(data=wm.toarray(), index=['Doc1', 'Doc2'], columns=tokens)

Unnamed: 0,brown,dog,fox,jumped,lazy,over,quick,the,x00021,x0002e
Doc1,1,0,1,0,0,0,1,1,0,1
Doc2,0,1,0,1,1,1,0,1,1,0


In [62]:
# necessary imports
import re
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix

def tokenize(corpus):
    """Split a document into word tokens.

    A token is a run of two or more word characters delimited by word
    boundaries, so single-character words are skipped. Case is preserved.
    Returns the tokens as a list of strings in order of appearance.
    """
    word_re = re.compile(r'\b\w\w+\b')
    return word_re.findall(corpus)

def set_weights(tokens):
    """Count how many times each token occurs.

    Returns a ``defaultdict(int)`` mapping every distinct token in
    ``tokens`` to its number of occurrences, in first-seen order.
    """
    weights = defaultdict(int)
    for tok in tokens:
        # a token not seen yet starts from the default of zero
        weights[tok] = weights[tok] + 1
    return weights

def simple_vectorizer(corpora):
    """Build a bag-of-words document-term matrix from raw documents.

    Parameters
    ----------
    corpora : iterable of str
        The documents to vectorize, one string per document.

    Returns
    -------
    tuple
        ``(feat_names, matrix)`` where ``feat_names`` is the sorted list of
        distinct tokens and ``matrix`` is a ``csr_matrix`` with one row per
        document and one column per feature name, holding token counts.
    """
    # tokenize every document and count its tokens
    doc_counts = [set_weights(tokenize(doc)) for doc in corpora]

    # Collect the vocabulary in sorted order: a bare set of strings has an
    # arbitrary order that can change between interpreter runs (hash
    # randomization), which made the column order irreproducible.
    unique_feat_names = sorted(set().union(*doc_counts))

    # one dense row per document; tokens absent from a document count as zero
    matrix_seed = [
        [doc_count.get(feat_name, 0) for feat_name in unique_feat_names]
        for doc_count in doc_counts
    ]

    # build the sparse matrix exactly once
    return unique_feat_names, csr_matrix(matrix_seed)

# vectorize the documents with the hand-rolled vectorizer; this rebinds
# the feat_names and wm globals
feat_names, wm = simple_vectorizer(corpora)
# display the counts as a dataframe, one row per document
pd.DataFrame(data=wm.toarray(), index=['Doc1', 'Doc2'], columns=feat_names)

Unnamed: 0,bar,foo,brow,sentence,quick,fox,The
Doc1,0,0,1,0,1,1,1
Doc2,1,1,0,1,0,0,1


In [59]:
# vectorize the documents with simple_vectorizer; no function named
# `assemble` is defined in this notebook, so the former call could not run
# on a fresh kernel
feat_names, wm = simple_vectorizer(corpora)

In [60]:
# display the document-term matrix as a dataframe, one row per document
pd.DataFrame(data=wm.toarray(), index=['Doc1', 'Doc2'], columns=feat_names)

Unnamed: 0,bar,foo,brow,sentence,quick,fox,The
Doc1,0,0,1,0,1,1,1
Doc2,1,1,0,1,0,0,1


In [128]:
import spacy
from html import unescape

# create a blank English spaCy pipeline used to tokenize and lemmatize;
# the submodule is imported explicitly instead of calling spacy.load('en'),
# whose loaded model was never used
import spacy.lang.en
lemmatizer = spacy.lang.en.English()

def my_preprocessor(doc):
    """Decode the HTML character references in ``doc`` and lower-case it."""
    decoded = unescape(doc)
    return decoded.lower()

def my_tokenizer(doc):
    """Run ``doc`` through the module-level ``lemmatizer`` and return the
    lemma of every token it yields, as a list."""
    lemmas = []
    for token in lemmatizer(doc):
        lemmas.append(token.lemma_)
    return lemmas

# same two documents as before, still carrying the escaped HTML references
corpora = [
    'The quick brown fox&#x0002E;',
    'jumped over the lazy dog&#x00021;'
]
# plug the custom preprocessor and tokenizer into CountVectorizer and ask
# for unigrams and bigrams with English stop words removed
custom_vec = CountVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer,
                             ngram_range=(1,2), stop_words='english')
# fit on the documents and get the document-term matrix
cwm = custom_vec.fit_transform(corpora)
# NOTE(review): newer scikit-learn releases replace get_feature_names()
# with get_feature_names_out() - confirm which version this notebook targets
tokens = custom_vec.get_feature_names()
# display the matrix as a dataframe
wm2df(cwm, tokens)

Unnamed: 0,!,.,brown,brown fox,dog,dog !,fox,fox .,jump,jump lazy,lazy,lazy dog,quick,quick brown
Doc0,0,1,1,1,0,0,1,1,0,0,0,0,1,1
Doc1,1,0,0,0,1,1,0,0,1,1,1,1,0,0


In [98]:
# re-fit the vectorizer on the documents; the bare expression displays the
# summary of the resulting sparse matrix
custom_vec.fit_transform(corpora)

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [93]:
# two sample documents, each ending with an escaped HTML character reference
corpora = ['The quick brown fox&#x0002E;','jumped over the lazy dog&#x00021;']

In [135]:
# create a custom analyzer class that can be called just like
# a function due to the __call__ special method
class MyAnalyzer(object):
    """Callable analyzer that unescapes, lower-cases, tokenizes and
    lemmatizes a document with a blank English spaCy pipeline."""

    def __init__(self):
        # Import the language class explicitly rather than calling
        # spacy.load('en'): the model it returned was never used, only the
        # blank English pipeline built here.
        from spacy.lang.en import English
        self.lemmatizer_ = English()

    def __call__(self, doc):
        """Return the list of lemmas for the tokens of ``doc``.

        HTML character references are decoded and the text is lower-cased
        before it is handed to the spaCy pipeline.
        """
        doc_clean = unescape(doc).lower()
        tokens = self.lemmatizer_(doc_clean)
        return [token.lemma_ for token in tokens]
# instantiate the callable analyzer and hand it to CountVectorizer
analyzer = MyAnalyzer()
# NOTE: the output of this cell contains no bigrams and still lists 'the'
# and 'over', i.e. ngram_range and stop_words have no visible effect when a
# callable analyzer is supplied; presumably tokenizer is bypassed as well -
# confirm against the CountVectorizer documentation
custom_vec = CountVectorizer(analyzer=analyzer,
                             tokenizer=my_tokenizer,
                             ngram_range=(1,2),
                             stop_words='english')
# fit on the documents and display the document-term matrix as a dataframe
cwm = custom_vec.fit_transform(corpora)
wm2df(cwm, custom_vec.get_feature_names())

Unnamed: 0,!,.,brown,dog,fox,jump,lazy,over,quick,the
Doc0,0,1,1,0,1,0,0,0,1,1
Doc1,1,0,0,1,0,1,1,1,0,1


In [131]:
# create an analyzer instance to try out on its own
analyzer = MyAnalyzer()

In [134]:
# run the analyzer on the first document and display the resulting tokens
analyzer(corpora[0])

['the', 'quick', 'brown', 'fox', '.']

In [132]:
# display the raw first document
corpora[0]

'The quick brown fox&#x0002E;'

In [145]:
# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer unescapes, lower-cases and lemmatizes
    each document with spaCy before building n-grams."""

    def build_analyzer(self):
        """Return a callable that turns one raw document into its n-gram features."""
        from spacy.lang.en import English

        # load stop words using CountVectorizer's built in method
        stop_words = self.get_stop_words()
        # build the blank English pipeline once per analyzer instead of
        # once per document
        lemmatizer = English()

        def analyser(doc):
            doc_clean = unescape(doc).lower()
            lemmatized_tokens = [token.lemma_ for token in lemmatizer(doc_clean)]
            # reuse CountVectorizer's own stop-word filtering and n-gram expansion
            return self._word_ngrams(lemmatized_tokens, stop_words)

        return analyser
    
    
# instantiate the custom vectorizer asking for unigrams and bigrams with
# English stop words removed
custom_vec = CustomVectorizer(ngram_range=(1,2),
                              stop_words='english')
# fit on the documents and display the document-term matrix as a dataframe
cwm = custom_vec.fit_transform(corpora)
wm2df(cwm, custom_vec.get_feature_names())

Unnamed: 0,!,.,brown,brown fox,dog,dog !,fox,fox .,jump,jump lazy,lazy,lazy dog,quick,quick brown
Doc0,0,1,1,1,0,0,1,1,0,0,0,0,1,1
Doc1,1,0,0,0,1,1,0,0,1,1,1,1,0,0
