In [5]:
import pandas as pd
import numpy as np
from scipy import sparse

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk import word_tokenize

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
# default params
scoring='roc_auc'
cv=3
n_jobs=-1
max_features = 2500

In [6]:
class Stemmer(BaseEstimator):
    def __init__(self):
        self.l = PorterStemmer()
        
    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        x = map(lambda text:  ' '.join([self.l.stem(word.lower()) for word in text.split()]), x)
        x = np.array(list(x))
        return x

In [12]:
# Pipeline: Porter stemming -> word-bigram TF-IDF -> logistic regression.
sm = Stemmer()

# The Stemmer step runs BEFORE the vectorizer, so by the time stop words are
# filtered many of them have already been stemmed ("this" -> "thi",
# "was" -> "wa", "because" -> "becaus") and no longer match the built-in
# 'english' list. The stemmed forms are therefore added to the list.
# The unstemmed forms are kept too: Stemmer splits on whitespace only, so a
# word with punctuation attached (e.g. "this,") can pass through unstemmed and
# is only separated from its punctuation later by word_tokenize.
# A sorted list is used because stop_words must be a list, not a set.
stemmed_stop_words = sorted(
    ENGLISH_STOP_WORDS | {sm.l.stem(word) for word in ENGLISH_STOP_WORDS}
)

tfidf = TfidfVectorizer(
    max_features=max_features,
    tokenizer=word_tokenize,
    token_pattern=None,  # unused when `tokenizer` is given; None avoids the warning
    ngram_range=(2, 2),  # bigrams only
    analyzer='word',
    stop_words=stemmed_stop_words,
)
lr = LogisticRegression()
p = Pipeline([
    ('sm', sm),
    ('tfidf', tfidf),
    ('lr', lr)
])

# cross_val_score(estimator=p, X=x, y=y, scoring=scoring, cv=cv, n_jobs=n_jobs)

In [13]:
# Pipeline: Porter stemming -> (word-bigram TF-IDF + char TF-IDF) -> logistic regression.
sm = Stemmer()

# The Stemmer step runs BEFORE the vectorizers, so by the time stop words are
# filtered many of them have already been stemmed ("this" -> "thi",
# "was" -> "wa", "because" -> "becaus") and no longer match the built-in
# 'english' list. The stemmed forms are therefore added to the list.
# The unstemmed forms are kept too: Stemmer splits on whitespace only, so a
# word with punctuation attached (e.g. "this,") can pass through unstemmed and
# is only separated from its punctuation later by word_tokenize.
# A sorted list is used because stop_words must be a list, not a set.
stemmed_stop_words = sorted(
    ENGLISH_STOP_WORDS | {sm.l.stem(word) for word in ENGLISH_STOP_WORDS}
)

# Word-level view: bigrams over NLTK tokens with stop words removed.
tfidf_w = TfidfVectorizer(
    max_features=max_features,
    tokenizer=word_tokenize,
    token_pattern=None,  # unused when `tokenizer` is given; None avoids the warning
    ngram_range=(2, 2),
    analyzer='word',
    stop_words=stemmed_stop_words,
)
# Character-level view: single-character TF-IDF (default ngram_range).
tfidf_c = TfidfVectorizer(max_features=max_features, analyzer='char')
lr = LogisticRegression()
p = Pipeline([
    ('sm', sm),
    ('wc_tfidfs',
         # Both vectorizers see the same stemmed text; their sparse outputs
         # are concatenated column-wise.
         FeatureUnion([
            ('tfidf_w', tfidf_w),
            ('tfidf_c', tfidf_c),
         ])
    ),
    ('lr', lr)
])

# cross_val_score(estimator=p, X=x, y=y, scoring=scoring, cv=cv, n_jobs=n_jobs)