In [1]:
#Write my own transformers
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline, FeatureUnion, make_union

from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
# Read the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly.
df = pd.read_csv(
    "./data/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["target", "text"],
)

# Inputs are the raw message text; the value to predict is the `target` column.
X, y = df["text"], df["target"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [41]:
# Look at one raw training message by its original row label.
# `X_train` is a Series (only the "text" column was kept), so a single label is
# enough; `.loc` replaces the removed `.ix` indexer.
X_train.loc[1780]

"Loan for any purpose \xc2\xa3500 - \xc2\xa375,000. Homeowners + Tenants welcome. Have you been previously refused? We can still help. Call Free 0800 1956669 or text back 'help'"

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import re

class CapitalDocTransformer(BaseEstimator, TransformerMixin):
    """
    Flag documents that are written entirely in capital letters.

    Each input document is mapped to 1 when it contains at least one letter
    and none of its letters are lowercase, otherwise to 0. Documents without
    any letters (empty strings, digits or punctuation only) are mapped to 0.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Compute the all-capitals flag for every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            The documents to inspect, one string per document.
        y : ignored
            Accepted only so the signature matches ``fit``.

        Returns
        -------
        numpy.ndarray of int, shape (n_documents, 1)
            A single 0/1 column, one row per document.
        """
        # str.isupper() is True only when the string has at least one cased
        # character and every cased character is uppercase. A plain
        # `line == line.upper()` comparison would also flag "", "12345" or ":)"
        # as all-capital, which is not what the feature is meant to capture.
        flags = np.array([line.isupper() for line in X], dtype=int)
        # Reshape to a column so the result can be stacked next to other
        # feature blocks.
        return flags.reshape(-1, 1)

In [6]:
# Sanity check: run the transformer over every message and display the
# resulting single-column array of 0/1 flags.
cp_transformer = CapitalDocTransformer()
all_caps_flags = cp_transformer.fit_transform(X[:])
all_caps_flags

array([[0],
       [0],
       [0],
       ..., 
       [0],
       [0],
       [0]])

In [7]:
# Build the model: the all-capitals flag and the token counts are computed
# side by side, concatenated, and fed to a logistic regression.
text_features = make_union(CapitalDocTransformer(), CountVectorizer())
classifier = LogisticRegression()

log_reg_model = Pipeline(steps=[
    ("features", text_features),
    ("model", classifier),
])

In [8]:
# Train on the training split ...
log_reg_model.fit(X_train, y_train)

# ... then display the pipeline's score on the held-out split.
test_accuracy = log_reg_model.score(X_test, y_test)
test_accuracy

0.98504784688995217

In [16]:
import os

# `sklearn.externals.joblib` has been removed from scikit-learn; joblib is now
# imported as its own package. The fallback keeps the cell working on old
# installs that only ship the bundled copy.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
if not os.path.isdir('models'):
    os.makedirs('models')

# Persist the fitted pipeline to disk.
joblib.dump(log_reg_model, 'models/spam_ham.pkl')

# Reload it to confirm the round trip works.
# NOTE: joblib files are pickles; loading one executes code embedded in the
# file, so only load model files that come from a trusted source.
new_model = joblib.load('models/spam_ham.pkl')

In [43]:
# Smoke-test the reloaded pipeline on a single all-caps message.
sample_messages = ['SPAM']
new_model.predict(sample_messages)

array(['ham'], dtype=object)