<a href="https://colab.research.google.com/github/MRM07/EverythingDataScience/blob/master/Text_Pipeline_Support_for_TPOT_Framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
# NOTE(review): this downloads the 'names' corpus, while the import below is
# `stopwords`; neither is referenced in the visible cells — confirm which
# corpus (if any) is actually required.
nltk.download('names')
from nltk.corpus import stopwords

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


In [2]:
import multiprocessing as mp
import string

import en_core_web_sm
import numpy as np
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from normalise import normalise
from sklearn.base import BaseEstimator, TransformerMixin

# Load the small English spaCy model once at module level;
# TextPreprocessor._preprocess_text reads this global `nlp`.
nlp = en_core_web_sm.load()



# Pipelining our Pre-Processing / Feature Engineering for Datasets that Contain Text Data

*   We create selector transformers that simply return the values of the column whose key we pass. There is one selector for text columns and one for numeric columns.


# Reusable pipeline

* We are going to create a reusable text pre-processing pipeline step that we can use in any of our NLP projects. Like the selectors above, it is implemented as a transformer.













In [0]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Column picker for text features.

    Given a data frame, hands back the single column stored under ``key``
    (``X[key]``) so that text-specific steps can be chained after it.
    """
    def __init__(self, key):
        # Name of the column to extract in transform().
        self.key = key

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X):
        selected_column = X[self.key]
        return selected_column
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Column picker for numeric features.

    Given a data frame, hands back the column stored under ``key`` using a
    one-element list index (``X[[key]]``), i.e. the result keeps two
    dimensions instead of collapsing to a single column object.
    """
    def __init__(self, key):
        # Name of the column to extract in transform().
        self.key = key

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X):
        wanted_columns = [self.key]
        return X[wanted_columns]

In [0]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Text preprocessing transformer includes steps:
        1. Text normalization
        2. Punctuation removal
        3. Stop words removal
        4. Lemmatization

    variety - format of date (AmE - american type, BrE - british format)
    user_abbrevs - dict of user abbreviations mappings (from normalise package);
                   None (the default) means no user abbreviations
    n_jobs - parallel jobs to run: 0 or 1 processes everything in the calling
             process, -1 (or lower) uses one worker per CPU core, any other
             positive value is capped at the number of CPU cores

    Relies on the module-level spaCy pipeline ``nlp``.
    """

    # Characters that count as punctuation in _remove_punct.
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self,
                 variety="BrE",
                 user_abbrevs=None,
                 n_jobs=1):
        # `None` rather than `{}` as the default: a mutable default would be a
        # single dict object shared by every instance created without the argument.
        self.variety = variety
        self.user_abbrevs = user_abbrevs
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """No fitting is required; returns the transformer itself."""
        return self

    def transform(self, X, *_):
        """
        Preprocess every text in ``X`` (a pandas Series of strings).

        Returns a new Series with the same index whose values are the cleaned,
        lemmatized texts joined by single spaces. ``X`` is not modified.
        """
        # Never start more workers than there are rows to process.
        workers = min(self._worker_count(), len(X))
        if workers <= 1:
            # Single partition: a process pool would only add overhead.
            return X.apply(self._preprocess_text)

        # Split by row position so the Series itself is sliced with .iloc
        # instead of being handed to numpy directly.
        row_groups = np.array_split(np.arange(len(X)), workers)
        parts = [X.iloc[rows] for rows in row_groups]

        # The context manager tears the pool down even if a worker raises.
        with mp.Pool(processes=workers) as pool:
            processed = pool.map(self._preprocess_part, parts)

        return pd.concat(processed)

    def _worker_count(self):
        """Translate ``n_jobs`` into the number of worker processes to use."""
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            return cores
        if self.n_jobs <= 0:
            return 1
        return min(self.n_jobs, cores)

    def _preprocess_part(self, part):
        """Preprocess one partition of the input (runs inside a worker process)."""
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        """Run the full normalise -> tokenize -> filter -> lemmatize chain on one text."""
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        """Return ``text`` normalised by the normalise package, or unchanged on failure."""
        abbrevs = {} if self.user_abbrevs is None else self.user_abbrevs
        # some issues in normalise package: fall back to the raw text on errors.
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop the run instead of being swallowed here.
        try:
            return ' '.join(normalise(text, variety=self.variety, user_abbrevs=abbrevs, verbose=False))
        except Exception:
            return text

    def _remove_punct(self, doc):
        """Drop tokens made up solely of punctuation characters (e.g. ',', '...', '--')."""
        # A per-character test; a substring test against string.punctuation
        # would let repeated marks such as '...' or '!!' through.
        punct = self._PUNCT_CHARS
        return [t for t in doc if not all(ch in punct for ch in t.text)]

    def _remove_stop_words(self, doc):
        """Drop tokens that spaCy flags as stop words."""
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        """Join the lemmas of the remaining tokens with single spaces."""
        return ' '.join([t.lemma_ for t in doc])

In [0]:
import pandas as pd 

In [0]:
# Read the training data; the path is relative to the notebook's working directory.
training_set = pd.read_csv("Train_email.csv")

In [0]:
# Encode the target as integers (No -> 0, Yes -> 1).
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe the
# labels. The guard makes the cell a no-op once the column holds only 0/1.
intent_codes = {"No": 0, "Yes": 1}
if not training_set["Intent"].isin(list(intent_codes.values())).all():
    training_set["Intent"] = training_set["Intent"].map(intent_codes)

In [0]:
import re

# Our Testing Dataset

We are trying to create a **TEXT PIPELINE SUPPORT FOR TPOT FRAMEWORK** 

TPOT doesn't support text pre-processing on its own, hence we added our own pipeline and integrated it with TPOT Auto-ML.


In [9]:
#creating a function to encapsulate our entire preprocessing and feature engineering steps
def processing(df):
    """
    Return a copy of ``df`` with two count features derived from ``df['text']``.

    Added columns:
        specialchars - number of characters in the text that are not regex
                       word characters (``\\w``); this includes spaces as
                       well as punctuation
        words        - number of pieces obtained by splitting the text on a
                       single space (consecutive spaces yield empty pieces,
                       which are counted)

    The input frame is left untouched, so re-running the calling cell always
    starts from the same data.
    """
    text = df['text']
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    specialchars = text.apply(lambda x: len(x) - len(re.findall(r'\w', x)))
    words = text.apply(lambda x: len(x.split(' ')))
    return df.assign(specialchars=specialchars, words=words)
    
# Build the feature frame from the training data and preview its first rows.
df = processing(training_set)
df.head()

Unnamed: 0,Intent,text,specialchars,words
0,0,>>> [1]Contact Me Now to Make $100 Today!$LINK,15,8
1,0,Act now to keep your life on the go!,9,9
2,0,Choose between $500 and $10000 dollars with up...,15,13
3,0,Click above to earn today.,5,5
4,0,Click here to receive your first $10 today:,9,8


In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Text branch: select the raw 'text' column, clean it with TextPreprocessor
# (n_jobs=-1), then vectorise with TF-IDF.
text = Pipeline(steps=[
    ('selector', TextSelector(key='text')),
    ('pre-processing', TextPreprocessor(n_jobs=-1)),
    ('tfidf', TfidfVectorizer(stop_words='english')),
])

In [0]:
from sklearn.preprocessing import StandardScaler


# Numeric branches: each selects one engineered count column and standardises it.
words = Pipeline(steps=[
    ('selector', NumberSelector(key='words')),
    ('standard', StandardScaler()),
])

specialchars = Pipeline(steps=[
    ('selector', NumberSelector(key='specialchars')),
    ('standard', StandardScaler()),
])

In [0]:
from sklearn.pipeline import FeatureUnion
# Combine the text branch and the two numeric branches into one feature set.
feats = FeatureUnion(transformer_list=[
    ('text', text),
    ('words', words),
    ('specialchars', specialchars),
])

# The same union wrapped as a single-step pipeline.
feature_processing = Pipeline(steps=[('feats', feats)])

In [0]:
# Configuration handed to TPOTClassifier as `config_dict`: one entry per
# classifier import path, each mapped to an empty settings dict.
tpot_config = {
    estimator_path: {}
    for estimator_path in (
        'sklearn.svm.LinearSVC',
        'sklearn.naive_bayes.GaussianNB',
        'sklearn.ensemble.RandomForestClassifier',
        'sklearn.ensemble.GradientBoostingClassifier',
    )
}

In [0]:
from tpot import TPOTClassifier


In [0]:
# End-to-end model: the feature union followed by a TPOT search over the
# classifiers listed in `tpot_config`.
# random_state seeds TPOT's stochastic search so repeated runs start from the
# same point (42 is an arbitrary fixed seed).
# NOTE(review): max_eval_time_mins=0.04 gives each candidate pipeline about
# 2.4 seconds — confirm this is intended; the captured log shows the CV score
# not changing across generations.
clf = Pipeline([
    ('features', feats),
    ('classifier', TPOTClassifier(generations=20, population_size=20,
                                  max_time_mins=4, max_eval_time_mins=0.04,
                                  config_dict=tpot_config, random_state=42,
                                  verbosity=2)),
])

In [0]:
from sklearn.model_selection import train_test_split
# Separate the features from the target and hold out 20% of the rows for evaluation.
# random_state fixes the shuffle so the held-out score is reproducible across
# runs (42 is an arbitrary fixed seed).
X = df.drop(columns=['Intent'])
y = df["Intent"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [17]:
# Fit the full pipeline on the training split, then report its score on the held-out split.
clf.fit(X_train, y_train)
holdout_score = clf.score(X_test, y_test)
print(holdout_score)



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=20, style=ProgressStyle(descripti…

Generation 1 - Current best internal CV score: 0.6693867084806352
Generation 2 - Current best internal CV score: 0.6693867084806352
Generation 3 - Current best internal CV score: 0.6693867084806352
Generation 4 - Current best internal CV score: 0.6693867084806352
Generation 5 - Current best internal CV score: 0.6693867084806352
Generation 6 - Current best internal CV score: 0.6693867084806352
Generation 7 - Current best internal CV score: 0.6693867084806352
Generation 8 - Current best internal CV score: 0.6693867084806352
Generation 9 - Current best internal CV score: 0.6693867084806352
Generation 10 - Current best internal CV score: 0.6693867084806352
Generation 11 - Current best internal CV score: 0.6693867084806352
Generation 12 - Current best internal CV score: 0.6693867084806352
Generation 13 - Current best internal CV score: 0.6693867084806352
Generation 14 - Current best internal CV score: 0.6693867084806352
Generation 15 - Current best internal CV score: 0.6693867084806352
Gene

Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-4-b3859dce2d5b>", line 45, in _preprocess_part
    return part.apply(self._preprocess_text)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.6/dist-packages/pandas/core/series.py", line 3591, in apply
    mapped = lib.map_infer(valu

KeyboardInterrupt: ignored