In [1]:
# !pip install TPOT & !pip install tpot
from numpy import loadtxt 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import time

# conda install -c conda-forge tpot
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

**Short and very simple example** Find the best model & parameters to get the best score.

***Step 1. Loading & viewing dataset***

In [2]:
iris = load_iris()
iris.data[0:5], iris.target[:5]

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]), array([0, 0, 0, 0, 0]))

***Step 2. Train-test split***

In [3]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.25, stratify = iris.target)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((112, 4), (38, 4), (112,), (38,))

In [4]:
tpot_clf = TPOTClassifier(generations=2, population_size=4, offspring_size=3, scoring='accuracy', cv=2,
                          verbosity=2, random_state=42)

# Fit the classifier to the training data
tpot_clf.fit(X_train, y_train)

# Score on the test set
print(tpot_clf.score(X_test, y_test))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10.0, style=ProgressStyle(des…

Generation 1 - Current best internal CV score: 0.9642857142857143
Generation 2 - Current best internal CV score: 0.9642857142857143

Best pipeline: RandomForestClassifier(Normalizer(input_matrix, norm=max), bootstrap=True, criterion=entropy, max_features=0.2, min_samples_leaf=8, min_samples_split=4, n_estimators=100)
0.9736842105263158


**2. Now, comeback with NLP Text Classification problem**

In [5]:
# load data
df = pd.read_csv("D://Nhan project 2019-2020//NLP//Project//train.csv", usecols = ["text", "target"])
df.head(5)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
print("col_names : \t" + df.columns)
print('\n')
print("Data-dimensions: \t" + str(df.shape))
print('\n')
print("Count the not-null values of each features: \n" + str(df.notnull().sum()))

Index(['col_names : \ttext', 'col_names : \ttarget'], dtype='object')


Data-dimensions: 	(7613, 2)


Count the not-null values of each features: 
text      7613
target    7613
dtype: int64


In [7]:
df.drop_duplicates(inplace = True)
print("The new dimension after checking duplicate & removing is:\t" + str(df.shape))

The new dimension after checking duplicate & removing is:	(7521, 2)


In [8]:
df['Text_length'] = df['text'].str.len()
df['Numb_words'] = df['text'].str.split().map(lambda x: len(x))
df.head()

Unnamed: 0,text,target,Text_length,Numb_words
0,Our Deeds are the Reason of this #earthquake M...,1,69,13
1,Forest fire near La Ronge Sask. Canada,1,38,7
2,All residents asked to 'shelter in place' are ...,1,133,22
3,"13,000 people receive #wildfires evacuation or...",1,65,8
4,Just got sent this photo from Ruby #Alaska as ...,1,88,16


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.field]]

In [10]:
import re
from spellchecker import SpellChecker    

def process_text(str_input):
    ## 1. Remove url_link
    remove_url = re.compile(r'https?://\S+|www\.\S+').sub(r'', str_input)
    
    ## 2. Remove html_link
    remove_html = re.compile(r'<.*?>').sub(r'', remove_url)
    
    ## 3. Remove Emojis
    remove_emo = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE).sub(r'', remove_html)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", remove_emo).lower().split()    
        
    ## 4. spell_correction
    # spell = SpellChecker()
    # words = [spell.correction(word) for word in words]

    return words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
text_process = CountVectorizer(analyzer = process_text).fit_transform(df['text'])

In [12]:
y = df.target.to_numpy()
X = df[['text', 'Text_length']]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, stratify = y, random_state = 42)

In [13]:
X_train.head()

Unnamed: 0,text,Text_length
1617,SA MP. Steel and ferrochrome industry on verge...,133
1650,Petition | Heartless owner that whipped horse ...,128
6756,I'm a tornado looking for a soul to take,40
517,#WeLoveLA #NHLDucks Avalanche Defense: How The...,112
2968,@_jeesss_ @Ethereal_7 Hello 911 yeah we have s...,102


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = process_text)

tfidf_train = tfidf_vect.fit(X_train['text']).transform(X_train['text']) 
tfidf_test = tfidf_vect.fit(X_train['text']).transform(X_test['text'])

In [15]:
X_train_vect = pd.concat([
                            X_train[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_train.toarray())
                        ], axis = 1)
X_train_vect.head()

Unnamed: 0,Text_length,0,1,2,3,4,5,6,7,8,...,14520,14521,14522,14523,14524,14525,14526,14527,14528,14529
0,133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
X_test_vect = pd.concat([
                            X_test[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test.toarray())
                        ], axis = 1)

In [17]:
start = time.time()
tpot_clf = TPOTClassifier(generations=2, population_size=4, offspring_size=3, scoring='accuracy', cv=2,
                          verbosity=2, random_state=42)

# Fit the classifier to the training data
tpot_clf.fit(X_train_vect, y_train)

# Score on the test set
print(tpot_clf.score(X_test_vect, y_test))
print ('Fit&trainning time : ', time.time() - start)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10.0, style=ProgressStyle(des…

Generation 1 - Current best internal CV score: 0.7551418439716312
Generation 2 - Current best internal CV score: 0.7562056737588652

Best pipeline: MultinomialNB(SGDClassifier(input_matrix, alpha=0.001, eta0=0.1, fit_intercept=True, l1_ratio=0.5, learning_rate=constant, loss=log, penalty=elasticnet, power_t=0.5), alpha=0.01, fit_prior=True)
0.7889420520999468
Fit&trainning time :  534.011625289917


In [18]:
## Set test_size = 0.3
test_size = 0.3
start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify = y, random_state = 42)

tfidf_vect = TfidfVectorizer(analyzer = process_text)

tfidf_train = tfidf_vect.fit(X_train['text']).transform(X_train['text']) 
tfidf_test = tfidf_vect.fit(X_train['text']).transform(X_test['text'])

X_train_vect = pd.concat([
                            X_train[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_train.toarray())
                        ], axis = 1)

X_test_vect = pd.concat([
                            X_test[['Text_length']].reset_index(drop = True), 
                            pd.DataFrame(tfidf_test.toarray())
                        ], axis = 1)

tpot_clf = TPOTClassifier(generations=2, population_size=4, offspring_size=3 , cv=5,
                          verbosity=2, random_state=42)

# Fit the classifier to the training data
tpot_clf.fit(X_train_vect, y_train)

# Score on the test set
print(tpot_clf.score(X_test_vect, y_test))
print ('Fit&trainning time : ', time.time() - start)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=10.0, style=ProgressStyle(des…

Generation 1 - Current best internal CV score: 0.7690016574046992
Generation 2 - Current best internal CV score: 0.7690016574046992

Best pipeline: MultinomialNB(input_matrix, alpha=0.01, fit_prior=True)
0.7744793974302171
Fit&trainning time :  1089.5334739685059


In [19]:
## Set test_size = 0.3, population_size = 10 = generations, cv=5
start = time.time()
tpot_clf = TPOTClassifier(generations = 10, population_size= 10, verbosity=2, cv=5)

# Fit the classifier to the training data
tpot_clf.fit(X_train_vect, y_train)

# Score on the test set
print(tpot_clf.score(X_test_vect, y_test))
print ('Fit&trainning time : ', time.time() - start)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=110.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.7680487399752292
Generation 2 - Current best internal CV score: 0.7891324443288956
Generation 3 - Current best internal CV score: 0.7891324443288956
Generation 4 - Current best internal CV score: 0.8020533402662681
Generation 5 - Current best internal CV score: 0.8020533402662681
Generation 6 - Current best internal CV score: 0.8020533402662681
Generation 7 - Current best internal CV score: 0.8020533402662681
Generation 8 - Current best internal CV score: 0.8020533402662681
Generation 9 - Current best internal CV score: 0.8020533402662681
Generation 10 - Current best internal CV score: 0.8020533402662681

Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=True)
0.793531236154187
Fit&trainning time :  12551.517484664917
