In [1]:
from tpot import TPOTClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scipy import stats as st
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the training data. The test-set load below is commented out;
# no cell shown in this notebook references `testset`.
dataset = pd.read_csv('train_data.csv')
#testset = pd.read_csv('test_data.csv')

Run TPOT on the data after passing it through the transformation pipeline, to get a general idea of the best estimator.

### Transforming data

In [3]:
class TransformingColumns(BaseEstimator, TransformerMixin):
    """Turn the raw coupon survey frame into a mostly numeric feature frame.

    Steps performed by ``transform``:

    * map ordinal / binary text answers (gender, expiration, age, time,
      education, visit frequencies) to numbers via the class-level mappers;
    * derive ``income_ub`` / ``income_lb`` from the ``income`` bracket;
    * split ``Date`` on ``-`` into ``day``, ``month`` and ``year``;
    * median-impute the five visit-frequency columns;
    * one-hot encode the nominal columns and drop the bookkeeping columns.

    The imputation medians are learned in ``fit`` and stored on
    ``imputer_``, so transforming new data reuses the statistics of the
    data the transformer was fitted on. The input frame is never modified.

    NOTE(review): ``pd.get_dummies`` derives the dummy columns from whatever
    frame is being transformed, so a frame lacking some category yields a
    different column set than the training frame.
    """

    # Value -> number encodings for the text answers.
    GENDER_MAPPER = {'Female': 1, 'Male': 0}
    EXPIRY_MAPPER = {'1d': 24, '2h': 2}
    AGE_MAPPER = {'50plus': 50,'below21': 18}
    TIME_MAPPER = {'6PM': 18, '7AM': 7, '10AM': 10, '2PM': 14, '10PM': 22}
    EDUCATION_MAPPER = {'Some High School': 1, 'High School Graduate': 2, 'Some college - no degree': 3, 'Associates degree': 4, 'Bachelors degree': 5, 'Graduate degree (Masters or Doctorate)': 6}
    VISIT_MAPPER = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}
    INCOME_UB_MAPPER = {'Less than $12500': 12499, '$12500 - $24999': 24999, '$25000 - $37499': 37499, '$37500 - $49999': 49999, '$50000 - $62499': 62499, '$62500 - $74999': 74999, '$75000 - $87499': 87499, '$87500 - $99999': 99999, '$100000 or More': 200000}
    INCOME_LB_MAPPER = {'Less than $12500': 0, '$12500 - $24999': 12500, '$25000 - $37499': 25000, '$37500 - $49999': 37500, '$50000 - $62499': 50000, '$62500 - $74999': 62500, '$75000 - $87499': 75000, '$87500 - $99999': 87500, '$100000 or More': 100000}

    # Column groups handled together.
    VISIT_COLUMNS = ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']
    ONE_HOT_COLUMNS = ['destination', 'passanger', 'coupon', 'maritalStatus', 'occupation']
    DROPPED_COLUMNS = ['Unnamed: 0', 'Label', 'Date', 'income', 'car']

    def __init__(self): # no *args or **kargs
        print("Transformer initialized")

    def fit(self, X, y=None):
        """Learn the per-column medians used to impute the visit columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw frame containing at least the visit-frequency columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Encode into a separate frame so fitting leaves X untouched.
        visits = pd.DataFrame(
            {column: X[column].replace(self.VISIT_MAPPER) for column in self.VISIT_COLUMNS}
        )
        self.imputer_ = SimpleImputer(strategy="median").fit(visits)
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; ``X`` itself is left unchanged.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw frame with the columns referenced by the mappers, plus
            ``Date``, ``income`` and the columns listed in
            ``ONE_HOT_COLUMNS`` / ``DROPPED_COLUMNS``.

        Returns
        -------
        pandas.DataFrame
            Encoded frame with the dummy columns appended and the
            ``DROPPED_COLUMNS`` removed.
        """
        # Work on a copy: the column assignments below would otherwise
        # write into the caller's DataFrame.
        X = X.copy()

        X['gender'] = X['gender'].replace(self.GENDER_MAPPER)
        X['expiration'] = X['expiration'].replace(self.EXPIRY_MAPPER)
        X['age'] = X['age'].replace(self.AGE_MAPPER)
        X['time'] = X['time'].replace(self.TIME_MAPPER)
        X['education'] = X['education'].replace(self.EDUCATION_MAPPER)
        for column in self.VISIT_COLUMNS:
            X[column] = X[column].replace(self.VISIT_MAPPER)
        X['income_ub'] = X['income'].replace(self.INCOME_UB_MAPPER)
        X['income_lb'] = X['income'].replace(self.INCOME_LB_MAPPER)

        #Splitting date column
        X[["day", "month", "year"]] = X["Date"].str.split("-", expand = True)

        #Imputing with median values learned in fit()
        imputer = getattr(self, "imputer_", None)
        if imputer is None:
            # Legacy fallback: transform() called without fit() keeps the
            # previous behaviour of using the medians of the frame at hand.
            imputer = SimpleImputer(strategy="median").fit(X[self.VISIT_COLUMNS])
        X[self.VISIT_COLUMNS] = imputer.transform(X[self.VISIT_COLUMNS])

        X = pd.get_dummies(X, columns = self.ONE_HOT_COLUMNS)
        X = X.drop(columns=self.DROPPED_COLUMNS)

        return X

In [4]:
# Single-step pipeline wrapping the custom column transformer defined above
prod_pipeline = Pipeline(steps=[('transformation', TransformingColumns())])

Transformer initialized


In [5]:
# Fit the preprocessing pipeline on the raw training frame and transform it in one call
coupon = prod_pipeline.fit_transform(dataset)

# Display the transformed data
coupon

Unnamed: 0,time,expiration,gender,age,has_children,education,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,...,occupation_Management,occupation_Office & Administrative Support,occupation_Personal Care & Service,occupation_Production Occupations,occupation_Protective Service,occupation_Retired,occupation_Sales & Related,occupation_Student,occupation_Transportation & Material Moving,occupation_Unemployed
0,14,24,1,21,1.0,3,0.0,0.0,2.0,3.0,...,0,0,0,0,0,0,0,0,0,1
1,14,24,0,21,0.0,5,0.0,1.0,3.0,3.0,...,0,0,0,0,0,0,0,0,0,0
2,14,24,1,26,1.0,3,0.0,1.0,4.0,3.0,...,0,0,0,0,0,0,0,0,0,1
3,18,2,0,50,1.0,6,2.0,2.0,3.0,3.0,...,0,0,0,0,0,0,1,0,0,0
4,7,24,0,50,0.0,3,0.0,0.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,7,24,1,31,0.0,6,1.0,2.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
12680,22,2,1,31,1.0,5,0.0,1.0,4.0,3.0,...,0,0,0,0,0,0,0,1,0,0
12681,10,2,1,36,0.0,2,1.0,1.0,2.0,1.0,...,0,0,0,0,0,0,0,0,0,0
12682,22,2,0,21,0.0,3,3.0,2.0,3.0,2.0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Save the transformed frame to CSV, leaving out the DataFrame index
coupon.to_csv('new_data.csv', index=False)

In [7]:
# Separate the target column 'Y' from the remaining feature columns
y = coupon['Y']
X = coupon.drop(columns='Y')

In [9]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10147, 67), (2537, 67), (10147,), (2537,))

In [10]:
# Let TPOT search for a pipeline for at most 30 minutes, optimising accuracy.
# random_state seeds the stochastic search; because the run is cut off by a
# wall-clock budget, the result may still vary between machines.
tpot = TPOTClassifier(verbosity=2, max_time_mins=30, scoring='accuracy', random_state=42)
tpot.fit(X_train, y_train)
# Accuracy of the best pipeline found, measured on the held-out split
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


31.88 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: RandomForestClassifier(Normalizer(input_matrix, norm=max), bootstrap=True, criterion=gini, max_features=0.5, min_samples_leaf=1, min_samples_split=9, n_estimators=100)
0.7591643673630272


In [11]:
# Best pipeline TPOT settled on within the 30-minute search budget
tpot.fitted_pipeline_

Pipeline(steps=[('normalizer', Normalizer(norm='max')),
                ('randomforestclassifier',
                 RandomForestClassifier(max_features=0.5,
                                        min_samples_split=9))])

In [12]:
# Score of the fitted TPOT model on the held-out test split
print(tpot.score(X_test, y_test))

0.7591643673630272


In [13]:
# Export the pipeline chosen by TPOT to a Python script file
tpot.export('tpot_coupon_dataset_all_columns.py')