In [1]:
#importing libraries
import pandas as pd
import numpy as np
from scipy import stats as st
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the in-vehicle coupon recommendation survey from the working directory
coupon = pd.read_csv(filepath_or_buffer='in-vehicle-coupon-recommendation.csv')

In [3]:
# Preview the first five rows of the raw data
coupon.head(n=5)

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [4]:
# Small sample (first 30 rows) used to exercise the pipeline.
# An explicit copy is taken so that downstream column assignments act on an
# independent frame instead of a slice of `coupon` (avoids
# SettingWithCopyWarning and any ambiguity about touching the raw data).
test = coupon.iloc[:30].copy()

In [5]:
class TransformingColumns(BaseEstimator, TransformerMixin):
    """Encode the raw coupon survey columns into model-ready features.

    Processing steps:

    1. Replace the labels of ``gender``, ``expiration``, ``age``, ``time``,
       ``education`` and the five visit-frequency columns with numeric codes.
    2. Derive ``income_ub`` / ``income_lb`` (upper / lower bound of the income
       bracket) from ``income``.
    3. Median-impute the visit-frequency columns.
    4. One-hot encode ``destination``, ``passanger``, ``weather``, ``coupon``,
       ``maritalStatus`` and ``occupation``.
    5. Drop ``car``, ``toCoupon_GEQ5min`` and ``income``.

    State learned in :meth:`fit` (and reused by :meth:`transform`):

    imputer_ : SimpleImputer
        Median imputer fitted on the encoded visit-frequency columns.
    columns_ : pandas.Index
        Output column layout produced by the fitting data.

    The input frame is never modified; all work happens on a copy.
    """

    _GENDER_MAPPER = {'Female': 1, 'Male': 0}
    _EXPIRY_MAPPER = {'1d': 24, '2h': 2}
    _AGE_MAPPER = {'50plus': 50, 'below21': 18}
    _TIME_MAPPER = {'6PM': 18, '7AM': 7, '10AM': 10, '2PM': 14, '10PM': 22}
    _EDUCATION_MAPPER = {'Some High School': 1, 'High School Graduate': 2, 'Some college - no degree': 3, 'Associates degree': 4, 'Bachelors degree': 5, 'Graduate degree (Masters or Doctorate)': 6}
    _VISIT_MAPPER = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}
    _INCOME_UB_MAPPER = {'Less than $12500': 12499, '$12500 - $24999': 24999, '$25000 - $37499': 37499, '$37500 - $49999': 49999, '$50000 - $62499': 62499, '$62500 - $74999': 74999, '$75000 - $87499': 87499, '$87500 - $99999': 99999, '$100000 or More': 200000}
    _INCOME_LB_MAPPER = {'Less than $12500': 0, '$12500 - $24999': 12500, '$25000 - $37499': 25000, '$37500 - $49999': 37500, '$50000 - $62499': 50000, '$62500 - $74999': 62500, '$75000 - $87499': 75000, '$87500 - $99999': 87500, '$100000 or More': 100000}

    # Visit-frequency columns: ordinal-encoded, then median-imputed.
    _VISIT_COLUMNS = ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']
    # Nominal columns expanded into indicator columns.
    _DUMMY_COLUMNS = ['destination', 'passanger', 'weather', 'coupon', 'maritalStatus', 'occupation']
    # Columns removed from the final output.
    _DROPPED_COLUMNS = ['car', 'toCoupon_GEQ5min', 'income']

    def __init__(self): # no *args or **kargs
        print("Transformer initialized")

    def fit(self, X, y=None):
        """Learn the imputation medians and the output column layout from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw survey data containing the columns listed in the class docstring.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        encoded = self._encode(X)
        self.imputer_ = SimpleImputer(strategy="median").fit(encoded[self._VISIT_COLUMNS])
        # Remember the dummy-expanded layout so later batches can be aligned to it.
        self.columns_ = self._finalize(encoded, self.imputer_).columns
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``.

        When the transformer has been fitted, the medians learned in ``fit``
        are used for imputation and the result is aligned to the fitted column
        layout (indicator columns missing from this batch are filled with 0,
        columns for categories unseen during fitting are dropped). When called
        without a prior ``fit``, medians and columns are derived from ``X``
        itself, matching the previous stateless behaviour.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw survey data; it is not modified.

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        KeyError
            If an expected column is missing from ``X``.
        ValueError
            If ``age`` holds a label that is neither mapped nor numeric.
        """
        encoded = self._encode(X)

        imputer = getattr(self, 'imputer_', None)
        if imputer is None:
            # Backward-compatible fallback for callers that never called fit().
            imputer = SimpleImputer(strategy="median").fit(encoded[self._VISIT_COLUMNS])

        result = self._finalize(encoded, imputer)

        columns = getattr(self, 'columns_', None)
        if columns is not None:
            result = result.reindex(columns=columns, fill_value=0)
        return result

    def _encode(self, X):
        """Return a copy of ``X`` with label columns replaced by numeric codes."""
        X = X.copy()  # never write into the caller's frame

        X['gender'] = X['gender'].replace(self._GENDER_MAPPER)
        X['expiration'] = X['expiration'].replace(self._EXPIRY_MAPPER)
        # Only the two open-ended buckets are mapped; the remaining values are
        # presumably numeric strings (TODO confirm against the raw CSV), so the
        # column is converted explicitly to avoid a mixed str/int column.
        X['age'] = pd.to_numeric(X['age'].replace(self._AGE_MAPPER))
        X['time'] = X['time'].replace(self._TIME_MAPPER)
        X['education'] = X['education'].replace(self._EDUCATION_MAPPER)
        for column in self._VISIT_COLUMNS:
            X[column] = X[column].replace(self._VISIT_MAPPER)
        X['income_ub'] = X['income'].replace(self._INCOME_UB_MAPPER)
        X['income_lb'] = X['income'].replace(self._INCOME_LB_MAPPER)
        return X

    def _finalize(self, encoded, imputer):
        """Impute visit columns, one-hot encode nominal columns, drop unused ones.

        ``encoded`` is a private copy produced by ``_encode`` and is modified
        in place by the imputation step.
        """
        #Imputing with median values
        encoded[self._VISIT_COLUMNS] = imputer.transform(encoded[self._VISIT_COLUMNS])
        expanded = pd.get_dummies(encoded, columns=self._DUMMY_COLUMNS)
        return expanded.drop(columns=self._DROPPED_COLUMNS)

In [6]:
# Single-step pipeline wrapping the custom column transformer
prod_pipeline = Pipeline(steps=[('transformation', TransformingColumns())])

Transformer initialized


In [7]:
# Fit the pipeline on the sample rows and transform them
# (the original call was missing its closing parenthesis -> SyntaxError)
output = prod_pipeline.fit_transform(test)

#Transformed data
output

Unnamed: 0,temperature,time,expiration,gender,age,has_children,education,Bar,CoffeeHouse,CarryAway,...,weather_Sunny,coupon_Bar,coupon_Carry out & Take away,coupon_Coffee House,coupon_Restaurant(20-50),coupon_Restaurant(<20),maritalStatus_Single,maritalStatus_Unmarried partner,occupation_Architecture & Engineering,occupation_Unemployed
0,55,14,24,1,21,1,3,0.0,0.0,3.0,...,1,0,0,0,0,1,0,1,0,1
1,80,10,2,1,21,1,3,0.0,0.0,3.0,...,1,0,0,1,0,0,0,1,0,1
2,80,10,2,1,21,1,3,0.0,0.0,3.0,...,1,0,1,0,0,0,0,1,0,1
3,80,14,2,1,21,1,3,0.0,0.0,3.0,...,1,0,0,1,0,0,0,1,0,1
4,80,14,24,1,21,1,3,0.0,0.0,3.0,...,1,0,0,1,0,0,0,1,0,1
5,80,18,2,1,21,1,3,0.0,0.0,3.0,...,1,0,0,0,0,1,0,1,0,1
6,55,14,24,1,21,1,3,0.0,0.0,3.0,...,1,0,1,0,0,0,0,1,0,1
7,80,10,2,1,21,1,3,0.0,0.0,3.0,...,1,0,0,0,0,1,0,1,0,1
8,80,10,2,1,21,1,3,0.0,0.0,3.0,...,1,0,1,0,0,0,0,1,0,1
9,80,10,24,1,21,1,3,0.0,0.0,3.0,...,1,1,0,0,0,0,0,1,0,1
