In [1]:
#importing libraries
import pandas as pd
import numpy as np
from scipy import stats as st
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import warnings
from sklearn.preprocessing import OneHotEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
warnings.filterwarnings('ignore')

In [2]:
#Read the raw coupon survey data from the CSV file next to the notebook
DATA_PATH = 'in-vehicle-coupon-recommendation.csv'
coupon = pd.read_csv(DATA_PATH)

In [3]:
#Preview the first rows of the raw data to sanity-check the load
coupon.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [4]:
#Inspect column dtypes to see which features are still stored as strings (object)
coupon.dtypes

destination             object
passanger               object
weather                 object
temperature              int64
time                    object
coupon                  object
expiration              object
gender                  object
age                     object
maritalStatus           object
has_children             int64
education               object
occupation              object
income                  object
car                     object
Bar                     object
CoffeeHouse             object
CarryAway               object
RestaurantLessThan20    object
Restaurant20To50        object
toCoupon_GEQ5min         int64
toCoupon_GEQ15min        int64
toCoupon_GEQ25min        int64
direction_same           int64
direction_opp            int64
Y                        int64
dtype: object

In [5]:
#Split the raw frame into the feature matrix and the target column "Y"
df_features = coupon.drop(columns=["Y"])
response = coupon.loc[:, "Y"]

In [6]:
#Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    response,
    test_size=0.3,
    random_state=695,
)

In [7]:
class TransformingColumns(BaseEstimator, TransformerMixin):
    """Turn the string-coded survey columns into numbers and impute visit counts.

    The transformer expects a DataFrame containing the columns ``gender``,
    ``expiration``, ``age``, ``time``, ``education``, ``income``, ``car``,
    ``toCoupon_GEQ5min`` and the five visit-frequency columns listed in
    ``_VISIT_COLUMNS``.

    * ``fit`` learns the per-column medians of the visit-frequency columns.
    * ``transform`` returns a new DataFrame (the input is left untouched) in
      which the string columns are replaced by numeric codes, ``income`` is
      split into ``income_ub`` / ``income_lb``, missing visit frequencies are
      filled with the medians learned in ``fit``, and ``car``,
      ``toCoupon_GEQ5min`` and ``income`` are dropped.
    """

    # Visit-frequency columns; these are the only columns that get imputed.
    _VISIT_COLUMNS = ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']
    # Columns removed from the output once the derived features exist.
    _DROPPED_COLUMNS = ['car', 'toCoupon_GEQ5min', 'income']

    # Ordinal code for each visit-frequency bucket.
    _VISIT_MAPPER = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}

    # Column -> replacement table for the columns that are recoded in place.
    # Values not listed in a table are left as they are and then parsed by
    # pd.to_numeric (e.g. an age stored as the string '21').
    _NUMERIC_MAPPERS = {
        'gender': {'Female': 1, 'Male': 0},
        'expiration': {'1d': 24, '2h': 2},
        'age': {'50plus': 50, 'below21': 18},
        'time': {'6PM': 18, '7AM': 7, '10AM': 10, '2PM': 14, '10PM': 22},
        'education': {
            'Some High School': 1,
            'High School Graduate': 2,
            'Some college - no degree': 3,
            'Associates degree': 4,
            'Bachelors degree': 5,
            'Graduate degree (Masters or Doctorate)': 6,
        },
    }

    # Upper / lower bound of each income bracket.
    _INCOME_UB_MAPPER = {
        'Less than $12500': 12499,
        '$12500 - $24999': 24999,
        '$25000 - $37499': 37499,
        '$37500 - $49999': 49999,
        '$50000 - $62499': 62499,
        '$62500 - $74999': 74999,
        '$75000 - $87499': 87499,
        '$87500 - $99999': 99999,
        '$100000 or More': 200000,
    }
    _INCOME_LB_MAPPER = {
        'Less than $12500': 0,
        '$12500 - $24999': 12500,
        '$25000 - $37499': 25000,
        '$37500 - $49999': 37500,
        '$50000 - $62499': 50000,
        '$62500 - $74999': 62500,
        '$75000 - $87499': 75000,
        '$87500 - $99999': 87500,
        '$100000 or More': 100000,
    }

    def __init__(self): # no *args or **kargs
        print("Transformer initialized")

    def _encode(self, X):
        """Return a copy of ``X`` with the string columns replaced by numbers.

        No imputation and no column dropping happens here, so the result can
        be used both to learn the medians (``fit``) and to apply them
        (``transform``).
        """
        # Work on a copy so the caller's DataFrame is never modified.
        encoded = X.copy()
        for column, mapper in self._NUMERIC_MAPPERS.items():
            encoded[column] = pd.to_numeric(encoded[column].replace(mapper))
        for column in self._VISIT_COLUMNS:
            # to_numeric guarantees a numeric dtype for the median imputer.
            encoded[column] = pd.to_numeric(encoded[column].replace(self._VISIT_MAPPER))
        # Order matters: income_ub is appended before income_lb.
        encoded['income_ub'] = pd.to_numeric(encoded['income'].replace(self._INCOME_UB_MAPPER))
        encoded['income_lb'] = pd.to_numeric(encoded['income'].replace(self._INCOME_LB_MAPPER))
        return encoded

    def fit(self, X, y=None):
        """Learn the median of every visit-frequency column from ``X``.

        Parameters
        ----------
        X : DataFrame with the columns described in the class docstring.
        y : ignored; accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        encoded = self._encode(X)
        self.imputer_ = SimpleImputer(strategy="median")
        self.imputer_.fit(encoded[self._VISIT_COLUMNS])
        return self

    def transform(self, X):
        """Encode ``X`` and fill missing visit frequencies with the fitted medians.

        Parameters
        ----------
        X : DataFrame with the columns described in the class docstring.

        Returns
        -------
        DataFrame
            A new frame with the recoded columns, ``income_ub`` and
            ``income_lb`` appended, and ``car``, ``toCoupon_GEQ5min`` and
            ``income`` removed.
        """
        encoded = self._encode(X)
        imputer = getattr(self, "imputer_", None)
        if imputer is None:
            # Backward compatibility: earlier versions allowed transform()
            # without a prior fit() and used the medians of the data at hand.
            imputer = SimpleImputer(strategy="median").fit(encoded[self._VISIT_COLUMNS])
        encoded[self._VISIT_COLUMNS] = imputer.transform(encoded[self._VISIT_COLUMNS])
        return encoded.drop(columns=self._DROPPED_COLUMNS)

In [8]:
#Columns handed to the custom TransformingColumns step (recoded, imputed or dropped there)
trans_features = ["gender","expiration","age","time","education","Bar",
                  "CoffeeHouse","CarryAway","RestaurantLessThan20","Restaurant20To50","income",
                  'car', 'toCoupon_GEQ5min']
#Nominal categorical columns that are target-encoded with leave-one-out encoding
cat_features = ['destination', 'passanger', 'weather', 'coupon', 'maritalStatus', 'occupation']
#Imputation of the visit-frequency columns is done inside TransformingColumns,
#so no separate list of columns to impute is needed here.

#Initialize LeaveOneOutEncoder
#sigma adds random noise to the encoded training values, so pin random_state
#(same seed as the train/test split) to make fit_transform results repeatable.
loo_transformer = LeaveOneOutEncoder(handle_unknown="value", sigma=0.1, random_state=695)



In [9]:
#Assemble the preprocessing steps: the custom recoding transformer for
#trans_features and the leave-one-out encoder for cat_features; every other
#column is passed through unchanged.
preprocessing_steps = [
    ('transformation', TransformingColumns(), trans_features),
    ("cat", loo_transformer, cat_features),
]
prod_pipeline = ColumnTransformer(
    transformers=preprocessing_steps,
    remainder="passthrough",
)

Transformer initialized


In [10]:
#Fit the preprocessing pipeline on the training split only; y_train is passed
#because the leave-one-out encoder is a target encoder.
prod_pipeline.fit(X_train, y_train)

Transformer initialized


ColumnTransformer(remainder='passthrough',
                  transformers=[('transformation', TransformingColumns(),
                                 ['gender', 'expiration', 'age', 'time',
                                  'education', 'Bar', 'CoffeeHouse',
                                  'CarryAway', 'RestaurantLessThan20',
                                  'Restaurant20To50', 'income', 'car',
                                  'toCoupon_GEQ5min']),
                                ('cat', LeaveOneOutEncoder(sigma=0.1),
                                 ['destination', 'passanger', 'weather',
                                  'coupon', 'maritalStatus', 'occupation'])])

In [11]:
#Column labels for the transformed array, listed group by group:
#custom-transformer output, then encoded categoricals, then passthrough columns.
custom_step_columns = [
    "gender", "expiration", "age", "time", "education",
    "Bar", "CoffeeHouse", "CarryAway", "RestaurantLessThan20", "Restaurant20To50",
    "income_ub", "income_lb",
]
encoded_cat_columns = [
    'destination', 'passanger', 'weather', 'coupon', 'maritalStatus', 'occupation',
]
passthrough_columns = [
    'temperature', 'has_children', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min',
    'direction_same', 'direction_opp',
]
data_columns = custom_step_columns + encoded_cat_columns + passthrough_columns
len(data_columns)

24

In [12]:
#Apply the fitted pipeline to the training features and label the columns
# NOTE(review): this calls transform() without y, whereas the full-dataset cell
# further down uses fit_transform(X, y) - confirm the leave-one-out encoder is
# meant to treat the training rows the same way in both places.
train_array = prod_pipeline.transform(X_train)
train_transformed = pd.DataFrame(data=train_array, columns=data_columns)

In [13]:
#Apply the pipeline fitted on the training split to the held-out features
test_array = prod_pipeline.transform(X_test)
test_transformed = pd.DataFrame(data=test_array, columns=data_columns)

In [14]:
#Attach response variable to both train and test sets
# The transformed frames were built from plain arrays and therefore carry a
# fresh 0..n-1 index, while y_train / y_test keep the shuffled row labels from
# train_test_split. Assigning the Series directly would align on those labels
# (wrong targets and NaNs), so the values are attached by position instead.
train_transformed["Y"] = y_train.to_numpy()
test_transformed["Y"] = y_test.to_numpy()

In [15]:
#Export to .csv
# Each split goes to its own file; the test file must be written from
# test_transformed (it previously received a second copy of the training data).
train_transformed.to_csv("transformed_train_data.csv")
test_transformed.to_csv("transformed_test_data.csv")

In [16]:
#Refit the pipeline on the complete dataset (all rows, no train/test split) and
#transform it in the same call; this overwrites the fit learned from X_train.
coupon_transformed = prod_pipeline.fit_transform(df_features, response)

Transformer initialized


In [17]:
#Wrap the transformed array in a DataFrame labelled with data_columns
coupon_transformed = pd.DataFrame(coupon_transformed, columns = data_columns)

In [18]:
#Attach the target column; the assignment aligns on index.
# NOTE(review): assumes `coupon` still has the default 0..n-1 index from read_csv,
# so that it matches the fresh index of coupon_transformed - confirm.
coupon_transformed["Y"] = response

In [19]:
#Write the fully transformed dataset (features plus Y) to disk; the index is written as the first column
coupon_transformed.to_csv("coupon_transformed.csv")