# <center>Vehicle Coupon Recommendation</center>

## <center>Preprocessing</center>

Data Source: https://archive.ics.uci.edu/ml/datasets/in-vehicle+coupon+recommendation

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
# Read the coupon dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="vehicle-coupon-recommendation.csv")
df.head(n=5)

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [21]:
# List every column with its dtype and non-null count to spot columns that contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12577 non-null

# Data preprocessing

## Variable type conversion

In [22]:
# Summary statistics for temperature, which is loaded as an int64 column.
df["temperature"].describe()

count    12684.000000
mean        63.301798
std         19.154486
min         30.000000
25%         55.000000
50%         80.000000
75%         80.000000
max         80.000000
Name: temperature, dtype: float64

In [23]:
# Count how often each distinct temperature value occurs to see how many levels the column has.
df["temperature"].value_counts()

80    6528
55    3840
30    2316
Name: temperature, dtype: int64

In [24]:
# Count how often each distinct has_children value occurs.
df["has_children"].value_counts()

0    7431
1    5253
Name: has_children, dtype: int64

In [25]:
# Integer-coded columns are treated as categorical rather than numeric
# (e.g. temperature has only three distinct values and has_children two).
categorical_columns = [
    "temperature",
    "has_children",
    "toCoupon_GEQ5min",
    "toCoupon_GEQ15min",
    "toCoupon_GEQ25min",
    "direction_same",
    "Y",
]
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype("category")

# Confirm the new dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   destination           12684 non-null  object  
 1   passanger             12684 non-null  object  
 2   weather               12684 non-null  object  
 3   temperature           12684 non-null  category
 4   time                  12684 non-null  object  
 5   coupon                12684 non-null  object  
 6   expiration            12684 non-null  object  
 7   gender                12684 non-null  object  
 8   age                   12684 non-null  object  
 9   maritalStatus         12684 non-null  object  
 10  has_children          12684 non-null  category
 11  education             12684 non-null  object  
 12  occupation            12684 non-null  object  
 13  income                12684 non-null  object  
 14  car                   108 non-null    object  
 15  Ba

## Handling missing values

In [26]:
# car: almost entirely missing (only 108 non-null values out of 12684 rows).
# direction_opp: appears to be the complement of direction_same (the 0/1 values
# are flipped in the rows shown by head()), so it adds no information.
# `columns=` already selects the column axis, so no `axis` argument is needed;
# rebinding instead of `inplace=True` avoids mutating the frame shown by earlier cells.
df = df.drop(columns=["car", "direction_opp"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   destination           12684 non-null  object  
 1   passanger             12684 non-null  object  
 2   weather               12684 non-null  object  
 3   temperature           12684 non-null  category
 4   time                  12684 non-null  object  
 5   coupon                12684 non-null  object  
 6   expiration            12684 non-null  object  
 7   gender                12684 non-null  object  
 8   age                   12684 non-null  object  
 9   maritalStatus         12684 non-null  object  
 10  has_children          12684 non-null  category
 11  education             12684 non-null  object  
 12  occupation            12684 non-null  object  
 13  income                12684 non-null  object  
 14  Bar                   12577 non-null  object  
 15  Co

In [27]:
# Discard every row that has at least one missing value. The columns kept so
# far are nearly complete, so few rows are lost (12684 -> 12079 in this run).
df = df.dropna(axis="index", how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12079 entries, 22 to 12683
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   destination           12079 non-null  object  
 1   passanger             12079 non-null  object  
 2   weather               12079 non-null  object  
 3   temperature           12079 non-null  category
 4   time                  12079 non-null  object  
 5   coupon                12079 non-null  object  
 6   expiration            12079 non-null  object  
 7   gender                12079 non-null  object  
 8   age                   12079 non-null  object  
 9   maritalStatus         12079 non-null  object  
 10  has_children          12079 non-null  category
 11  education             12079 non-null  object  
 12  occupation            12079 non-null  object  
 13  income                12079 non-null  object  
 14  Bar                   12079 non-null  object  
 15  C

## Feature engineering

In [28]:
# Combine toCoupon_GEQ5min, toCoupon_GEQ15min and toCoupon_GEQ25min into a single column

# create a function to define the new column values based on conditions
def get_toCoupon(row):
    """Collapse the three ``toCoupon_GEQ*min`` flags of one row into a bucket label.

    The flags are read as cumulative thresholds (presumably driving time in
    minutes to the coupon venue -- confirm against the dataset description).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide ``toCoupon_GEQ5min``, ``toCoupon_GEQ15min`` and
        ``toCoupon_GEQ25min``, each holding 0 or 1.

    Returns
    -------
    str
        ``"<5"`` when no threshold is reached, ``"5-15"`` when only the 5-minute
        flag is set, ``"15-25"`` when the 5- and 15-minute flags are set, and
        ``">25"`` for every other combination.
    """
    over_5 = row["toCoupon_GEQ5min"]
    over_15 = row["toCoupon_GEQ15min"]
    over_25 = row["toCoupon_GEQ25min"]

    # Previously this case fell through to the catch-all and was labelled ">25".
    if over_5 == 0 and over_15 == 0 and over_25 == 0:
        return "<5"
    if over_5 == 1 and over_15 == 0 and over_25 == 0:
        return "5-15"
    if over_5 == 1 and over_15 == 1 and over_25 == 0:
        return "15-25"
    # Catch-all kept from the original: the 25-minute flag is set, or the flags
    # are mutually inconsistent.
    return ">25"

# Add the combined to_Coupon label for every row, then discard the three
# threshold flags it was derived from.
threshold_flags = ["toCoupon_GEQ5min", "toCoupon_GEQ15min", "toCoupon_GEQ25min"]
df = (
    df.assign(to_Coupon=df.apply(get_toCoupon, axis=1))
      .drop(columns=threshold_flags)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12079 entries, 22 to 12683
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   destination           12079 non-null  object  
 1   passanger             12079 non-null  object  
 2   weather               12079 non-null  object  
 3   temperature           12079 non-null  category
 4   time                  12079 non-null  object  
 5   coupon                12079 non-null  object  
 6   expiration            12079 non-null  object  
 7   gender                12079 non-null  object  
 8   age                   12079 non-null  object  
 9   maritalStatus         12079 non-null  object  
 10  has_children          12079 non-null  category
 11  education             12079 non-null  object  
 12  occupation            12079 non-null  object  
 13  income                12079 non-null  object  
 14  Bar                   12079 non-null  object  
 15  C

In [29]:
# Make Y the rightmost column: remove it, then re-add it, which appends it at the end.
Y = df.pop("Y")
df["Y"] = Y
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12079 entries, 22 to 12683
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   destination           12079 non-null  object  
 1   passanger             12079 non-null  object  
 2   weather               12079 non-null  object  
 3   temperature           12079 non-null  category
 4   time                  12079 non-null  object  
 5   coupon                12079 non-null  object  
 6   expiration            12079 non-null  object  
 7   gender                12079 non-null  object  
 8   age                   12079 non-null  object  
 9   maritalStatus         12079 non-null  object  
 10  has_children          12079 non-null  category
 11  education             12079 non-null  object  
 12  occupation            12079 non-null  object  
 13  income                12079 non-null  object  
 14  Bar                   12079 non-null  object  
 15  C

# Output

In [30]:
# Write the cleaned frame to disk. index=False leaves out the row labels,
# which are no longer contiguous after dropna() (they run from 22 to 12683).
# NOTE(review): CSV carries no dtype metadata, so the category dtypes set above
# presumably need to be re-applied when this file is loaded -- confirm in the consumer.
df.to_csv("VCR_clean_real.csv", index=False)