<a href="https://colab.research.google.com/github/Maagnitude/coupon_recommend_models/blob/main/coupon_recommend_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **In-vehicle Coupon Recommendation Models**
Importing the necessary libraries.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import linear_model
from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

## **Uploading the dataset as a csv file**

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [3]:
# Mount Google Drive into the Colab filesystem; the dataset is read from
# '/content/drive/MyDrive/...' in the next cell. This only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# Location of the raw CSV on the mounted Drive; change it here if the file moves.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/in-vehicle-coupon-recommendation.csv'
df = pd.read_csv(DATA_PATH)

## **Here we see the shape of the dataframe**. 

We have a **12684 x 26** table. 12684 entries, 26 attributes.

In [5]:
df.shape

(12684, 26)

In [6]:
df.head()


Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


With **df.info()**, we can see that some columns have missing values. The `car` column has only 108 non-null values out of 12684, so we will drop this attribute.

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12577 non-null

data cleansing based on:


*   Missing Values
<!-- *   Outliers
*   Duplicate Values -->



In [8]:
df.isnull().sum()

destination                 0
passanger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

In [None]:
# Remove the 'car' column: 12576 of its 12684 values are missing.
df = df.drop(columns=['car'])

In [None]:
df['Bar'].value_counts()

never    5197
less1    3482
1~3      2473
4~8      1076
gt8       349
Name: Bar, dtype: int64

In [None]:
# NOTE(review): this repeats the cell above. The recorded 'never' count (5304) equals
# the earlier count (5197) plus the 107 missing 'Bar' values, so this output appears to
# come from a run made after the imputation cell further down — confirm execution order.
df['Bar'].value_counts()

never    5304
less1    3482
1~3      2473
4~8      1076
gt8       349
Name: Bar, dtype: int64

# **Impute nan values**

In [None]:
# Replace missing 'Bar' answers with the most frequent category of that column.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
bar_2d = df['Bar'].to_numpy().reshape(-1, 1)  # the imputer works on 2-D input
df['Bar'] = imputer.fit_transform(bar_2d).ravel()

In [None]:
df['CoffeeHouse'].value_counts()

less1    3385
1~3      3225
never    2962
4~8      1784
gt8      1111
Name: CoffeeHouse, dtype: int64

In [None]:
# Replace missing 'CoffeeHouse' answers with the most frequent category of that column.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
coffee_2d = df['CoffeeHouse'].to_numpy().reshape(-1, 1)  # the imputer works on 2-D input
df['CoffeeHouse'] = imputer.fit_transform(coffee_2d).ravel()

In [None]:
df['CarryAway'].value_counts()

1~3      4672
4~8      4258
less1    1856
gt8      1594
never     153
Name: CarryAway, dtype: int64

In [None]:
# Replace missing 'CarryAway' answers with the most frequent category of that column.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
carry_away_2d = df['CarryAway'].to_numpy().reshape(-1, 1)  # the imputer works on 2-D input
df['CarryAway'] = imputer.fit_transform(carry_away_2d).ravel()

In [None]:
df['RestaurantLessThan20'].value_counts()

1~3      5506
4~8      3580
less1    2093
gt8      1285
never     220
Name: RestaurantLessThan20, dtype: int64

In [None]:
# Replace missing 'RestaurantLessThan20' answers with that column's most frequent category.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
cheap_restaurant_2d = df['RestaurantLessThan20'].to_numpy().reshape(-1, 1)  # 2-D for the imputer
df['RestaurantLessThan20'] = imputer.fit_transform(cheap_restaurant_2d).ravel()

In [None]:
df['Restaurant20To50'].value_counts()

less1    6077
1~3      3290
never    2136
4~8       728
gt8       264
Name: Restaurant20To50, dtype: int64

In [None]:
# Replace missing 'Restaurant20To50' answers with that column's most frequent category.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
mid_restaurant_2d = df['Restaurant20To50'].to_numpy().reshape(-1, 1)  # 2-D for the imputer
df['Restaurant20To50'] = imputer.fit_transform(mid_restaurant_2d).ravel()

# **ONE-HOT ENCODING**

In [None]:
df['passanger'].value_counts()

Alone        7305
Friend(s)    3298
Partner      1075
Kid(s)       1006
Name: passanger, dtype: int64

In [None]:
# df= pd.get_dummies(df,columns=['passanger'])
# df

In [None]:
# Keep only the non-numeric columns; their names drive the one-hot encoding below.
df_categorical = df.select_dtypes(exclude=['number'])

In [None]:
# One-hot encode every non-numeric column in a single call. Each source column is
# replaced by one indicator column per category, appended after the untouched columns
# in the same order the column-by-column approach would produce.
df = pd.get_dummies(df, columns=list(df_categorical.columns))

In [None]:
df

Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_Home,destination_No Urgent Place,...,RestaurantLessThan20_1~3,RestaurantLessThan20_4~8,RestaurantLessThan20_gt8,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_1~3,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never
0,55,1,1,0,0,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,0
1,80,1,1,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
2,80,1,1,1,0,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,0
3,80,1,1,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
4,80,1,1,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,55,0,1,0,0,1,0,1,1,0,...,0,1,0,0,0,1,0,0,0,0
12680,55,0,1,0,0,0,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
12681,30,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
12682,30,0,1,1,1,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [None]:
df.dtypes

temperature               int64
has_children              int64
toCoupon_GEQ5min          int64
toCoupon_GEQ15min         int64
toCoupon_GEQ25min         int64
                          ...  
Restaurant20To50_1~3      uint8
Restaurant20To50_4~8      uint8
Restaurant20To50_gt8      uint8
Restaurant20To50_less1    uint8
Restaurant20To50_never    uint8
Length: 110, dtype: object

In [None]:
df.describe()

Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_Home,destination_No Urgent Place,...,RestaurantLessThan20_1~3,RestaurantLessThan20_4~8,RestaurantLessThan20_gt8,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_1~3,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never
count,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,...,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0
mean,63.301798,0.414144,1.0,0.561495,0.119126,0.214759,0.785241,0.568433,0.255203,0.495348,...,0.43409,0.282245,0.101309,0.165011,0.017345,0.259382,0.057395,0.020814,0.494008,0.168401
std,19.154486,0.492593,0.0,0.496224,0.32395,0.410671,0.410671,0.495314,0.435993,0.499998,...,0.495656,0.45011,0.301749,0.371205,0.130557,0.438313,0.232605,0.142766,0.499984,0.374237
min,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,55.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# count=1
# for i in df.isnull().sum():
#     if (i == 0):
#       print(count)
#       count+=1


In [None]:
# df.columns.is_floating

In [None]:
# df.iloc[:, 5:10].values

array([[0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1],
       [0, 1, 1, 0, 1],
       ...,
       [1, 0, 0, 1, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 0, 1, 0]])

Duplicate Values

In [None]:
df['temperature']

0        55
1        80
2        80
3        80
4        80
         ..
12679    55
12680    55
12681    30
12682    30
12683    80
Name: temperature, Length: 12684, dtype: int64

In [None]:
# df['temperature'].value_counts()

In [None]:
# NOTE(review): later recorded outputs list temperature_30 / temperature_55 /
# temperature_80 columns, so this encoding was evidently run before being commented
# out. On a fresh Restart & Run All, 'temperature' stays numeric — decide which
# variant is intended and make the notebook reproduce it.
# df= pd.get_dummies(df,columns=['temperature'])

In [None]:
df

Unnamed: 0,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_Home,destination_No Urgent Place,destination_Work,...,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_1~3,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never,temperature_30,temperature_55,temperature_80
0,1,1,0,0,0,1,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
1,1,1,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,1,1,1,0,0,1,1,0,1,0,...,0,0,1,0,0,0,0,0,0,1
3,1,1,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
4,1,1,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,0,1,0,0,1,0,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
12680,0,1,0,0,0,1,1,0,0,1,...,0,0,1,0,0,0,0,0,1,0
12681,0,1,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
12682,0,1,1,1,0,1,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0


In [None]:
# #find duplicated values 
# df1 = df.append(df.iloc[20:30,:])

# #the number of duplicated values 
# df1.duplicated().sum()



In [None]:
# #remove duplicated values 
# df = df.drop_duplicates()

In [None]:
# #show the results
# df.duplicated().sum()

In [None]:
df.dtypes

has_children              int64
toCoupon_GEQ5min          int64
toCoupon_GEQ15min         int64
toCoupon_GEQ25min         int64
direction_same            int64
                          ...  
Restaurant20To50_less1    uint8
Restaurant20To50_never    uint8
temperature_30            uint8
temperature_55            uint8
temperature_80            uint8
Length: 112, dtype: object

In [None]:
# plt.figure(figsize=(25, 15))

# for i, column in enumerate(df.columns):
#   plt.subplot(4, 6, i + 1)
#   sns.boxplot(data=df[column])
#   plt.title(column)

# plt.tight_layout()
# plt.show()

# **Classification (Logistic Regression)**

In [None]:
# Feature matrix: every column except the target 'Y'.
X = df.drop(columns=['Y'])

In [None]:
# Target vector: the 'Y' column that the classifiers below are trained to predict.
y = df['Y']

In [None]:
# 70% of the rows go to training and the rest to evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings.
lr = linear_model.LogisticRegression()
lr.fit(X_train, y_train)
lr_mod = lr  # fit() returns the estimator itself, so both names refer to the fitted model

In [None]:
# Mean accuracy of the baseline on the held-out rows, rounded to three decimals.
baseline_accuracy = round(lr_mod.score(X_test, y_test), 3)
print(f"Baseline Logistic Regression: {baseline_accuracy}")

Baseline Logistic Regression: 0.688


# **Classification (KNN Algorithm)**

In [None]:
# K-NN with Euclidean distance (Minkowski metric with p=2) over the two nearest neighbours.
# NOTE(review): with a binary target an even k can yield tied votes — consider an odd k.
classifier = KNeighborsClassifier(
    n_neighbors=2,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=2)

In [None]:
# Predicted class for every held-out row, using the K-NN model fitted above.
y_pred= classifier.predict(X_test) 

In [None]:
# Confusion matrix for the K-NN predictions: rows are the true classes, columns the
# predicted classes. `confusion_matrix` is imported with the other libraries at the
# top of the notebook rather than mid-way through it.
cm = confusion_matrix(y_test, y_pred)
cm  # show the matrix; previously it was computed but never displayed

In [None]:
# Disabled cell: 2-D decision-region plot for the K-NN classifier.
# NOTE(review): the code indexes x_set NumPy-style (x_set[:, 0]) while X_train comes
# from splitting a DataFrame — presumably the cause of the recorded TypeError. It also
# passes a two-column grid to a classifier that was fitted on every feature column,
# and the axis labels ('Age', 'Estimated Salary') look carried over from another
# example — confirm. Fix or delete before re-enabling.
# from matplotlib.colors import ListedColormap  
# x_set, y_set = X_train, y_train  
# x1, x2 = np.meshgrid(np.arange(start = x_set[:, 0].min() - 1, stop = x_set[:, 0].max() + 1, step  =0.01),  
# np.arange(start = x_set[:, 1].min() - 1, stop = x_set[:, 1].max() + 1, step = 0.01))  
# plt.contourf(x1, x2, classifier.predict(np.array([x1.ravel(), x2.ravel()]).T).reshape(x1.shape),  
# alpha = 0.75, cmap = ListedColormap(('red','green' )))  
# plt.xlim(x1.min(), x1.max())  
# plt.ylim(x2.min(), x2.max())  
# for i, j in enumerate(np.unique(y_set)):  
#     plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],  
#         c = ListedColormap(('red', 'green'))(i), label = j)  
# plt.title('K-NN Algorithm (Training set)')  
# plt.xlabel('Age')  
# plt.ylabel('Estimated Salary')  
# plt.legend()  
# plt.show()  

TypeError: ignored