## Mate Labs Assingnment

In [1]:
import numpy as np
import pandas as pd


***Import the datasets***

In [2]:
train = pd.read_csv('data/train.csv')
print(train.head())
campaign_data = pd.read_csv('data/campaign_data.csv')
print(campaign_data.head())
coupon_item_mapping = pd.read_csv('data/coupon_item_mapping.csv')
print(coupon_item_mapping.head())
customer_demographics = pd.read_csv('data/customer_demographics.csv')
print(customer_demographics.head())
customer_transaction_data = pd.read_csv('data/customer_transaction_data.csv')
print(customer_transaction_data.head())
item_data = pd.read_csv('data/item_data.csv')
print(item_data.head())

   id  campaign_id  coupon_id  customer_id  redemption_status
0   1           13         27         1053                  0
1   2           13        116           48                  0
2   6            9        635          205                  0
3   7           13        644         1050                  0
4   9            8       1017         1489                  0
   campaign_id campaign_type start_date  end_date
0           24             Y   21/10/13  20/12/13
1           25             Y   21/10/13  22/11/13
2           20             Y   07/09/13  16/11/13
3           23             Y   08/10/13  15/11/13
4           21             Y   16/09/13  18/10/13
   coupon_id  item_id
0        105       37
1        107       75
2        494       76
3        522       77
4        518       77
   customer_id age_range marital_status  rented family_size no_of_children  \
0            1       70+        Married       0           2            NaN   
1            6     46-55        Married 

### Data Cleaning and formatting
1. Drop the duplicates
2. Label Encoding
3. Feature formatting

In [3]:
print(train.shape)
dups = train.duplicated()
print(dups.any())

(78369, 5)
False


Drop the 'id' column in train

In [4]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


Check for all datasets, whether there is duplicate data

In [5]:
dups_campaign = campaign_data.duplicated()
print("campaign-data:",dups_campaign.any())
dups_coupon_item_mapping = coupon_item_mapping.duplicated()
print("coupon_item_mapping:", dups_coupon_item_mapping.any())
dups_customer_demographics = customer_demographics.duplicated()
print("customer_demographics:",dups_customer_demographics.any())
dups_customer_transaction_data=customer_transaction_data.duplicated()
print("customer_transaction_data:", dups_customer_transaction_data.any())
dups_item_data = item_data.duplicated()
print('item_data:', dups_item_data.any())

campaign-data: False
coupon_item_mapping: False
customer_demographics: False
customer_transaction_data: True
item_data: False


Now, remove the duplicates from Customer_transaction_data

In [6]:
print(customer_transaction_data.shape)
customer_transaction_data = customer_transaction_data.drop_duplicates()
print(customer_transaction_data.shape)
customer_transaction_data.head()

(1324566, 7)
(1321650, 7)


Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


***Label Encoding***

Converting the text into numerical values such that they become readable by machine.

Label encoding will happen on below columns-
- Item_data => brand_type and category
- Customer_demographics => mearital_status, age_range, family_size & #children
- Campaign_data => camapign_type

In [7]:
from sklearn import preprocessing

#Item Data
label_encoder = preprocessing.LabelEncoder() 
item_data['brand_type']= label_encoder.fit_transform(item_data['brand_type']) 
item_data['category'] = label_encoder.fit_transform(item_data['category'])
item_data.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,0,6
1,2,1,0,8
2,3,56,1,1
3,4,56,1,6
4,5,56,1,6


In [8]:
#Customer_Demographics
label_encoder = preprocessing.LabelEncoder() 
#customer_demographics['marital_status']= label_encoder.fit_transform(customer_demographics['marital_status']) 
customer_demographics['age_range'] = label_encoder.fit_transform(customer_demographics['age_range'])
customer_demographics['family_size']= label_encoder.fit_transform(customer_demographics['family_size']) 
#customer_demographics['no_of_children'] = label_encoder.fit_transform(customer_demographics['no_of_children'])

#As marital status and #children contains float and str, sklearn label encoder does not work. 
# Workaround- convert these into category types and use cat.codes accessor by pandas
customer_demographics['marital_status'] = customer_demographics['marital_status'].astype('category')
customer_demographics['marital_status'] = customer_demographics['marital_status'].cat.codes
customer_demographics['no_of_children'] = customer_demographics['no_of_children'].astype('category')
customer_demographics['no_of_children'] = customer_demographics['no_of_children'].cat.codes
customer_demographics.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,5,0,0,1,-1,4
1,6,3,0,0,1,-1,5
2,7,1,-1,0,2,0,3
3,8,1,-1,0,3,1,6
4,10,3,1,0,0,-1,5


In [9]:
#Campaign_data
campaign_data['campaign_type'] = label_encoder.fit_transform(campaign_data['campaign_type'])
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,1,21/10/13,20/12/13
1,25,1,21/10/13,22/11/13
2,20,1,07/09/13,16/11/13
3,23,1,08/10/13,15/11/13
4,21,1,16/09/13,18/10/13


***Data Feature Formatting***

Convert the time-series into Pandas date-time format

Data feature formatting will occur on these columns-

Customer_transaction_data => date_column
campaign_data => start_date and end_date

In [10]:
#Customer Transaction
customer_transaction_data['date']=pd.to_datetime(customer_transaction_data['date'],format='%Y-%m-%d')
customer_transaction_data['date_d']=customer_transaction_data['date'].dt.day.astype('category')
customer_transaction_data['date_m']=customer_transaction_data['date'].dt.month.astype('category')
customer_transaction_data['date_y']=customer_transaction_data['date'].dt.year.astype('category')
customer_transaction_data['date_w']=customer_transaction_data['date'].dt.week.astype('category')
customer_transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,date_d,date_m,date_y,date_w
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,2,1,2012,1
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,2,1,2012,1
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0,2,1,2012,1
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,2,1,2012,1
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,2,1,2012,1


In [11]:
#Campaign Data
campaign_data['start_date']=pd.to_datetime(campaign_data['start_date'],format='%d/%m/%y')
campaign_data['sdate_d']=campaign_data['start_date'].dt.day.astype('category')
campaign_data['sdate_m']=campaign_data['start_date'].dt.month.astype('category')
campaign_data['sdate_y']=campaign_data['start_date'].dt.year.astype('category')
campaign_data['sdate_w']=campaign_data['start_date'].dt.week.astype('category')

campaign_data['end_date']=pd.to_datetime(campaign_data['end_date'],format='%d/%m/%y')
campaign_data['edate_d']=campaign_data['end_date'].dt.day.astype('category')
campaign_data['edate_m']=campaign_data['end_date'].dt.month.astype('category')
campaign_data['edate_y']=campaign_data['end_date'].dt.year.astype('category')
campaign_data['edate_w']=campaign_data['end_date'].dt.week.astype('category')

campaign_data.head()


Unnamed: 0,campaign_id,campaign_type,start_date,end_date,sdate_d,sdate_m,sdate_y,sdate_w,edate_d,edate_m,edate_y,edate_w
0,24,1,2013-10-21,2013-12-20,21,10,2013,43,20,12,2013,51
1,25,1,2013-10-21,2013-11-22,21,10,2013,43,22,11,2013,47
2,20,1,2013-09-07,2013-11-16,7,9,2013,36,16,11,2013,46
3,23,1,2013-10-08,2013-11-15,8,10,2013,41,15,11,2013,46
4,21,1,2013-09-16,2013-10-18,16,9,2013,38,18,10,2013,42


**Data Cleaning and formatting complete**


### Data Merging and wrangling

describe()- it computes several common aggregates for each column and returns the result

In [12]:
train.describe()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
count,78369.0,78369.0,78369.0,78369.0,78369.0
mean,64347.975449,13.974441,566.363243,787.451888,0.009302
std,37126.440855,8.019215,329.966054,456.811339,0.095999
min,1.0,1.0,1.0,1.0,0.0
25%,32260.0,8.0,280.0,399.0,0.0
50%,64318.0,13.0,597.0,781.0,0.0
75%,96577.0,13.0,857.0,1190.0,0.0
max,128595.0,30.0,1115.0,1582.0,1.0


In [13]:
campaign_data.describe()

Unnamed: 0,campaign_id,campaign_type
count,28.0,28.0
mean,15.571429,0.785714
std,9.118271,0.417855
min,1.0,0.0
25%,7.75,1.0
50%,16.5,1.0
75%,23.25,1.0
max,30.0,1.0


In [14]:
item_data.describe()

Unnamed: 0,item_id,brand,brand_type,category
count,74066.0,74066.0,74066.0,74066.0
mean,37033.5,1485.560055,0.151541,8.1611
std,21381.156856,1537.385673,0.358577,3.249951
min,1.0,1.0,0.0,0.0
25%,18517.25,278.0,0.0,6.0
50%,37033.5,978.0,0.0,6.0
75%,55549.75,2013.0,0.0,11.0
max,74066.0,5528.0,1.0,18.0


In [15]:
customer_transaction_data.describe()

Unnamed: 0,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
count,1321650.0,1321650.0,1321650.0,1321650.0,1321650.0,1321650.0
mean,803.9203,29517.25,130.8889,114.5666,-17.74348,-0.5748105
std,457.273,17907.28,1312.459,152.7346,37.82111,7.007641
min,1.0,1.0,1.0,0.36,-3120.31,-1992.23
25%,418.0,14683.0,1.0,49.51,-23.15,0.0
50%,801.0,26594.0,1.0,78.01,-1.78,0.0
75%,1197.0,42407.0,1.0,124.31,0.0,0.0
max,1582.0,74066.0,89638.0,17809.64,0.0,0.0


In [16]:
customer_demographics.describe()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
count,760.0,760.0,760.0,760.0,760.0,760.0,760.0
mean,779.201316,2.490789,-0.282895,0.053947,1.161842,-0.477632,4.715789
std,459.754429,1.281229,0.709597,0.226063,1.168929,0.932236,2.258817
min,1.0,0.0,-1.0,0.0,0.0,-1.0,1.0
25%,382.75,2.0,-1.0,0.0,0.0,-1.0,3.0
50%,774.5,3.0,0.0,0.0,1.0,-1.0,5.0
75%,1187.25,3.0,0.0,0.0,2.0,0.0,6.0
max,1581.0,5.0,1.0,1.0,4.0,2.0,12.0


Simple Merge - 
Train table with campaign_data 

Aggregate Merge - 
Item_data is generated from coupon_item_mapping parent table using coupon_id key.

In [17]:
#Simple Merge-

data_unmerged = train.copy()
    
#merge data to campaign Data many to 1 on campaign_id key (left join)
campaign_data_merge = pd.merge(train,campaign_data,on='campaign_id',how='left')
#coupon to item_data (many to 1) on item_id key (left join) - call coupon item 
coupon_to_item = pd.merge(coupon_item_mapping,item_data,on='item_id',how='left')

mode_fn = lambda x: pd.Series.mode(x)[0]

aggs= ['nunique',mode_fn]

coupon_to_item_agg = coupon_to_item.groupby(['coupon_id']).agg({'item_id':'count',
                                                           'brand':aggs,
                                                           'brand_type':aggs,
                                                           'category':aggs}).reset_index()

coupon_to_item_agg.columns = ['coupon_id','coupon_size','brand_nunique','brand_mode',
                             'brand_type_nunique','brand_type_mode',
                             'category_nunique','category_mode']

#Train to coupon item on coupon_id key (left join)
train = pd.merge(campaign_data_merge,coupon_to_item_agg,on='coupon_id',how='left')
#Train to customer demographics on customer_id key (left join)
train = pd.merge(train,customer_demographics,on='customer_id',how='left')

In [18]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_date,end_date,sdate_d,sdate_m,...,brand_type_nunique,brand_type_mode,category_nunique,category_mode,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,13,27,1053,0,0,2013-05-19,2013-07-05,19,5,...,1,0,1,6,3.0,-1.0,0.0,0.0,-1.0,5.0
1,2,13,116,48,0,0,2013-05-19,2013-07-05,19,5,...,1,1,1,6,2.0,0.0,0.0,1.0,-1.0,3.0
2,6,9,635,205,0,1,2013-03-11,2013-04-12,11,3,...,1,0,1,11,3.0,0.0,0.0,1.0,-1.0,7.0
3,7,13,644,1050,0,0,2013-05-19,2013-07-05,19,5,...,1,0,1,6,,,,,,
4,9,8,1017,1489,0,0,2013-02-16,2013-04-05,16,2,...,1,0,1,6,3.0,0.0,0.0,1.0,-1.0,3.0


In [19]:
customer_transaction_data.selling_price = customer_transaction_data.selling_price/customer_transaction_data.quantity
customer_transaction_data.other_discount = customer_transaction_data.other_discount/customer_transaction_data.quantity
customer_transaction_data['coupon_used'] = customer_transaction_data.coupon_discount.apply(lambda x: 1 if x !=0 else 0)

customer_transaction_data.selling_price = customer_transaction_data.selling_price - customer_transaction_data.other_discount
customer_transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,date_d,date_m,date_y,date_w,coupon_used
0,2012-01-02,1501,26830,1,45.95,-10.69,0.0,2,1,2012,1,0
1,2012-01-02,1501,54253,1,67.32,-13.89,0.0,2,1,2012,1,0
2,2012-01-02,1501,31962,1,120.75,-14.25,0.0,2,1,2012,1,0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,2,1,2012,1,0
4,2012-01-02,1501,48199,1,99.38,-28.14,0.0,2,1,2012,1,0


In [20]:
# Aggregate transactions by item_id
transactions1 = pd.pivot_table(customer_transaction_data, index = "item_id", 
               values=['customer_id','quantity','selling_price', 'other_discount','coupon_discount','coupon_used'],
               aggfunc={'customer_id':lambda x: len(set(x)),
                        'quantity':np.mean,
                        'selling_price':np.mean,
                        'other_discount':np.mean,
                        'coupon_discount':np.mean,
                        'coupon_used': np.sum
                        } )
transactions1.reset_index(inplace=True)
transactions1.rename(columns={'customer_id': 'no_of_customers'}, inplace=True)
transactions2 = pd.pivot_table(customer_transaction_data, index = "item_id", 
               values=['customer_id','quantity','selling_price', 'other_discount','coupon_discount'],
               aggfunc={'customer_id':len,
                        'quantity':np.sum,
                        'selling_price':np.sum,
                        'other_discount':np.sum,
                        'coupon_discount':np.sum,
                        } )
transactions2.reset_index(inplace=True)
transactions2.rename(columns={'customer_id': 't_counts', 'quantity':'qu_sum',
                             'selling_price':'price_sum', 'other_discount':'od_sum',
                             'coupon_discount':'cd_sum'}, inplace=True)

In [21]:
transactions1 = pd.merge(transactions1, transactions2, on='item_id',how='left' )

In [22]:
transactions1['total_discount_mean'] = transactions1['coupon_discount'] + transactions1['other_discount']
transactions1['total_discount_sum'] = transactions1['od_sum'] + transactions1['cd_sum']
transactions1.head()

Unnamed: 0,item_id,coupon_discount,coupon_used,no_of_customers,other_discount,quantity,selling_price,cd_sum,t_counts,od_sum,qu_sum,price_sum,total_discount_mean,total_discount_sum
0,1,0.0,0,2,0.0,1.0,124.31,0.0,2,0.0,2,248.62,0.0,0.0
1,2,0.0,0,1,0.0,1.0,35.26,0.0,1,0.0,1,35.26,0.0,0.0
2,3,0.0,0,1,0.0,1.0,56.64,0.0,1,0.0,1,56.64,0.0,0.0
3,4,0.0,0,1,0.0,1.0,54.85,0.0,1,0.0,1,54.85,0.0,0.0
4,5,0.0,0,1,0.0,1.0,81.57,0.0,1,0.0,1,81.57,0.0,0.0


In [23]:
coupons_items = pd.merge(coupon_item_mapping, item_data, on="item_id", how="left")

In [24]:
item_coupon_trans = pd.merge(coupons_items, transactions1, on='item_id', how='left')

In [25]:
item_coupon_trans.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category,coupon_discount,coupon_used,no_of_customers,other_discount,quantity,selling_price,cd_sum,t_counts,od_sum,qu_sum,price_sum,total_discount_mean,total_discount_sum
0,105,37,56,1,6,0.0,0.0,2.0,-8.108333,2.285714,57.247381,0.0,7.0,-56.758333,16.0,400.731667,-8.108333,-56.758333
1,107,75,56,1,6,0.0,0.0,4.0,-2.633712,1.363636,38.828485,0.0,33.0,-86.9125,45.0,1281.34,-2.633712,-86.9125
2,494,76,209,0,6,0.0,0.0,1.0,-17.45,1.0,106.5,0.0,1.0,-17.45,1.0,106.5,-17.45,-17.45
3,522,77,278,0,6,0.0,0.0,2.0,-16.92,1.0,87.27,0.0,2.0,-33.84,2.0,174.54,-16.92,-33.84
4,518,77,278,0,6,0.0,0.0,2.0,-16.92,1.0,87.27,0.0,2.0,-33.84,2.0,174.54,-16.92,-33.84


In [26]:
item_coupon_trans.columns

Index(['coupon_id', 'item_id', 'brand', 'brand_type', 'category',
       'coupon_discount', 'coupon_used', 'no_of_customers', 'other_discount',
       'quantity', 'selling_price', 'cd_sum', 't_counts', 'od_sum', 'qu_sum',
       'price_sum', 'total_discount_mean', 'total_discount_sum'],
      dtype='object')

In [27]:
from scipy.stats import mode
coupons = pd.pivot_table(item_coupon_trans, index ="coupon_id",
                         values=[ 'item_id', 'brand', 'brand_type', 'category',
       'coupon_discount', 'coupon_used', 'no_of_customers', 'other_discount',
       'quantity', 'selling_price', 'cd_sum', 't_counts', 'od_sum', 'qu_sum',
       'price_sum', 'total_discount_mean', 'total_discount_sum'],
              aggfunc={'item_id':lambda x: len(set(x)),
                       'brand':lambda x: mode(x)[0][0],
                       'brand_type':lambda x: mode(x)[0][0],
                       'category':lambda x: mode(x)[0][0],
                       'coupon_discount':np.mean,
                       'no_of_customers':np.mean,
                       'other_discount':np.mean,
                       'quantity':np.mean,
                       'selling_price':np.mean,
                      'coupon_used': np.sum,
                       'cd_sum': np.sum,
                       't_counts': np.sum,
                       'od_sum': np.sum,
                       'qu_sum': np.sum,
                       'price_sum': np.sum,
                       'total_discount_mean': np.mean,
                       'total_discount_sum': np.sum
                      })
coupons.reset_index(inplace=True)

In [28]:
coupons.head()

Unnamed: 0,coupon_id,brand,brand_type,category,cd_sum,coupon_discount,coupon_used,item_id,no_of_customers,od_sum,other_discount,price_sum,qu_sum,quantity,selling_price,t_counts,total_discount_mean,total_discount_sum
0,1,1475,0,9,-1095.31,-1.119234,25.0,39,14.794872,-18550.460833,-16.547957,88649.393333,1018.0,1.219646,100.980451,815.0,-17.667191,-19645.770833
1,2,2084,0,6,0.0,0.0,0.0,2,15.0,-1163.521667,-21.343885,8940.52,103.0,1.1375,122.5345,81.0,-21.343885,-1163.521667
2,3,278,0,6,-543.19,-2.667646,14.0,17,8.588235,-4055.343333,-14.790729,27621.49,247.0,1.121632,131.655894,211.0,-17.458375,-4598.533333
3,4,544,0,6,-881.59,-1.485152,44.0,24,22.333333,-25895.74,-36.718597,142874.023333,702.0,1.020872,211.708369,676.0,-38.203749,-26777.33
4,5,5357,0,11,0.0,0.0,0.0,7,6.0,-1228.88,-27.265786,16636.57,44.0,1.0,403.97,44.0,-27.265786,-1228.88


In [29]:
coupons.rename(columns={'item_id':'item_counts'}, inplace=True)

In [30]:
coupons.head()

Unnamed: 0,coupon_id,brand,brand_type,category,cd_sum,coupon_discount,coupon_used,item_counts,no_of_customers,od_sum,other_discount,price_sum,qu_sum,quantity,selling_price,t_counts,total_discount_mean,total_discount_sum
0,1,1475,0,9,-1095.31,-1.119234,25.0,39,14.794872,-18550.460833,-16.547957,88649.393333,1018.0,1.219646,100.980451,815.0,-17.667191,-19645.770833
1,2,2084,0,6,0.0,0.0,0.0,2,15.0,-1163.521667,-21.343885,8940.52,103.0,1.1375,122.5345,81.0,-21.343885,-1163.521667
2,3,278,0,6,-543.19,-2.667646,14.0,17,8.588235,-4055.343333,-14.790729,27621.49,247.0,1.121632,131.655894,211.0,-17.458375,-4598.533333
3,4,544,0,6,-881.59,-1.485152,44.0,24,22.333333,-25895.74,-36.718597,142874.023333,702.0,1.020872,211.708369,676.0,-38.203749,-26777.33
4,5,5357,0,11,0.0,0.0,0.0,7,6.0,-1228.88,-27.265786,16636.57,44.0,1.0,403.97,44.0,-27.265786,-1228.88


In [31]:
# Aggregate transactions by customer_id
transactions3 = pd.pivot_table(customer_transaction_data, index = "customer_id", 
               values=['item_id','quantity','selling_price', 'other_discount','coupon_discount','coupon_used','date_d','date_w','date_m', 'date_y'],
               aggfunc={'item_id':lambda x: len(set(x)),
                        'quantity':np.mean,
                        'selling_price':np.mean,
                        'other_discount':np.mean,
                        'coupon_discount':np.mean,
                        'coupon_used': np.sum,
                        'date_d':lambda x: mode(x)[0][0],
                        'date_w':lambda x: mode(x)[0][0],
                        'date_m':lambda x: mode(x)[0][0],
                        'date_y':lambda x: mode(x)[0][0]}
              )
transactions3.reset_index(inplace=True)
transactions3.rename(columns={'item_id': 'no_of_items'}, inplace=True)
transactions3.head()

Unnamed: 0,customer_id,coupon_discount,coupon_used,date_d,date_m,date_w,date_y,no_of_items,other_discount,quantity,selling_price
0,1,-1.955631,76,3,5,19,2012,463,-12.750051,1.170172,97.327216
1,2,-0.595084,4,13,6,17,2012,352,-13.432195,1.131265,107.805783
2,3,-3.091546,53,16,7,38,2012,406,-14.074853,11.578723,85.082452
3,4,-0.404773,1,14,5,15,2012,125,-8.883656,1.272727,138.25677
4,5,-0.114684,2,11,5,21,2012,490,-11.260696,117.869949,115.482842


In [32]:
customer_transaction_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,date_d,date_m,date_y,date_w,coupon_used
0,2012-01-02,1501,26830,1,45.95,-10.69,0.0,2,1,2012,1,0
1,2012-01-02,1501,54253,1,67.32,-13.89,0.0,2,1,2012,1,0
2,2012-01-02,1501,31962,1,120.75,-14.25,0.0,2,1,2012,1,0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0,2,1,2012,1,0
4,2012-01-02,1501,48199,1,99.38,-28.14,0.0,2,1,2012,1,0


In [33]:
transactions4 = pd.pivot_table(customer_transaction_data, index = "customer_id", 
               values=['item_id','quantity','selling_price', 'other_discount','coupon_discount'],
               aggfunc={'item_id':len,
                        'quantity':np.sum,
                        'selling_price':np.sum,
                        'other_discount':np.sum,
                        'coupon_discount':np.sum}
              )
transactions4.reset_index(inplace=True)
transactions4.rename(columns={'item_id': 'customer_id_count','quantity':'qa_sum','selling_price':'pprice_sum',
                             'other_discount':'odd_sum','coupon_discount':'cdd_sum'  }, inplace=True)
transactions4.head()

Unnamed: 0,customer_id,cdd_sum,customer_id_count,odd_sum,qa_sum,pprice_sum
0,1,-2045.59,1046,-13336.553833,1224,101804.268
1,2,-249.34,419,-5628.089833,474,45170.623167
2,3,-2179.54,705,-9922.771654,8163,59983.128347
3,4,-89.05,220,-1954.404333,280,30416.489333
4,5,-90.83,792,-8918.471477,93353,91462.41062


In [34]:
transactions = pd.merge(transactions3, transactions4, on='customer_id', how='left')
transactions.head()

Unnamed: 0,customer_id,coupon_discount,coupon_used,date_d,date_m,date_w,date_y,no_of_items,other_discount,quantity,selling_price,cdd_sum,customer_id_count,odd_sum,qa_sum,pprice_sum
0,1,-1.955631,76,3,5,19,2012,463,-12.750051,1.170172,97.327216,-2045.59,1046,-13336.553833,1224,101804.268
1,2,-0.595084,4,13,6,17,2012,352,-13.432195,1.131265,107.805783,-249.34,419,-5628.089833,474,45170.623167
2,3,-3.091546,53,16,7,38,2012,406,-14.074853,11.578723,85.082452,-2179.54,705,-9922.771654,8163,59983.128347
3,4,-0.404773,1,14,5,15,2012,125,-8.883656,1.272727,138.25677,-89.05,220,-1954.404333,280,30416.489333
4,5,-0.114684,2,11,5,21,2012,490,-11.260696,117.869949,115.482842,-90.83,792,-8918.471477,93353,91462.41062


In [35]:
def merge_all(df): 
    df=  pd.merge(df, coupons, on="coupon_id", how="left")
    df = pd.merge(df, campaign_data, on="campaign_id", how="left")
    df = pd.merge(df, customer_demographics, on="customer_id", how="left")
    df = pd.merge(df, transactions, on='customer_id', how='left')
    return df

In [36]:
train_new = merge_all(train)
train_new.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type_x,start_date_x,end_date_x,sdate_d_x,sdate_m_x,...,date_y,no_of_items,other_discount_y,quantity_y,selling_price_y,cdd_sum,customer_id_count,odd_sum,qa_sum,pprice_sum
0,1,13,27,1053,0,0,2013-05-19,2013-07-05,19,5,...,2013,208,-25.583099,340.487097,163.966826,-89.05,310,-7930.760842,105551,50829.715972
1,2,13,116,48,0,0,2013-05-19,2013-07-05,19,5,...,2012,244,-19.871924,31.619792,188.703939,-1237.79,384,-7630.818702,12142,72462.312434
2,6,9,635,205,0,1,2013-03-11,2013-04-12,11,3,...,2012,533,-12.86423,1.392157,112.055027,-2101.2,969,-12465.439143,1349,108581.321349
3,7,13,644,1050,0,0,2013-05-19,2013-07-05,19,5,...,2012,216,-12.880868,1.291139,100.896997,-178.1,237,-3052.765833,306,23912.588333
4,9,8,1017,1489,0,0,2013-02-16,2013-04-05,16,2,...,2012,327,-12.264174,247.44306,85.016352,-265.01,562,-6892.466021,139063,47779.189609


### Data merge complete

***Removing extra columns from our train table***

In [37]:
train_new.isnull().sum(axis=0)

id                   0
campaign_id          0
coupon_id            0
customer_id          0
redemption_status    0
                    ..
cdd_sum              0
customer_id_count    0
odd_sum              0
qa_sum               0
pprice_sum           0
Length: 78, dtype: int64

In [38]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [39]:
train_new.shape

(78369, 78)

In [40]:
def deal_na(df):
    for col in train_new.columns.tolist()[1:]:
        df[col].fillna(mode(df[col]).mode[0], inplace=True)
    return df

train_new = deal_na(train_new)

In [41]:
train_new.isnull().sum(axis=0)

id                     0
campaign_id            0
coupon_id              0
customer_id            0
redemption_status      0
campaign_type_x        0
start_date_x           0
end_date_x             0
sdate_d_x              0
sdate_m_x              0
sdate_y_x              0
sdate_w_x              0
edate_d_x              0
edate_m_x              0
edate_y_x              0
edate_w_x              0
coupon_size            0
brand_nunique          0
brand_mode             0
brand_type_nunique     0
brand_type_mode        0
category_nunique       0
category_mode          0
age_range_x            0
marital_status_x       0
rented_x               0
family_size_x          0
no_of_children_x       0
income_bracket_x       0
brand                  0
brand_type             0
category               0
cd_sum                 0
coupon_discount_x      0
coupon_used_x          0
item_counts            0
no_of_customers        0
od_sum                 0
other_discount_x       0
price_sum              0


In [42]:
train_new.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type_x,start_date_x,end_date_x,sdate_d_x,sdate_m_x,sdate_y_x,sdate_w_x,edate_d_x,edate_m_x,edate_y_x,edate_w_x,coupon_size,brand_nunique,brand_mode,brand_type_nunique,brand_type_mode,category_nunique,category_mode,age_range_x,marital_status_x,rented_x,family_size_x,no_of_children_x,income_bracket_x,brand,brand_type,category,cd_sum,coupon_discount_x,coupon_used_x,item_counts,no_of_customers,od_sum,other_discount_x,price_sum,qu_sum,quantity_x,selling_price_x,t_counts,total_discount_mean,total_discount_sum,campaign_type_y,start_date_y,end_date_y,sdate_d_y,sdate_m_y,sdate_y_y,sdate_w_y,edate_d_y,edate_m_y,edate_y_y,edate_w_y,age_range_y,marital_status_y,rented_y,family_size_y,no_of_children_y,income_bracket_y,coupon_discount_y,coupon_used_y,date_d,date_m,date_w,date_y,no_of_items,other_discount_y,quantity_y,selling_price_y,cdd_sum,customer_id_count,odd_sum,qa_sum,pprice_sum
0,1,13,27,1053,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,125,2,1105,1,0,1,6,3.0,-1.0,0.0,0.0,-1.0,5.0,1105,0,6,-5221.7,-1.23375,185.0,125,19.224,-176951.985167,-44.375097,435019.921738,4647.0,1.169862,117.287462,3701.0,-45.608846,-182173.685167,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,3.0,-1.0,0.0,0.0,-1.0,5.0,-0.287258,1,11,5,24,2013,208,-25.583099,340.487097,163.966826,-89.05,310,-7930.760842,105551,50829.715972
1,2,13,116,48,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,3,1,56,1,1,1,6,2.0,0.0,0.0,1.0,-1.0,3.0,56,1,6,0.0,0.0,0.0,3,20.333333,-586.14,-10.207037,5323.51,86.0,1.050926,70.885046,75.0,-10.207037,-586.14,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,2.0,0.0,0.0,1.0,-1.0,3.0,-3.223411,12,4,6,6,2012,244,-19.871924,31.619792,188.703939,-1237.79,384,-7630.818702,12142,72462.312434
2,6,9,635,205,0,1,2013-03-11,2013-04-12,11,3,2013,11,12,4,2013,15,67,1,560,1,0,1,11,3.0,0.0,0.0,1.0,-1.0,7.0,560,0,11,-605.54,-2.987321,15.0,67,1.761194,-12475.083333,-76.767366,48980.998333,202.0,1.353802,342.826195,142.0,-79.754687,-13080.623333,1,2013-03-11,2013-04-12,11,3,2013,11,12,4,2013,15,3.0,0.0,0.0,1.0,-1.0,7.0,-2.168421,84,27,3,22,2012,533,-12.86423,1.392157,112.055027,-2101.2,969,-12465.439143,1349,108581.321349
3,7,13,644,1050,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,4,1,611,1,0,1,6,3.0,0.0,0.0,1.0,-1.0,5.0,611,0,6,-17.81,-4.4525,1.0,4,3.5,-260.75,-4.451964,3697.783333,47.0,1.11039,88.694903,39.0,-8.904464,-278.56,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,3.0,0.0,0.0,1.0,-1.0,5.0,-0.751477,2,9,8,45,2012,216,-12.880868,1.291139,100.896997,-178.1,237,-3052.765833,306,23912.588333
4,9,8,1017,1489,0,0,2013-02-16,2013-04-05,16,2,2013,7,5,4,2013,14,32,1,1558,1,0,1,6,3.0,0.0,0.0,1.0,-1.0,3.0,1558,0,6,-427.44,-0.155422,19.0,32,27.3125,-46585.176667,-30.656337,288028.478333,1601.0,1.070599,201.020184,1414.0,-30.811759,-47012.616667,0,2013-02-16,2013-04-05,16,2,2013,7,5,4,2013,14,3.0,0.0,0.0,1.0,-1.0,3.0,-0.471548,10,2,5,43,2012,327,-12.264174,247.44306,85.016352,-265.01,562,-6892.466021,139063,47779.189609


As it can be seen, columns like start_date_x, campaign_type_x & start_date_y , campaign_type_y are same, i.e. they contain same values. Therefore, I rename the _columns(with x) and then later drop the columns (with y).

In [43]:
train_new.rename(columns={'start_date_x': 'start_date', 'campaign_type_x':'campaign_type', 'end_date_x':'end_date',
                         'sdate_d_x':'sdate_d','sdate_m_x':'sdate_m','sdate_y_x':'sdate_y','sdate_w_x':'sdate_w',
                         'edate_d_x':'edate_d', 'edate_m_x':'edate_m','edate_y_x':'edate_y', 'edate_w_x':'edate_w',
                         'age_range_x':'age_range','marital_status_x':'marital_status','rented_x':'rented',
                         'family_size_x':'family_size', 'no_of_children_x':'no_of_children','income_bracket_x':'income_bracket',
                         'coupon_discount_x':'coupon_discount','coupon_used_x':'coupon_used', 'other_discount_x':'other_discount',
                         'quantity_x':'quantity','selling_price_x':'selling_price',}, inplace=True)

In [44]:
train_new.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_date,end_date,sdate_d,sdate_m,sdate_y,sdate_w,edate_d,edate_m,edate_y,edate_w,coupon_size,brand_nunique,brand_mode,brand_type_nunique,brand_type_mode,category_nunique,category_mode,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category,cd_sum,coupon_discount,coupon_used,item_counts,no_of_customers,od_sum,other_discount,price_sum,qu_sum,quantity,selling_price,t_counts,total_discount_mean,total_discount_sum,campaign_type_y,start_date_y,end_date_y,sdate_d_y,sdate_m_y,sdate_y_y,sdate_w_y,edate_d_y,edate_m_y,edate_y_y,edate_w_y,age_range_y,marital_status_y,rented_y,family_size_y,no_of_children_y,income_bracket_y,coupon_discount_y,coupon_used_y,date_d,date_m,date_w,date_y,no_of_items,other_discount_y,quantity_y,selling_price_y,cdd_sum,customer_id_count,odd_sum,qa_sum,pprice_sum
0,1,13,27,1053,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,125,2,1105,1,0,1,6,3.0,-1.0,0.0,0.0,-1.0,5.0,1105,0,6,-5221.7,-1.23375,185.0,125,19.224,-176951.985167,-44.375097,435019.921738,4647.0,1.169862,117.287462,3701.0,-45.608846,-182173.685167,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,3.0,-1.0,0.0,0.0,-1.0,5.0,-0.287258,1,11,5,24,2013,208,-25.583099,340.487097,163.966826,-89.05,310,-7930.760842,105551,50829.715972
1,2,13,116,48,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,3,1,56,1,1,1,6,2.0,0.0,0.0,1.0,-1.0,3.0,56,1,6,0.0,0.0,0.0,3,20.333333,-586.14,-10.207037,5323.51,86.0,1.050926,70.885046,75.0,-10.207037,-586.14,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,2.0,0.0,0.0,1.0,-1.0,3.0,-3.223411,12,4,6,6,2012,244,-19.871924,31.619792,188.703939,-1237.79,384,-7630.818702,12142,72462.312434
2,6,9,635,205,0,1,2013-03-11,2013-04-12,11,3,2013,11,12,4,2013,15,67,1,560,1,0,1,11,3.0,0.0,0.0,1.0,-1.0,7.0,560,0,11,-605.54,-2.987321,15.0,67,1.761194,-12475.083333,-76.767366,48980.998333,202.0,1.353802,342.826195,142.0,-79.754687,-13080.623333,1,2013-03-11,2013-04-12,11,3,2013,11,12,4,2013,15,3.0,0.0,0.0,1.0,-1.0,7.0,-2.168421,84,27,3,22,2012,533,-12.86423,1.392157,112.055027,-2101.2,969,-12465.439143,1349,108581.321349
3,7,13,644,1050,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,4,1,611,1,0,1,6,3.0,0.0,0.0,1.0,-1.0,5.0,611,0,6,-17.81,-4.4525,1.0,4,3.5,-260.75,-4.451964,3697.783333,47.0,1.11039,88.694903,39.0,-8.904464,-278.56,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,3.0,0.0,0.0,1.0,-1.0,5.0,-0.751477,2,9,8,45,2012,216,-12.880868,1.291139,100.896997,-178.1,237,-3052.765833,306,23912.588333
4,9,8,1017,1489,0,0,2013-02-16,2013-04-05,16,2,2013,7,5,4,2013,14,32,1,1558,1,0,1,6,3.0,0.0,0.0,1.0,-1.0,3.0,1558,0,6,-427.44,-0.155422,19.0,32,27.3125,-46585.176667,-30.656337,288028.478333,1601.0,1.070599,201.020184,1414.0,-30.811759,-47012.616667,0,2013-02-16,2013-04-05,16,2,2013,7,5,4,2013,14,3.0,0.0,0.0,1.0,-1.0,3.0,-0.471548,10,2,5,43,2012,327,-12.264174,247.44306,85.016352,-265.01,562,-6892.466021,139063,47779.189609


In [45]:
train_fin = train_new.drop(['start_date_y', 'campaign_type_y','end_date_y', 'sdate_d_y', 'sdate_m_y', 'sdate_y_y','sdate_w_y','edate_d_y','edate_m_y','edate_y_y','edate_w_y','age_range_y','marital_status_y','rented_y',
                         'family_size_y', 'no_of_children_y','income_bracket_y','coupon_discount_y','coupon_used_y', 'other_discount_y',
                         'quantity_y','selling_price_y'],axis = 1)

I make a new table- 'train_fin' which does not contain the extra columns as in 'train_new'.

In [46]:
train_fin.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,start_date,end_date,sdate_d,sdate_m,sdate_y,sdate_w,edate_d,edate_m,edate_y,edate_w,coupon_size,brand_nunique,brand_mode,brand_type_nunique,brand_type_mode,category_nunique,category_mode,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category,cd_sum,coupon_discount,coupon_used,item_counts,no_of_customers,od_sum,other_discount,price_sum,qu_sum,quantity,selling_price,t_counts,total_discount_mean,total_discount_sum,date_d,date_m,date_w,date_y,no_of_items,cdd_sum,customer_id_count,odd_sum,qa_sum,pprice_sum
0,1,13,27,1053,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,125,2,1105,1,0,1,6,3.0,-1.0,0.0,0.0,-1.0,5.0,1105,0,6,-5221.7,-1.23375,185.0,125,19.224,-176951.985167,-44.375097,435019.921738,4647.0,1.169862,117.287462,3701.0,-45.608846,-182173.685167,11,5,24,2013,208,-89.05,310,-7930.760842,105551,50829.715972
1,2,13,116,48,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,3,1,56,1,1,1,6,2.0,0.0,0.0,1.0,-1.0,3.0,56,1,6,0.0,0.0,0.0,3,20.333333,-586.14,-10.207037,5323.51,86.0,1.050926,70.885046,75.0,-10.207037,-586.14,4,6,6,2012,244,-1237.79,384,-7630.818702,12142,72462.312434
2,6,9,635,205,0,1,2013-03-11,2013-04-12,11,3,2013,11,12,4,2013,15,67,1,560,1,0,1,11,3.0,0.0,0.0,1.0,-1.0,7.0,560,0,11,-605.54,-2.987321,15.0,67,1.761194,-12475.083333,-76.767366,48980.998333,202.0,1.353802,342.826195,142.0,-79.754687,-13080.623333,27,3,22,2012,533,-2101.2,969,-12465.439143,1349,108581.321349
3,7,13,644,1050,0,0,2013-05-19,2013-07-05,19,5,2013,20,5,7,2013,27,4,1,611,1,0,1,6,3.0,0.0,0.0,1.0,-1.0,5.0,611,0,6,-17.81,-4.4525,1.0,4,3.5,-260.75,-4.451964,3697.783333,47.0,1.11039,88.694903,39.0,-8.904464,-278.56,9,8,45,2012,216,-178.1,237,-3052.765833,306,23912.588333
4,9,8,1017,1489,0,0,2013-02-16,2013-04-05,16,2,2013,7,5,4,2013,14,32,1,1558,1,0,1,6,3.0,0.0,0.0,1.0,-1.0,3.0,1558,0,6,-427.44,-0.155422,19.0,32,27.3125,-46585.176667,-30.656337,288028.478333,1601.0,1.070599,201.020184,1414.0,-30.811759,-47012.616667,2,5,43,2012,327,-265.01,562,-6892.466021,139063,47779.189609


***Removing the na values***

In [47]:
train_fin.isnull().sum(axis=0)

id                     0
campaign_id            0
coupon_id              0
customer_id            0
redemption_status      0
campaign_type          0
start_date             0
end_date               0
sdate_d                0
sdate_m                0
sdate_y                0
sdate_w                0
edate_d                0
edate_m                0
edate_y                0
edate_w                0
coupon_size            0
brand_nunique          0
brand_mode             0
brand_type_nunique     0
brand_type_mode        0
category_nunique       0
category_mode          0
age_range              0
marital_status         0
rented                 0
family_size            0
no_of_children         0
income_bracket         0
brand                  0
brand_type             0
category               0
cd_sum                 0
coupon_discount        0
coupon_used            0
item_counts            0
no_of_customers        0
od_sum                 0
other_discount         0
price_sum              0


In [48]:
train_fin.shape

(78369, 56)

As it is seen, na values are present in the Customer Demographics table's columns. I remove these values.

In [49]:
def deal_na(df):
    for col in customer_demographics.columns.tolist()[1:]:
        df[col].fillna(mode(df[col]).mode[0], inplace=True)
    return df

train_fin = deal_na(train_fin)

In [50]:
train_fin.columns

Index(['id', 'campaign_id', 'coupon_id', 'customer_id', 'redemption_status', 'campaign_type', 'start_date', 'end_date', 'sdate_d', 'sdate_m', 'sdate_y', 'sdate_w', 'edate_d', 'edate_m', 'edate_y', 'edate_w', 'coupon_size', 'brand_nunique', 'brand_mode', 'brand_type_nunique', 'brand_type_mode', 'category_nunique', 'category_mode', 'age_range', 'marital_status', 'rented', 'family_size', 'no_of_children', 'income_bracket', 'brand', 'brand_type', 'category', 'cd_sum', 'coupon_discount', 'coupon_used', 'item_counts', 'no_of_customers', 'od_sum', 'other_discount', 'price_sum', 'qu_sum', 'quantity', 'selling_price', 't_counts', 'total_discount_mean', 'total_discount_sum', 'date_d', 'date_m', 'date_w', 'date_y', 'no_of_items', 'cdd_sum', 'customer_id_count', 'odd_sum', 'qa_sum', 'pprice_sum'], dtype='object')

***Now the dataset is fit for further processing***

Target column => redemption_status

And I drop the columns start_date, end_date because these contain the dates not in the Pandas format

In [51]:
target = train_fin['redemption_status']
train_fin.drop(['id','campaign_id','start_date','end_date', 'redemption_status'], axis=1, inplace=True)

In [52]:
train_fin.columns

Index(['coupon_id', 'customer_id', 'campaign_type', 'sdate_d', 'sdate_m', 'sdate_y', 'sdate_w', 'edate_d', 'edate_m', 'edate_y', 'edate_w', 'coupon_size', 'brand_nunique', 'brand_mode', 'brand_type_nunique', 'brand_type_mode', 'category_nunique', 'category_mode', 'age_range', 'marital_status', 'rented', 'family_size', 'no_of_children', 'income_bracket', 'brand', 'brand_type', 'category', 'cd_sum', 'coupon_discount', 'coupon_used', 'item_counts', 'no_of_customers', 'od_sum', 'other_discount', 'price_sum', 'qu_sum', 'quantity', 'selling_price', 't_counts', 'total_discount_mean', 'total_discount_sum', 'date_d', 'date_m', 'date_w', 'date_y', 'no_of_items', 'cdd_sum', 'customer_id_count', 'odd_sum', 'qa_sum', 'pprice_sum'], dtype='object')

I override the categorical features in the dataset by cat_feat.

In [53]:
cat_feat = ['customer_id','coupon_id', 'campaign_type', 'date_d', 'date_w', 'date_m','brand', 'brand_type',
       'category','rented' , 'age_range', 'marital_status']

## LightGBM model

In [54]:
import lightgbm as lgb
train_data = lgb.Dataset(data=train_fin, label=target, free_raw_data=False)
evals_result = {}
def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        params = {'application':'binary',#'num_iterations': 1500,
                  'learning_rate':0.05,
                  'metric':'auc'} # ,'boost_from_average':False}
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=5, seed=11,
                           stratified=True,categorical_feature=cat_feat,
                            verbose_eval =None)
        return max(cv_result['auc-mean'])

### Parameter Fine-Tuning-
***To deal with over-fitting:***
- Use small max_bin
- Use small num_leaves
- Use min_data_in_leaf and min_sum_hessian_in_leaf
- Use bagging by set bagging_fraction and bagging_freq
- Use feature sub-sampling by set feature_fraction
- Use bigger training data
- Try lambda_l1, lambda_l2 and min_gain_to_split to regularization
- Try max_depth to avoid growing deep tree

In [55]:
pbs = {'num_leaves': (60, 130),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.8, 1),
        'max_depth': (7, 16),
        'lambda_l1': (0, 2),
        'lambda_l2': (0, 3),
        'min_split_gain': (0.001, 0.1),
        'min_child_weight': (1, 10)
      }

In [56]:
from bayes_opt import BayesianOptimization
optimizer = BayesianOptimization(lgb_eval, pbs, random_state=109)
optimizer.maximize(init_points=5, n_iter=10)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------


New categorical_feature is ['age_range', 'brand', 'brand_type', 'campaign_type', 'category', 'coupon_id', 'customer_id', 'date_d', 'date_m', 'date_w', 'marital_status', 'rented']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


| [0m 1       [0m | [0m 0.9391  [0m | [0m 0.9223  [0m | [0m 0.4931  [0m | [0m 1.399   [0m | [0m 0.5837  [0m | [0m 13.97   [0m | [0m 7.371   [0m | [0m 0.03714 [0m | [0m 128.0   [0m |
| [95m 2       [0m | [95m 0.9394  [0m | [95m 0.8186  [0m | [95m 0.431   [0m | [95m 0.1221  [0m | [95m 0.8978  [0m | [95m 15.96   [0m | [95m 9.292   [0m | [95m 0.0458  [0m | [95m 124.3   [0m |
| [95m 3       [0m | [95m 0.9404  [0m | [95m 0.8583  [0m | [95m 0.5483  [0m | [95m 0.9183  [0m | [95m 0.05462 [0m | [95m 15.54   [0m | [95m 3.797   [0m | [95m 0.08432 [0m | [95m 61.57   [0m |
| [0m 4       [0m | [0m 0.9389  [0m | [0m 0.9162  [0m | [0m 0.5833  [0m | [0m 1.696   [0m | [0m 0.9446  [0m | [0m 15.49   [0m | [0m 7.668   [0m | [0m 0.09032 [0m | [0m 120.7   [0m |
| [0m 5       [0m | [0m 0.9395  [0m | [0m 0.9802  [0m | [0m 0.6788  [0m | [0m 1.054   [0m | [0m 2.295   [0m | [0m 8.989   [0m | [0m 7.737   [0m | [0m 0.077

In [57]:
optimizer.max

{'target': 0.9438845964819034,
 'params': {'bagging_fraction': 0.8713551779328061,
  'feature_fraction': 0.7591279725513654,
  'lambda_l1': 0.13140495522080675,
  'lambda_l2': 0.4171599003374772,
  'max_depth': 12.0071693353583,
  'min_child_weight': 1.9146943918851527,
  'min_split_gain': 0.019509528103029156,
  'num_leaves': 70.6474910338319}}

In [58]:
p =optimizer.max['params']

In [59]:
param = {'num_leaves': int(round(p['num_leaves'])),
         'feature_fraction': p['feature_fraction'],
         'bagging_fraction': p['bagging_fraction'],
         'max_depth': int(round(p['max_depth'])),
         'lambda_l1': p['lambda_l1'],
         'lambda_l2':p['lambda_l2'],
         'min_split_gain': p['min_split_gain'],
         'min_child_weight': p['min_child_weight'],
         'learing_rate':0.01,
         'objective': 'binary',
         'boosting_type': 'gbdt',
         'verbose': 1,
         'metric': {'auc'},
         'is_unbalance': True,
         'boost_from_average': False}

In [60]:
param

{'num_leaves': 71,
 'feature_fraction': 0.7591279725513654,
 'bagging_fraction': 0.8713551779328061,
 'max_depth': 12,
 'lambda_l1': 0.13140495522080675,
 'lambda_l2': 0.4171599003374772,
 'min_split_gain': 0.019509528103029156,
 'min_child_weight': 1.9146943918851527,
 'learing_rate': 0.01,
 'objective': 'binary',
 'boosting_type': 'gbdt',
 'verbose': 1,
 'metric': {'auc'},
 'is_unbalance': True,
 'boost_from_average': False}

In [61]:
from sklearn.model_selection import train_test_split,KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [62]:
train_fin.columns


Index(['coupon_id', 'customer_id', 'campaign_type', 'sdate_d', 'sdate_m', 'sdate_y', 'sdate_w', 'edate_d', 'edate_m', 'edate_y', 'edate_w', 'coupon_size', 'brand_nunique', 'brand_mode', 'brand_type_nunique', 'brand_type_mode', 'category_nunique', 'category_mode', 'age_range', 'marital_status', 'rented', 'family_size', 'no_of_children', 'income_bracket', 'brand', 'brand_type', 'category', 'cd_sum', 'coupon_discount', 'coupon_used', 'item_counts', 'no_of_customers', 'od_sum', 'other_discount', 'price_sum', 'qu_sum', 'quantity', 'selling_price', 't_counts', 'total_discount_mean', 'total_discount_sum', 'date_d', 'date_m', 'date_w', 'date_y', 'no_of_items', 'cdd_sum', 'customer_id_count', 'odd_sum', 'qa_sum', 'pprice_sum'], dtype='object')

In [63]:
%%time
nfold = 10

skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=27)

oof = np.zeros(len(train_fin))
predictions = np.zeros(len(train_fin))

i = 1
for train_index, valid_index in skf.split(target, target.values):
    print("fold {}".format(i))
    xg_train = lgb.Dataset(train_fin.iloc[train_index],
                           label=target[train_index],
                           free_raw_data = False
                           )
    xg_valid = lgb.Dataset(train_fin.iloc[valid_index],
                           label=target[valid_index],
                           free_raw_data = False
                           )   
    
    clf = lgb.train(param, xg_train, 5000, valid_sets = [xg_valid],
                    categorical_feature=cat_feat,
                    verbose_eval=100, early_stopping_rounds = 100)
    
    oof[valid_index] = clf.predict(train_fin.iloc[valid_index], num_iteration=clf.best_iteration) 
    
    predictions += clf.predict(train_fin, num_iteration=clf.best_iteration) / nfold
    i = i + 1

print("\n\nCV AUC: {:<0.4f}".format(roc_auc_score(target, (oof))))

fold 1


New categorical_feature is ['age_range', 'brand', 'brand_type', 'campaign_type', 'category', 'coupon_id', 'customer_id', 'date_d', 'date_m', 'date_w', 'marital_status', 'rented']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.931913
Early stopping, best iteration is:
[3]	valid_0's auc: 0.934708
fold 2
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.923308
Early stopping, best iteration is:
[77]	valid_0's auc: 0.927848
fold 3
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.944669
Early stopping, best iteration is:
[70]	valid_0's auc: 0.947903
fold 4
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.904115
Early stopping, best iteration is:
[85]	valid_0's auc: 0.907315
fold 5
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.940955
Early stopping, best iteration is:
[75]	valid_0's auc: 0.941961
fold 6
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.939409
Early stopping, best iteration is:
[30]	valid_0's auc: 0.948726
fold 7
Training until validati

In [64]:
predictions

array([0.04875744, 0.03997937, 0.04629876, ..., 0.14176322, 0.03998466,
       0.0404323 ])