# Objective
    1- EDA (Exploratory Data Analysis)
    2- Feature Selection
    3- Modeling and Evaluation
    4- Conclusion

## 

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import os
import matplotlib.pyplot as plt
%matplotlib inline



# 1. EDA (Exploratory Data Analysis)

In [3]:
# load event dataset.
events_df=pd.read_csv('/kaggle/input/ecommerce-dataset/events.csv')
events_df.head(10)

The timestamp portion is in Unix format 

Visitor Id is the unique user currently browsing the website

Event is what the user is currently doing in that current timestamp

Transaction ID will only have value if the user made a purchase as shown below, Else it will be Null

In [4]:
# view the number of data
print("This data contains ",events_df.shape[0], "cases of user activities in e-commerce website")

In [5]:
events_df.info()

In [6]:
events_df.describe()

In [7]:
# count missing values per column
# (isnull().sum() gives one count per column; value_counts() on the boolean
# frame would instead count distinct row-wise True/False patterns)
events_df.isnull().sum()

### Except in 'transactionid', there are no missing values.

In [8]:
# view counts by event type
events_df.event.value_counts()

In [9]:
# see if there is any missing transaction id when event is 'transaction'
events_df.transactionid[events_df.event=='transaction'].isnull().value_counts()

### All transaction events have transaction ID.

In [10]:
# drop rows that are exact duplicates (every column identical) and renumber the index
rows_before = events_df.shape[0]
print('Number of rows before removing duplicates: ', rows_before)
msk = events_df.duplicated()
events_df = events_df.loc[~msk].reset_index(drop=True)
print('Number of rows after removing duplicates: ', events_df.shape[0])

### As the dates in the current dataset are Unix timestamps (in milliseconds), I'll convert them into readable dates.
## Ex ('1433221999827') = '2015-06-02 05:13:19' GMT

In [11]:
# Worked example: convert one millisecond Unix timestamp to a GMT string.
# An explicit UTC timezone is passed so the result does not depend on the
# local timezone of the machine running the notebook.
tz = float('1433221999827')
new_time = datetime.datetime.fromtimestamp(tz/1000, tz=datetime.timezone.utc)
new_time.strftime('%Y-%m-%d %H:%M:%S')

In [12]:
# convert unix timestamp (milliseconds) to readable dates (GMT)
# The conversion is vectorized and pinned to UTC, so it does not depend on the
# local timezone of the machine. The column stays a string formatted as
# 'YYYY-MM-DD HH:MM:SS.mmm' (microseconds trimmed to milliseconds).
events_df['date_time'] = (
    pd.to_datetime(events_df['timestamp'], unit='ms', utc=True)
      .dt.strftime('%Y-%m-%d %H:%M:%S.%f')
      .str[:-3]
)
events_df.head()


In [13]:
print('Start Date of Dataset: ', events_df['date_time'].min())
print('End Date of Dataset: ', events_df['date_time'].max())

### This dataset is recorded from 3rd May, 2015 to 18th September, 2015 (GMT).

In [14]:
# distribution by event
# plot
totalcases=events_df.shape[0]
event_counts=events_df['event'].value_counts()       # indexed by event label
event_labels=events_df['event'].unique().tolist()    # order of first appearance

fig, (ax1,ax2) = plt.subplots(1,2, figsize=(20,8))
ax1.hist(events_df['event'], bins=3)
ax1.set_xlabel('Type of Events')
ax1.set_ylabel('Number of Events (Unit: million)')
ax1.set_title('Distribution by Event Type')
# Look each count up by its own label: unique() is ordered by first appearance
# while value_counts() is ordered by frequency, so pairing them positionally
# could attach a percentage to the wrong bar.
for x in event_labels:
    counts=int(event_counts[x])
    text=str(round((counts/totalcases)*100,2)) +'%'
    ax1.text(x, counts, text, fontsize=12)

# unique number of visitors by event
for event in event_labels:
    visitors=events_df.loc[events_df['event']==event, 'visitorid']
    height=len(visitors.unique())
    ax2.bar(x=event, height=height)

    # share of unique visitors among all events of this type
    text=height/len(visitors)
    text=str(round(text*100,2)) + '%'
    ax2.text(event, height, text, fontsize=12)
ax2.set_title('Unique Number of Visitors')
plt.show()

### When analyzing the distribution of events, 'View' accounts for 96.67%, 'Add to cart' for 2.52%, and 'Transaction' for 0.81%.
### The number of unique visitors was around half of the total number of events for all three event types. For example, there are about 2.6 million views in total, but the number of unique visitors behind those views is about half of that, which means that on average one visitor performed about 2 actions.

## Q1. How many times did a customer view an item before making purchase decision?

In [15]:
# split the event log into one (visitor id, item id, date time) frame per event type
_event_cols = ['visitorid','itemid','date_time']
# 'transaction' events
item_tra = events_df.loc[events_df['event']=='transaction', _event_cols]
# 'add to cart' events
item_atc = events_df.loc[events_df['event']=='addtocart', _event_cols]
# 'view' events
item_viw = events_df.loc[events_df['event']=='view', _event_cols]

In [16]:
events_df.head()

In [17]:
# keep only (visitor, item) pairs that appear in all three event types;
# each event's timestamp ends up in its own 'date_time (...)' column
_pair_keys = ['visitorid','itemid']
m = (
    item_tra
    .merge(item_atc, how='inner', on=_pair_keys, suffixes=[' (transaction)', ' (add_to_cart)'])
    .merge(item_viw, how='inner', on=_pair_keys)
    .rename(columns={'date_time':'date_time (view)'})
)
m.head()

### 'date_time (transaction)'

In [18]:
# convert datatype of the date columns
m['date_time (transaction)']=pd.to_datetime(m['date_time (transaction)'])
m['date_time (add_to_cart)']=pd.to_datetime(m['date_time (add_to_cart)'])
m['date_time (view)']=pd.to_datetime(m['date_time (view)'])

In [19]:
# the following dataframe is about the cases that a visitor viewed an item before making purchase decision

# find rows with time difference is larger than 0 minute
msk=(m['date_time (transaction)']-m['date_time (view)'])>np.timedelta64(0,'m')
m1=m[msk]

### In the dataframe above, multiple views are mixed in for one transaction, which means there are cases that a visitor checked the item multiple times. I'll check both timelines: one is from the first item view to transaction, and other is from the last item view to transaction.

In [20]:
# separate multiple-view transactions from single-view transactions

# A (visitorid, itemid) pair that occurs on more than one row of m1 is treated
# as "multiple view"; a pair that occurs exactly once is "single view".
# NOTE(review): m1 comes from merging transaction, add-to-cart and view rows,
# so a pair with several transactions or add-to-cart events presumably also
# produces repeated rows here — confirm that repeats always mean repeated views.
mul_viw=m1.duplicated(subset=['visitorid','itemid'], keep=False)
sig_viw=m1[~mul_viw]
mul_viw=m1[mul_viw]

# The last row of each pair is taken to be the view made at purchase time,
# so only the rows before that last occurrence are counted below.
# NOTE(review): "last" means last in the current row order of mul_viw, which
# is not sorted by view time in this cell — confirm the order is chronological.
notlast=mul_viw.duplicated(subset=['visitorid','itemid'], keep='last')  # True for every row except the last of each pair

# count the views per (visitor, item) pair
# and average that count over all pairs
avg_viw=mul_viw[notlast].groupby(['visitorid','itemid']).count()['date_time (view)'].mean()
print('Average Number of Views Before Purchase: {0:.0f}'.format(avg_viw))

In [21]:
# view basic statistics to see outlier
n_view_df = mul_viw[notlast].groupby(['visitorid','itemid']).count()['date_time (view)']
n_view_df.describe()

### There are some outliers, such as 844 views before buying a single item, so I will check the distribution of view counts to get a better idea of general purchase cases. I will then calculate the mode to get better insight.

In [22]:
n_view_df.mode()

### We can see that the mode is 1, which means that most people view an item only once before buying it.

In [23]:
# count the number of view using groupby function
n_viw=mul_viw[notlast].groupby(['visitorid','itemid']).count()['date_time (view)']
n_viw=pd.DataFrame(n_viw)
n_viw.head(15)

### The first row in the dataframe above indicates that Visitor(id=172) had viewed the item(id=10034) four times before resolving to purchase. The second row is about another item the same visitor purchased and this time, the visitor viewed that item five times.  

In [24]:
# Build a frequency table from the previous cell: for each "number of views"
# value, how many (visitor, item) pairs had that many views.
# NOTE(review): presumably relies on value_counts() returning a Series named
# 'count' (pandas >= 2.0) so that columns=['count'] selects it — confirm on the
# pandas version in use.
n_viw2=pd.DataFrame(n_viw.value_counts(), columns=['count'])
n_viw2=n_viw2.reset_index()  

# add a row for instant purchases (single-view transactions),
# where the view at the time of transaction is counted as '0' views.
# Note that the label is the string '0', while the other view counts are numbers.
n_viw2.loc[len(n_viw2)]=['0',sig_viw.shape[0]]

# sort by frequency and add each row's share of the total in percent
n_viw2=n_viw2.sort_values('count', ascending=False)
n_viw2['(%)']=((n_viw2['count']/(n_viw2['count'].sum()))*100).round(2)
n_viw2=n_viw2.reset_index(drop=True)
n_viw2

In [25]:
# keep the five most frequent rows and fold every remaining row into 'others'
n_viw3=n_viw2[0:5].copy()
# sum() totals every column of the remaining rows, including the view-count
# column; that first entry is replaced by the label 'others' on the next line
others=n_viw2[5::].sum().tolist()
others[0]='others'
n_viw3.loc[len(n_viw3)]=others 

# set new index
# NOTE(review): these labels are hard-coded and assume the five most frequent
# rows are, in order, instant purchase, 1, 2, 3 and 5 views — confirm against
# n_viw2 whenever the data or the sampling changes.
n_viw3['index']=['instant purchase', 'view 1', 'view 2','view 3', 'view 5', 'others']
n_viw3=n_viw3.set_index(['index'])

In [26]:
# generate a pie plot of share by number of view before transaciton
n_viw3.plot(y='(%)', kind='pie', autopct='%1.1f%%',  shadow=True, startangle=-90, legend=False, figsize=(8,8), fontsize=20)
plt.title('The Number of Item Views Before Purchase Decision', fontsize=20, pad=20)
plt.ylabel('')
plt.show()

### When analysing the item view numbers, I found that 50.4% of transactions were made without the item being viewed more than once: a visitor checked an item, added it to the cart and checked out. 
### About 30% of transactions were made after a buyer had viewed an item once or twice.
### In summary, 80% of total transactions were made after fewer than three item views. 

## Q2. How long does it take for a single transaction to be made?

In [27]:
def _split_timedelta(delta):
    """Break a Timedelta into whole (days, hours, minutes, seconds)."""
    hours, remainder = divmod(delta.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return delta.days, hours, minutes, seconds

# single view
# calculate the average time period a visitor takes to purchase
diff_s=sig_viw['date_time (transaction)']-sig_viw['date_time (view)']
avg_time=(diff_s).mean()
# report the days component as well: Timedelta.seconds only holds the part of
# the duration below one day, so an average above 24h would otherwise be
# silently truncated
totaldays, hrs, mins, sec = _split_timedelta(avg_time)
print('Time Period From Single View to Transaction: %s days %s hours %s mininutes %s seconds \n' % (totaldays,hrs,mins,sec))

# multiple view
# calculate the average time period a visitor takes to purchase

# sort values so that first/last occurrence means earliest/latest view
mul_viw=mul_viw.sort_values(['visitorid','itemid','date_time (view)'])
mul_viw=mul_viw.reset_index(drop=True)

# initial view to transaction
notinitial=mul_viw.duplicated(subset=['visitorid','itemid'], keep='first') # True for every row except the first of each pair
mul_viw1=mul_viw[~notinitial] # only first occurrence

# get the time difference
diff_m1=mul_viw1['date_time (transaction)']-mul_viw1['date_time (view)']
avg_time1=(diff_m1).mean()
totaldays1, hrs1, mins1, sec1 = _split_timedelta(avg_time1)
print('Time Period From Multiple View (initial) to Transaction: %s days %s hours %s mininutes %s seconds' % (totaldays1,hrs1,mins1,sec1))

# last view to transaction
notlast=mul_viw.duplicated(subset=['visitorid','itemid'], keep='last')  # True for every row except the last of each pair
mul_viw2=mul_viw[~notlast] # only last occurrence

# get the time difference
diff_m2=mul_viw2['date_time (transaction)']-mul_viw2['date_time (view)']
avg_time2=(diff_m2).mean()
totaldays2, hrs2, mins2, sec2 = _split_timedelta(avg_time2)
print('Time Period From Multiple View (last) to Transaction: %s days %s hours %s mininutes %s seconds' % (totaldays2,hrs2,mins2,sec2))

In [28]:
def quan_list(percentile):
    """Return one quantile of the view-to-purchase time for each transaction type.

    Reads the notebook-level timedelta Series diff_s, diff_m1 and diff_m2.

    Parameters
    ----------
    percentile : float
        Quantile to evaluate, between 0 and 1 (e.g. 0.25).

    Returns
    -------
    list of int
        Whole seconds for [single view, multiple views (initial view),
        multiple views (last view)], in that order.
    """
    # total_seconds() covers the full duration; Timedelta.seconds would drop
    # whole days and under-report any quantile longer than 24 hours
    return [int(diff.quantile(percentile).total_seconds())
            for diff in (diff_s, diff_m1, diff_m2)]

In [29]:
def time_cal(column):
    """Format durations given in whole seconds as 'XhYmZs' strings.

    Parameters
    ----------
    column : pandas.Series or array-like of numbers
        Durations in seconds. The index labels, if any, are ignored.

    Returns
    -------
    list of str
        One formatted string per input value, in input order.
    """
    # work on the raw values so the result does not depend on the index labels
    # (integer-key lookup such as hr[i] is label-based on a Series)
    total_seconds = np.asarray(column)
    hr, remainder = divmod(total_seconds, 3600)
    mins, sec = divmod(remainder, 60)
    return ['%sh%sm%ss' % parts for parts in zip(hr, mins, sec)]

In [30]:
# quartiles (in seconds) of the view-to-purchase time for each transaction type
q1=quan_list(.25)
q2=quan_list(.50)
q3=quan_list(.75)

# rows: transaction type, columns: percentile
data={'Transaction Type':['Single View','Multiple Views(initial)','Multiple Views(last)'],
     '25th percentile': q1,
     '50th percentile': q2,
     '75th percentile': q3}
data=pd.DataFrame(data)
data=data.set_index('Transaction Type')
t_data=data.transpose()

# 'XhYmZs' text labels: one list per transaction type, one entry per percentile
q1time=time_cal(t_data['Single View'])
q2time=time_cal(t_data['Multiple Views(initial)'])
q3time=time_cal(t_data['Multiple Views(last)'])

f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(10,8))

# plot the same data on both axes
data.plot(ax=ax1, kind='bar')
data.plot(ax=ax2, kind='bar')

# zoom-in / limit the view to different portions of the data
ax1.set_ylim(30000, 35000)  # outliers only
ax2.set_ylim(0, 2000)  # most of the data

# hide the spines between ax and ax2
ax1.spines['top'].set_visible(False)
ax2.spines['bottom'].set_visible(False)
ax1.xaxis.tick_top()
# must be the boolean False: the string 'off' is truthy and would show the labels
ax1.tick_params(labeltop=False)  # don't put tick labels at the top
ax2.xaxis.tick_bottom()
ax2.set_xticklabels(ax2.get_xticklabels(), rotation = 45, ha="right", fontsize=15)
ax2.legend('')

# Draw the short diagonal "cut" marks of a broken axis. In axes coordinates,
# which always run from 0 to 1, the spine endpoints are at (0,0), (0,1),
# (1,0) and (1,1), so the diagonals are placed at those corners using the
# axes transform with clipping disabled.

d = .015  # how big to make the diagonal lines in axes coordinates
# arguments to pass to plot, just so we don't keep repeating them
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d, +d), (-d, +d), **kwargs)        # top-left diagonal
ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # top-right diagonal

kwargs.update(transform=ax2.transAxes)  # switch to the bottom axes
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # bottom-left diagonal
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # bottom-right diagonal

# annotate each bar with its formatted duration; the x values are hand-placed
# offsets around the bar group of each transaction type
x=[-0.4,-0.1,0]
for x,y,qt in zip(x,t_data['Single View'],q1time):
    plt.text(x, y+500, qt, fontsize=12)
x=[0.6,0.9,1]
for x,y,qt in zip(x,t_data['Multiple Views(initial)'],q2time):
    if x<1:
        plt.text(x, y+200, qt, fontsize=12)
    else:
        # the last bar of this group is placed at a fixed height instead of y
        plt.text(x, 4000, qt, fontsize=12)
x=[1.6,1.9,2]
for x,y,qt in zip(x,t_data['Multiple Views(last)'],q3time):
    plt.text(x, y+500, qt, fontsize=12 )
ax1.set_ylabel('time period (second)')
ax2.set_ylabel('time period (second)')
ax1.set_title('Time taken to make purchase decision', pad=50, fontsize=20)
plt.show()

### In the case of single-view transactions, buyers generally take around 3 to 11 minutes to reach the end of the buying journey (view to transaction). On the other hand, multiple-view transactions showed a more dispersed time range, from about 9 minutes to 9.5 hours, with 30 minutes as the median. However, when multiple-view buyers checked the product for the last time before purchase, they took a similarly short amount of time to single-view buyers.

## Q3. Is there any relationship between Item Category and Transaction?

In [31]:
# load item-related datasets 
cate=pd.read_csv('/kaggle/input/ecommerce-dataset/category_tree.csv')
item1=pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part1.csv')
item2=pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part2.csv')

In [32]:
# view category tree dataset
cate.head()

In [33]:
len(cate.categoryid.unique()) 

In [34]:
len(cate.parentid.unique()) 

### 'Category ID' is a subset or child of 'Parent Id'.

In [35]:
cate.info()

In [36]:
# view item dataset 1
item1.head()

In [37]:
item1.property.value_counts()

### Only 'categoryid' and 'available' are given in plain text in the 'property' column, while the rest are hashed for confidentiality purposes. I will use only 'categoryid' and hence keep only the rows with that property.

In [38]:
item1=item1[item1.property=='categoryid']  
item1.reset_index(drop=True, inplace=True)
item1.shape

In [39]:
# do the same with item2
item2=item2[item2.property=='categoryid']
item2.reset_index(drop=True, inplace=True)
item2.shape

In [40]:
# drop column, timestamp and property
item1=item1.drop(columns=['timestamp','property'])
item2=item2.drop(columns=['timestamp','property'])
item1.head()

In [41]:
# stack two item dataframes
item=pd.concat([item1, item2], ignore_index=True)
item.shape

In [42]:
# reset index
item.reset_index(drop=True, inplace=True)

# rename column name 'value' to 'categoryid'
item.rename(columns={'value':'categoryid'}, inplace=True)
item.head()

In [43]:
# check duplitcates 
item.duplicated().value_counts()

In [44]:
# drop duplicates
msk=item.duplicated()
item=item[~msk] 

In [45]:
# count the number of category id by item id
(item.groupby('itemid').count().categoryid>1).value_counts()

In [46]:
# per item id, count how many category ids are recorded for it
# and expose the result as a flat dataframe
num_cate = (
    item.groupby('itemid')
        .count()
        .reset_index()
        .rename(columns={'categoryid':'num_categoryid'})
)
num_cate.head()

In [47]:
cate.info()

In [48]:
item.info()

In [49]:
item.categoryid=item.categoryid.astype(int)

In [50]:
# merge item dataframe with category tree to link item id with parent id
item=item.merge(cate, how='inner', on='categoryid')
item.head()

In [51]:
# create a dataframe with 'itemid' and 'parent id'
ip=item[['itemid','parentid']]

# check duplicates
ip.duplicated().value_counts()

In [52]:
# remove duplicates
msk=ip.duplicated()
ip=ip[~msk]

# view the number of parent ids by item id
ip.groupby('itemid').count()

In [53]:
# number of parent id
# create a dataframe
num_pare=ip.groupby('itemid').count()
num_pare.reset_index(inplace=True)
num_pare.rename(columns={'parentid':'num_parentid'}, inplace=True)

In [54]:
num_pare

In [55]:
# call dataframe that includes 'transaction' information. 
item_tra.shape

In [56]:
item_viw.reset_index(drop=True, inplace=True)

In [57]:
item_viw

In [58]:
msk=item_viw.duplicated(['visitorid','itemid'])
item_viw=item_viw[~msk]

In [59]:
# visitor id and item id that didn't lead to transaction
# Anti-join: keep the viewed (visitor, item) pairs that never appear among the
# transactions. Stacking both frames and dropping every duplicated pair would
# also keep a transaction row whose pair has no view row, and that row would
# then be labelled as "no transaction".
_pair_keys = ['visitorid','itemid']
_purchased_pairs = item_tra[_pair_keys].drop_duplicates()
nta = item_viw.merge(_purchased_pairs, how='left', on=_pair_keys, indicator=True)
nta = nta[nta['_merge']=='left_only'].drop(columns='_merge')
nta.reset_index(drop=True, inplace=True)
nta

In [60]:
# randomly select 25000 observations from the nta (no-transaction) dataframe above
# - replace=False so that no row is drawn twice (np.random.choice samples with
#   replacement by default, which would duplicate observations)
# - a seeded generator so the sample is reproducible on re-run
rng = np.random.default_rng(42)
index = rng.choice(nta.index.to_numpy(), size=min(25000, len(nta)), replace=False)
nta_r=nta.loc[index].reset_index(drop=True)
nta_r.head()

### Only 25,000 samples are chosen from the non-transaction records (an under-sampling technique, to avoid dealing with unbalanced data).

In [61]:
# add a new column
nta_r['purchase']=[0]*nta_r.shape[0]
nta_r.head()

In [62]:
nta_r.shape

In [63]:
# add a new column
item_tra['purchase']=[1]*item_tra.shape[0]
item_tra.head()

In [64]:
item_tra.reset_index(drop=True, inplace=True)

In [65]:
item_tra.shape

In [66]:
# concatenate transaction data and non-transaction data (the cases that a visitor after all didn't buy the item viewed during this data collection period)
data=pd.concat([nta_r,item_tra], ignore_index=True).sort_values('date_time').reset_index(drop=True)
data.head()

In [67]:
data.shape

In [68]:
data.purchase.value_counts()

In [69]:
# merge with category id, parent id dataframe
data=data.merge(num_cate, how='inner', on='itemid')
data=data.merge(num_pare, how='inner', on='itemid')

In [70]:
# number of purchases per "number of category ids";
# select 'purchase' first so the id and date columns are not summed as well
data.groupby('num_categoryid')['purchase'].sum()

In [71]:
# number of purchases per "number of parent ids";
# select 'purchase' first so the id and date columns are not summed as well
data.groupby('num_parentid')['purchase'].sum()

### I had assumed that the more Category IDs or Parent IDs an item has, the more transactions it might have, as the item could be exposed on multiple category pages. However, when I checked the distribution of the data, there were too few items with multiple categories for this to be used as a feature in machine learning.

In [72]:
# pick items under one category and view each sales number.
cate1=data[data.num_categoryid==1]
cate1=cate1.merge(item[['itemid','categoryid']], how='inner', on=['itemid'])
# select 'purchase' before summing so the other columns are not aggregated
cate1_sales=cate1.groupby('categoryid')['purchase'].sum()

# plot the sales number along the category id. 
pd.DataFrame(cate1_sales).plot()
plt.ylabel('purchase')
plt.show()

In [73]:
# pick items under two categories and view each sales number.
cate1=data[data.num_categoryid==2]
cate1=cate1.merge(item[['itemid','categoryid']], how='inner', on=['itemid'])
# select 'purchase' before summing so the other columns are not aggregated
cate1_sales=cate1.groupby('categoryid')['purchase'].sum()

# plot the sales number along the category id. 
pd.DataFrame(cate1_sales).plot()
plt.ylabel('purchase')
plt.show()

### There is no general trend in the number of sales along category id.

In [74]:
# pick items under one parent id and view each sales number.
pare1=data[data.num_parentid==1]
# join against the de-duplicated (itemid, parentid) pairs in `ip`: `item` can
# hold the same pair on several rows (one per category id), which would
# repeat rows here and inflate the purchase sums
pare1=pare1.merge(ip, how='inner', on=['itemid'])
# select 'purchase' before summing so the other columns are not aggregated
pare1_sales=pare1.groupby('parentid')['purchase'].sum()

# plot the sales number along the parent id.
pd.DataFrame(pare1_sales).plot()
plt.ylabel('purchase')
plt.show()

### There is no general trend in the number of sales along parent id.

In [75]:
# pick items under two parent ids and view each sales number.
pare1=data[data.num_parentid==2]
# join against the de-duplicated (itemid, parentid) pairs in `ip`: `item` can
# hold the same pair on several rows (one per category id), which would
# repeat rows here and inflate the purchase sums
pare1=pare1.merge(ip, how='inner', on=['itemid'])
# select 'purchase' before summing so the other columns are not aggregated
pare1_sales=pare1.groupby('parentid')['purchase'].sum()

# plot the sales number along the parent id.
pd.DataFrame(pare1_sales).plot()
plt.ylabel('purchase')
plt.show()

In [76]:
# the item distribution by cateogory id
item.categoryid.hist()
plt.show()

In [77]:
# the item distribution by parent id
item.parentid.hist()
plt.show()

### There was no clear relationship between category and transaction (sales). 

## Q4. Is there any relationship between View Time and Transaction?

In [78]:
# sort data by date time an in ascending order. 
data=data.sort_values('date_time').reset_index(drop=True)
data.head()

In [79]:
# convert data type of the column, date_time to use date_time functions.
data.date_time=pd.to_datetime(data.date_time)
data.info()

In [80]:
# extract days of week of each date and add them into a new column.
data['dayofweek']=data.date_time.dt.dayofweek

In [81]:
# count the number of date by day of week.
data.groupby('dayofweek').count()

In [82]:
# view how many purchase there are by day of week.
data[data.purchase==1].groupby('dayofweek').count()

In [83]:
# compare the number of purchase with that of non-purchase by day of week.
# size() gives one row count per weekday, so each group is drawn as a single
# line; count() returns one column per dataframe column, which draws several
# overlapping lines and makes the two legend entries label the wrong ones.
no_purchase_by_day=data[data.purchase==0].groupby('dayofweek').size()
purchase_by_day=data[data.purchase==1].groupby('dayofweek').size()
plt.plot(no_purchase_by_day, color='red', label='No purchase')
plt.plot(purchase_by_day, color='blue', label='Purchase')
plt.legend(labelcolor=['red','blue'])
plt.xlabel('Day of week (0 = Monday)')
plt.ylabel('Number of records')
plt.show()

In [84]:
# extract hour and week number and add them to a new column, respectively
data['hour']=data.date_time.dt.hour
data['week']=data.date_time.dt.isocalendar().week 

In [85]:
# convert data type
item_viw['date_time']=pd.to_datetime(item_viw['date_time'])
item_viw.info()

In [86]:
# convert data type
item_tra['date_time']=pd.to_datetime(item_tra['date_time'])
item_tra.info()

In [87]:
# count each visitor's previous view of each item
# Vectorized replacement for scanning item_viw once per row of `data`:
# join every row of `data` to the views of the same (visitorid, itemid) pair
# and count those that happened strictly earlier than the row's date_time.
# NOTE(review): an earlier cell reduces item_viw to one row per
# (visitorid, itemid) pair, so this count can presumably never exceed 1 —
# confirm that is the intended feature.
_pair_keys = ['visitorid','itemid']
_rows = data[_pair_keys + ['date_time']].copy()
_rows['row_pos'] = np.arange(len(_rows))       # position of each row in `data`
_joined = _rows.merge(item_viw[_pair_keys + ['date_time']], how='left',
                      on=_pair_keys, suffixes=('', '_view'))
# rows without any matching view carry NaT, which compares as False
_is_earlier = _joined['date_time_view'] < _joined['date_time']
# the result keeps the name `list` because the following cell reads it
list = _is_earlier.groupby(_joined['row_pos']).sum().astype(int).tolist()
list[0:10]

In [88]:
# add a new column, 'previous view'
data['previous_view']=pd.DataFrame({'view_count':list})
data.head()

In [89]:
data.previous_view.value_counts()

In [90]:
# count each visitor's total number of transaction in the past
# Vectorized replacement for scanning item_tra once per row of `data`:
# join every row of `data` to all transactions of the same visitor and count
# those that happened strictly earlier than the row's date_time.
_rows = data[['visitorid','date_time']].copy()
_rows['row_pos'] = np.arange(len(_rows))       # position of each row in `data`
_joined = _rows.merge(item_tra[['visitorid','date_time']], how='left',
                      on='visitorid', suffixes=('', '_tra'))
# visitors without any transaction carry NaT, which compares as False
_is_earlier = _joined['date_time_tra'] < _joined['date_time']
list2 = _is_earlier.groupby(_joined['row_pos']).sum().astype(int).tolist()
list2[0:10]

In [91]:
# add a new column, 'previous transaction'
data['previous_transaction']=pd.DataFrame({'previous_transaction':list2})

In [92]:
data.previous_transaction.describe()

In [93]:
# export the csv file
data.to_csv('data_for_modeling.csv', index=False)

In [94]:
data.head(10)

In [95]:
# pairwise correlation of the numeric columns only
# (`data` also holds the datetime column 'date_time', which has no meaningful
# correlation coefficient)
data.select_dtypes(include='number').corr()

In [96]:
data.shape

In [97]:
plt.plot(data[['dayofweek','purchase']].groupby('dayofweek').sum())
plt.xticks(np.arange(7),['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], rotation=45)
plt.ylabel('Purchase')
plt.show()

In [98]:
# number of purchases by hour of day
plt.plot(data[['hour','purchase']].groupby('hour').sum())
# the hour is on the x axis; labelling it with ylabel would be overwritten
# by the 'Purchase' label set on the next line
plt.xlabel('Hour')
plt.ylabel('Purchase')
plt.show()

In [99]:
plt.plot(data[['week','purchase']].groupby('week').sum())
plt.xlabel('Week')
plt.ylabel('Purchase')
plt.show()

In [100]:
plt.plot(data[['previous_view','purchase']].groupby('previous_view').sum())
plt.xlabel('Number of previous item view (current)')
plt.ylabel('Purchase')
plt.show()

In [101]:
plt.plot(data[['previous_transaction','purchase']].groupby('previous_transaction').sum())
plt.xlim(0,5)
plt.xlabel("Number of a visitor's previous transaction (total)")
plt.ylabel('Purchase')
plt.show()

### Chosen features: dayofweek, hour, previous view, previous transaction 

### Recommendation system

### we can offer them a list of what previous visitors bought together with the item they are currently viewing

In [102]:
# Firstly let's create an array that lists visitors who made a purchase
purchases = events_df[events_df.transactionid.notnull()]
customer_purchased = purchases.visitorid.unique()

# Create another list that contains all their purchases: one list of item ids
# per customer, in the same order as customer_purchased. A single grouped pass
# over the purchase rows replaces filtering the whole event log per customer.
# sort=False keeps the groups in order of first appearance, matching unique().
purchased_items = [items.tolist()
                   for _, items in purchases.groupby('visitorid', sort=False)['itemid']]

### So now all items purchased together are presented as a list of lists, shown below are the first 5 samples

In [103]:
purchased_items[:5]

In [104]:
# Write a function that would show items that were bought together (same of different dates) by the same customer
def recommender(item_id, purchased_items):
    """Return the items bought by customers who also bought `item_id`.

    Parameters
    ----------
    item_id : hashable
        Item the visitor is currently viewing.
    purchased_items : iterable of list
        One list of purchased item ids per customer.

    Returns
    -------
    set
        Every item found in a purchase list that contains `item_id`,
        excluding `item_id` itself. Empty if no list contains it.
    """
    co_purchased = set()
    for basket in purchased_items:
        # only baskets that include the viewed item contribute
        if item_id in basket:
            co_purchased.update(basket)

    # the viewed item itself is not a recommendation
    co_purchased.discard(item_id)
    return co_purchased

### So now we can present to the visitor a list of the other items a customer previously bought along with what item the current visitor is viewing e.g. item number 302422

In [105]:
recommender(302422, purchased_items)

## 2. Machine Learning Classification Modeling

In [106]:
from sklearn import preprocessing 

In [107]:
# convert features to numpy array
X=data[['dayofweek','hour','previous_view','previous_transaction']].values
X[0:5]

In [108]:
modeling_data = data[['dayofweek','hour','previous_view','previous_transaction']]

In [109]:
modeling_data.head()

In [110]:
# convert a target values to numpy array
y=data['purchase'].values
y[0:5]

## Step 01. Normalize data

In [111]:
# Standardize each feature column with StandardScaler
# (the scaler is fitted on X and then applied to X cast to float).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in the next step, so test rows contribute to the scaling statistics —
# consider fitting on the training split only.
X=preprocessing.StandardScaler().fit(X).transform(X.astype(float))
X[0:5]

## Step 02. Train/Test Split

In [112]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
print('Train Set: ', X_train.shape, y_train.shape)
print('Test Set: ', X_test.shape, y_test.shape)

## Step 03-2. Decision Tree Algorithm

In [113]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [118]:
from sklearn import metrics

In [114]:
param_grid = { 'criterion':['gini','entropy'],'max_depth': np.arange(2, 14, 2)}
dtree_model=DecisionTreeClassifier()
dtree_gscv = GridSearchCV(dtree_model, param_grid)
dtree_gscv.fit(X_train, y_train)

In [115]:
dtree_gscv.best_params_

In [122]:

# Fit the final decision tree and predict on the held-out test set.
# NOTE(review): the hyper-parameters are hard-coded; presumably they were
# copied from dtree_gscv.best_params_ — confirm they still match.
PurchaseTree=DecisionTreeClassifier(criterion='gini', max_depth=2)
PurchaseTree.fit(X_train, y_train)
yhat=PurchaseTree.predict(X_test)
# compare the first predictions with the true labels
# (the predictions are stored in yhat; predTree is not defined in this notebook)
print(yhat[0:5])
print(y_test[0:5])

In [123]:
# evaluation

print("Decision Tree's Accuracy: {0:.2f}%".format(metrics.accuracy_score(y_test, yhat)*100))

In [124]:
f1_score(y_test, yhat, average='weighted')

In [125]:
# evaluation

print("Decision Tree's Accuracy: {0:.2f}%".format(metrics.accuracy_score(y_test, yhat)*100))

In [126]:
# evaluation

# visualize 'Confusion Matrix'
import itertools

def plot_confusion_matrix (cm, classes, 
                         normalize=False, 
                         title='Confusion matrix',
                         cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        : 2-D array of counts (rows: true labels, columns: predictions).
    classes   : tick labels, used for both axes.
    normalize : if True, each row is divided by its row sum before printing
                and plotting.
    title     : plot title.
    cmap      : colormap passed to imshow.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # write each cell's value on top of it; white text on the darker cells
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_color = 'white' if value > half_max else 'black'
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment='center',
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# compute confusion matrix
cnf_matrix=metrics.confusion_matrix(y_test, yhat, labels=[1,0])
np.set_printoptions(precision=2)

# plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Purchase=1','No Purchase=0'], normalize=False, title='Confusion matrix')


# classification report
print('\nClassification Report\n',metrics.classification_report(y_test, yhat))


# jaccard score
print('Jaccard Score: ', metrics.jaccard_score(y_test,yhat))




## Step 03-3. Logistic Regression Algorithm

In [127]:
from sklearn.linear_model import LogisticRegression

# 'c' parameter = inverse of regularization (the smaller, the stronger regularization)
LR=LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
yhat=LR.predict(X_test)
yhat_proba=LR.predict_proba(X_test)
yhat_proba[0:10]      

In [128]:
# evaluation

# visualize 'Confusion Matrix'
import itertools

def plot_confusion_matrix (cm, classes, 
                         normalize=False, 
                         title='Confusion matrix',
                         cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        : 2-D array of counts (rows: true labels, columns: predictions).
    classes   : tick labels, used for both axes.
    normalize : if True, each row is divided by its row sum before printing
                and plotting.
    title     : plot title.
    cmap      : colormap passed to imshow.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # write each cell's value on top of it; white text on the darker cells
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_color = 'white' if value > half_max else 'black'
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment='center',
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# compute confusion matrix
cnf_matrix=metrics.confusion_matrix(y_test, yhat, labels=[1,0])
np.set_printoptions(precision=2)

# plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Purchase=1','No Purchase=0'], normalize=False, title='Confusion matrix')


# classification report
print('\nClassification Report\n',metrics.classification_report(y_test, yhat))


# jaccard score
print('Jaccard Score: ', metrics.jaccard_score(y_test,yhat))

# logloss
print('\nLogloss: ',metrics.log_loss(y_test, yhat_proba)) 
# more ideal classifiers have progressively smaller values of log loss.


In [129]:
# Random forest classifier (fixed random_state for reproducibility)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(X_train, y_train)
# predict with the forest fitted above, not with the earlier decision tree,
# so that the evaluation in the next cell scores this model
yhat=clf.predict(X_test)

In [130]:
# evaluation

# visualize 'Confusion Matrix'
import itertools

def plot_confusion_matrix (cm, classes, 
                         normalize=False, 
                         title='Confusion matrix',
                         cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        : 2-D array of counts (rows: true labels, columns: predictions).
    classes   : tick labels, used for both axes.
    normalize : if True, each row is divided by its row sum before printing
                and plotting.
    title     : plot title.
    cmap      : colormap passed to imshow.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # write each cell's value on top of it; white text on the darker cells
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_color = 'white' if value > half_max else 'black'
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment='center',
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# compute confusion matrix
cnf_matrix=metrics.confusion_matrix(y_test, yhat, labels=[1,0])
np.set_printoptions(precision=2)

# plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Purchase=1','No Purchase=0'], normalize=False, title='Confusion matrix')


# classification report
print('\nClassification Report\n',metrics.classification_report(y_test, yhat))


# jaccard score
print('Jaccard Score: ', metrics.jaccard_score(y_test,yhat))




# In conclusion, we can increase revenue by 
    1- Increasing the probability of a purchase decision by giving offers to those whose predicted purchase probability is near 50% 
    2- Recommending similar products 
    3- Making offers on the days which have fewer purchases
