In [1]:
# Calling the necessary libraries
import pandas as pd
pd.set_option('display.max_columns', None)
from mlxtend.frequent_patterns import apriori, association_rules

### 1. Business Problem
Armut, Turkey's leading online service platform, brings together service providers and service seekers. The aim of the study is to determine customer trends by using association rule analysis on a data set containing the services and categories preferred by customers and to suggest new services that may attract the attention of similar users.

### 2. Dataset Story
The dataset comprises information about the services customers have acquired and the respective service categories. Additionally, it includes the date and time details for each service received.

### 3. Features
+ UserId - Distinct customer identifier
+ ServiceId - Anonymized services associated with each category. For instance, within the cleaning category, a service could be upholstery cleaning. ServiceId may appear across different categories, signifying diverse services under distinct categories or a service with CategoryId = 7 and ServiceId = 4 could be radiator cleaning, whereas a service with CategoryId = 2 and ServiceId = 4 might be furniture assembly
+ CategoryId - Anonymized categories. For instance; cleaning, moving, renovation
+ CreateDate - The date on which the service was purchased

### 4.Databehandling

In [2]:
df_ = pd.read_csv("armut_data.csv")
df = df_.copy()

In [3]:
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate
0,25446,4,5,2017-08-06 16:11:00
1,22948,48,5,2017-08-06 16:12:00
2,10618,0,8,2017-08-06 16:13:00
3,7256,9,4,2017-08-06 16:14:00
4,25446,48,5,2017-08-06 16:16:00


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162523 entries, 0 to 162522
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   UserId      162523 non-null  int64 
 1   ServiceId   162523 non-null  int64 
 2   CategoryId  162523 non-null  int64 
 3   CreateDate  162523 non-null  object
dtypes: int64(3), object(1)
memory usage: 5.0+ MB


In [5]:
df.shape

(162523, 4)

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UserId,162523.0,13089.803862,7325.81606,0.0,6953.0,13139.0,19396.0,25744.0
ServiceId,162523.0,21.64114,13.774405,0.0,13.0,18.0,32.0,49.0
CategoryId,162523.0,4.325917,3.129292,0.0,1.0,4.0,6.0,11.0


#### 4.1 Data Preprocessing

In [7]:
# ServiceID represents a different service for each CategoryID. 
# Creating a new variable to represent services by combining ServiceID and CategoryID with “_”.

df["Service"] = [str(row[1]) + "_" + str(row[2]) for row in df.values ]
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service
0,25446,4,5,2017-08-06 16:11:00,4_5
1,22948,48,5,2017-08-06 16:12:00,48_5
2,10618,0,8,2017-08-06 16:13:00,0_8
3,7256,9,4,2017-08-06 16:14:00,9_4
4,25446,48,5,2017-08-06 16:16:00,48_5


In [8]:
#The data set consists of the date and time when the services were received, there is no basket definition (invoice etc.). 
#In order to apply Association Rule Learning, a basket (invoice etc.) definition must be created. 
#Here, the basket definition is the monthly services received by each customer. 
#For example; 9_4, 46_4 services received by the customer with id 7256 in the 8th month of 2017 
#represent one basket; 9_4, 38_4 services received in the 10th month of 2017 represent another basket. 
#Therefore, baskets should be defined with a unique ID. 

# create a date variable containing only year and month
df["CreateDate"] = pd.to_datetime(df["CreateDate"])
df["New_Date"] = df["CreateDate"].dt.strftime("%Y-%m")

# merging userids with the newly created variable
df["SepetID"] = [str(row[0]) + "_" + str(row[5]) for row in df.values]

df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,New_Date,SepetID
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08,25446_2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08,22948_2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,2017-08,10618_2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08,7256_2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,2017-08,25446_2017-08


In [9]:
# Create invoices pivot table
invoice_product_df = df.groupby(["SepetID", "Service"])["Service"].count().unstack().fillna(0).applymap(
    lambda x:1 if x > 0 else 0)
invoice_product_df.head()

Service,0_8,10_9,11_11,12_7,13_11,14_7,15_1,16_8,17_5,18_4,19_6,1_4,20_5,21_5,22_0,23_10,24_10,25_0,26_7,27_7,28_4,29_0,2_0,30_2,31_6,32_4,33_4,34_6,35_11,36_1,37_0,38_4,39_10,3_5,40_8,41_3,42_1,43_2,44_0,45_6,46_4,47_7,48_5,49_1,4_5,5_11,6_7,7_3,8_5,9_4
SepetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
0_2017-08,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
0_2017-09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
0_2018-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
0_2018-04,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
10000_2017-08,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [10]:
# Find frequent item sets using Apriori algorithm with a minimum support of 0.01
frequent_itemsets = apriori(invoice_product_df,
                            min_support=0.01,
                            use_colnames=True)



In [11]:
#Establishment of the association rule
rules = association_rules(frequent_itemsets,
                          metric="support",
                          min_threshold=0.01)

In [12]:
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(2_0),(13_11),0.130286,0.056627,0.012819,0.098394,1.737574,0.005442,1.046325,0.488074
1,(13_11),(2_0),0.056627,0.130286,0.012819,0.226382,1.737574,0.005442,1.124216,0.449965
2,(2_0),(15_1),0.130286,0.120963,0.033951,0.260588,2.154278,0.018191,1.188833,0.616073
3,(15_1),(2_0),0.120963,0.130286,0.033951,0.280673,2.154278,0.018191,1.209066,0.609539
4,(33_4),(15_1),0.02731,0.120963,0.011233,0.411311,3.400299,0.007929,1.493211,0.725728


In [13]:
def arl_recommender(rules_df, product_id, rec_count=1):
    # Sorting the rules from largest to smallest according to lift
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    # create a blank list of products to be recommended
    recommendation_list = [] 
    #Iterate over the antecedents (X) in the sorted rules
    for i, product in sorted_rules["antecedents"].items():
        for j in list(product): 
            if j == product_id:
                recommendation_list.append(list(sorted_rules.iloc[i]["consequents"]))
               
   # Use the unique property of the dictionary structure to avoid duplication in the recommendation list
    recommendation_list = list({item for item_list in recommendation_list for item in item_list})
    return recommendation_list[:rec_count]

In [14]:
arl_recommender(rules, "2_0", 3)

['15_1', '13_11', '25_0']