In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv('../input/market-basket-analysis/Assignment-1_Data.csv', sep=';')
df.head()

Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01.12.2010 08:26,255,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01.12.2010 08:26,339,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01.12.2010 08:26,275,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01.12.2010 08:26,339,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01.12.2010 08:26,339,17850.0,United Kingdom


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522064 entries, 0 to 522063
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   BillNo      522064 non-null  object 
 1   Itemname    520609 non-null  object 
 2   Quantity    522064 non-null  int64  
 3   Date        522064 non-null  object 
 4   Price       522064 non-null  object 
 5   CustomerID  388023 non-null  float64
 6   Country     522064 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 27.9+ MB


In [4]:
if df.isna().sum().sum() > 0:
    df = df.dropna()


df['Price'] = df['Price'].str.replace(',', '.').astype('float64')
df['CustomerID'] = df['CustomerID'].astype('int')
df['Date'] = pd.to_datetime(df['Date'])
df['Itemname'] = df['Itemname'].str.strip()
df['Total_Price'] = df.Quantity * df.Price

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 388023 entries, 0 to 522063
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   BillNo       388023 non-null  object        
 1   Itemname     388023 non-null  object        
 2   Quantity     388023 non-null  int64         
 3   Date         388023 non-null  datetime64[ns]
 4   Price        388023 non-null  float64       
 5   CustomerID   388023 non-null  int64         
 6   Country      388023 non-null  object        
 7   Total_Price  388023 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(3)
memory usage: 26.6+ MB


In [7]:
country = input(" Write the country of the customer: ")
ID = int(input(" Write the customer's ID number: "))

def hot_encode(x):
    if(x<= 0):
        return 0
    if(x>= 1):
        return 1
    
    
def apriori_model(country = country, ID = ID):
    data = df[df['Country'] == country]
    today_date = max(data["Date"])
    #RFM
    rfm = data.groupby('CustomerID').agg({'Date': lambda Date: (today_date - Date.max()).days,
                                     'CustomerID': lambda CustomerID: CustomerID.count(),
                                     'Total_Price': lambda Total_Price: Total_Price.sum()})
    rfm.columns = ["recency", "frequency", "monetary"]
    scaler = StandardScaler().fit(rfm)
    rfm_scale = scaler.transform(rfm)
    #Kmeans
    kmeans = KMeans(n_clusters = 4, n_init=25, max_iter=300)
    k_means = kmeans.fit(rfm_scale)
    segment = k_means.labels_
    rfm['segment'] = segment

    rfm = rfm.reset_index().rename(columns={'index': 'CustomerID'})
    new_df = data.merge(rfm, right_on = 'CustomerID', left_on = 'CustomerID')

    #Apriori

    number_of_cluster = list(rfm[rfm['CustomerID'] == ID]['segment'])[0]

    apriori_df = new_df[new_df['segment'] == number_of_cluster ]
    basket = (apriori_df.groupby(['BillNo', 'Itemname'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('BillNo'))
    # Encoding the datasets
    basket_encoded = basket.applymap(hot_encode)
    basket = basket_encoded

    frq_items = apriori(basket, min_support = 0.03, use_colnames = True)
    rules = association_rules(frq_items, metric ="lift", min_threshold = 0.8)
    rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
    return rules


rules = apriori_model(country=country, ID=ID)
rules.head()

 Write the country of the customer:  France
 Write the customer's ID number:  12680


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
61,(CHILDS BREAKFAST SET DOLLY GIRL),(CHILDS BREAKFAST SET SPACEBOY),0.035971,0.043165,0.035971,1.0,23.166667,0.034419,inf
41,(CARD DOLLY GIRL),(SPACEBOY BIRTHDAY CARD),0.043165,0.057554,0.043165,1.0,17.375,0.040681,inf
280,"(POSTAGE, CARD DOLLY GIRL)",(SPACEBOY BIRTHDAY CARD),0.043165,0.057554,0.043165,1.0,17.375,0.040681,inf
283,(CARD DOLLY GIRL),"(SPACEBOY BIRTHDAY CARD, POSTAGE)",0.043165,0.057554,0.043165,1.0,17.375,0.040681,inf
256,"(ALARM CLOCK BAKELIKE PINK, ALARM CLOCK BAKELI...",(ALARM CLOCK BAKELIKE RED),0.035971,0.064748,0.035971,1.0,15.444444,0.033642,inf
