<a href="https://colab.research.google.com/github/Harshininew/first/blob/main/Market_Basket_Analysis_with_Apriori.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)

df = pd.read_excel("/content/Assignment-2_Data.xlsx")

  and should_run_async(code)


In [5]:
# We are trying to understand the data.

def check_df(dataframe, head=5):
    print("################### Shape ####################")
    print(dataframe.shape)
    print("#################### Info #####################")
    print(dataframe.info())
    print("################### Nunique ###################")
    print(dataframe.nunique())
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("################## Quantiles #################")
    print(dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
    print("#################### Head ####################")
    print(dataframe.head(head))

check_df(df)

  and should_run_async(code)


################### Shape ####################
(522064, 7)
#################### Info #####################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522064 entries, 0 to 522063
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   BillNo      522064 non-null  object        
 1   Itemname    520609 non-null  object        
 2   Quantity    522064 non-null  int64         
 3   Date        522064 non-null  datetime64[ns]
 4   Price       522064 non-null  float64       
 5   CustomerID  388023 non-null  float64       
 6   Country     522064 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 27.9+ MB
None
################### Nunique ###################
BillNo        21663
Itemname       4185
Quantity        690
Date          19641
Price          1285
CustomerID     4297
Country          30
dtype: int64
##################### NA #####################
BillNo 

In [8]:
# Data Preparation

# We set a small threshold value to account for the presence of outliers in the data.
def outlier_thresholds(dataframe, variable):
    quartile1 = dataframe[variable].quantile(0.01)
    quartile3 = dataframe[variable].quantile(0.99)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

# We are writing a function to equalize the outlier values in the data to threshold values.
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

# We are removing the negative and outlier values from the quantity and price variables.
def retail_data_prep(dataframe):
    dataframe = dataframe[dataframe["Quantity"] > 0]
    dataframe = dataframe[dataframe["Price"] > 0]
    replace_with_thresholds(dataframe, "Quantity")
    replace_with_thresholds(dataframe, "Price")
    return dataframe

df = retail_data_prep(df)

df.describe().T

  and should_run_async(code)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,519551.0,9.39742,21.281261,1.0,1.0,3.0,10.0,248.5
Price,519551.0,3.32647,3.87738,0.001,1.25,2.08,4.13,41.94
CustomerID,387985.0,15317.042994,1721.813298,12346.0,13950.0,15265.0,16837.0,18287.0


In [9]:
# We are selecting only the data for France in order to narrow down the data since it is large.
df_fr = df[df['Country'] == "France"]

# We are creating a table based on the sum of Quantity for the breakdown of BillNo and Itemname.
df_fr.groupby(['BillNo', 'Itemname']).agg({"Quantity": "sum"}).unstack().fillna(0).iloc[0:5, 0:5]

  and should_run_async(code)


Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity,Quantity
Itemname,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 EGG HOUSE PAINTED WOOD,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND
BillNo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
536370,0.0,0.0,0.0,0.0,0.0
536852,0.0,0.0,0.0,0.0,0.0
536974,0.0,0.0,0.0,0.0,0.0
537065,0.0,0.0,0.0,0.0,0.0
537463,0.0,0.0,0.0,0.0,0.0


In [10]:
# We are converting the table to a completely Boolean type.
fr_inv_pro_df=df_fr.groupby(['BillNo', 'Itemname']). \
                agg({"Quantity": "sum"}). \
                unstack(). \
                fillna(0). \
                applymap(lambda x: 1 if x > 0 else 0)


# We are using the Apriori method to find the support values of the products.
frequent_itemsets = apriori(fr_inv_pro_df.astype("bool"),
                            min_support=0.01,
                            use_colnames=True)


frequent_itemsets.sort_values("support", ascending=False).head()

  and should_run_async(code)


Unnamed: 0,support,itemsets
330,0.765306,"((Quantity, POSTAGE))"
332,0.188776,"((Quantity, RABBIT NIGHT LIGHT))"
371,0.181122,"((Quantity, RED TOADSTOOL LED NIGHT LIGHT))"
320,0.170918,"((Quantity, PLASTERS IN TIN WOODLAND ANIMALS))"
315,0.168367,"((Quantity, PLASTERS IN TIN CIRCUS PARADE))"


In [11]:
# With this method, we can obtain the support, confidence, and lift values of the products,
# with the support values that we input.
rules = association_rules(frequent_itemsets,
                          metric="support",
                          min_threshold=0.01)


# By setting a threshold value for the metrics obtained, we can see the product associations in Apriori algorithm.
rules[(rules["support"]>0.05) & (rules["confidence"]>0.1) & (rules["lift"]>5)]. \
sort_values("confidence", ascending=False).head(10)


# Thus, we can see the products that are closely related to each other.

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
67536,"((Quantity, SET/20 RED RETROSPOT PAPER NAPKINS...","((Quantity, SET/6 RED SPOTTY PAPER CUPS))",0.102041,0.137755,0.09949,0.975,7.077778,0.085433,34.489796,0.956294
67534,"((Quantity, SET/6 RED SPOTTY PAPER CUPS), (Qua...","((Quantity, SET/6 RED SPOTTY PAPER PLATES))",0.102041,0.127551,0.09949,0.975,7.644,0.086474,34.897959,0.967949
210981,"((Quantity, SET/6 RED SPOTTY PAPER CUPS), (Qua...","((Quantity, SET/6 RED SPOTTY PAPER PLATES))",0.084184,0.127551,0.081633,0.969697,7.602424,0.070895,28.790816,0.948294
210983,"((Quantity, SET/20 RED RETROSPOT PAPER NAPKINS...","((Quantity, SET/6 RED SPOTTY PAPER CUPS))",0.084184,0.137755,0.081633,0.969697,7.039282,0.070036,28.454082,0.936804
8963,"((Quantity, SET/6 RED SPOTTY PAPER PLATES))","((Quantity, SET/6 RED SPOTTY PAPER CUPS))",0.127551,0.137755,0.122449,0.96,6.968889,0.104878,21.556122,0.981725
65220,"((Quantity, POSTAGE), (Quantity, SET/6 RED SPO...","((Quantity, SET/6 RED SPOTTY PAPER CUPS))",0.107143,0.137755,0.102041,0.952381,6.91358,0.087281,18.107143,0.958
2368,"((Quantity, CHILDRENS CUTLERY SPACEBOY))","((Quantity, CHILDRENS CUTLERY DOLLY GIRL))",0.068878,0.071429,0.063776,0.925926,12.962963,0.058856,12.535714,0.991123
74385,"((Quantity, POSTAGE), (Quantity, ALARM CLOCK B...","((Quantity, ALARM CLOCK BAKELIKE RED))",0.061224,0.094388,0.056122,0.916667,9.711712,0.050344,10.867347,0.955534
6293,"((Quantity, PACK OF 6 SKULL PAPER PLATES))","((Quantity, PACK OF 6 SKULL PAPER CUPS))",0.056122,0.063776,0.05102,0.909091,14.254545,0.047441,10.298469,0.985135
2369,"((Quantity, CHILDRENS CUTLERY DOLLY GIRL))","((Quantity, CHILDRENS CUTLERY SPACEBOY))",0.071429,0.068878,0.063776,0.892857,12.962963,0.058856,8.690476,0.993846


In [12]:
# Providing product recommendations to users at the shopping cart stage.

def arl_recommender(rules_df, product_name, rec_count=1):
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    for i, product in enumerate(sorted_rules["antecedents"]):
        for j in list(product):
            if j == product_name:
                recommendation_list.append(list(sorted_rules.iloc[i]["consequents"])[0])

    return recommendation_list[0:rec_count]

# When we sort the items in the cart by Lift,
# we can select the desired product and see the desired number of other products that are most closely related to it.
# For example; 'PLASTERS IN TIN CIRCUS PARADE'
arl_recommender(rules, ('Quantity', 'PLASTERS IN TIN CIRCUS PARADE'), 3)

  and should_run_async(code)


[('Quantity', 'LUNCH BOX WITH CUTLERY RETROSPOT'),
 ('Quantity', 'LUNCH BOX WITH CUTLERY RETROSPOT'),
 ('Quantity', 'PLASTERS IN TIN SPACEBOY')]