Mengimpor library yang dibutuhkan dan menautkan Google Drive

In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset file below can be read from it.
drive.mount('/content/drive')

Mounted at /content/drive


Membuka dan memproses dataset

In [2]:
# Location of the Superstore workbook inside the mounted Google Drive.
DATASET_PATH = '/content/drive/My Drive/DatasetBDA/SampleSuperstores.xlsx'

# Read the workbook into a DataFrame and preview its first rows.
data = pd.read_excel(DATASET_PATH)
data.head()

Unnamed: 0,Postal Code,City,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,42420,Henderson,South,Furniture,Bookcases,261.96,2,0,419136
1,42420,Henderson,South,Furniture,Chairs,731.94,3,0,219582
2,90036,Los Angeles,West,Office Supplies,Labels,0.626389,2,0,68714
3,33311,Fort Lauderdale,South,Furniture,Tables,9575775.0,5,00:45:00,-383031
4,33311,Fort Lauderdale,South,Office Supplies,Storage,22368.0,2,00:02:00,25164


Mengeksplorasi kolom pada dataset

In [3]:
# Show the column labels of the loaded dataset.
data.columns

Index(['Postal Code', 'City', 'Region', 'Category', 'Sub-Category', 'Sales',
       'Quantity', 'Discount', 'Profit'],
      dtype='object')

Mengeksplorasi daftar kota unik tempat transaksi dilakukan

In [4]:
# Distinct values of the 'City' column, in order of first appearance.
data['City'].unique()

array(['Henderson', 'Los Angeles', 'Fort Lauderdale', 'Concord',
       'Seattle', 'Fort Worth', 'Madison', 'West Jordan', 'San Francisco',
       'Fremont', 'Philadelphia', 'Orem', 'Houston', 'Richardson',
       'Naperville', 'Melbourne', 'Eagan', 'Westland', 'Dover',
       'New Albany', 'New York City', 'Troy', 'Chicago', 'Gilbert',
       'Springfield', 'Jackson', 'Memphis', 'Decatur', 'Durham',
       'Columbia', 'Rochester', 'Minneapolis', 'Portland', 'Saint Paul',
       'Aurora', 'Charlotte', 'Orland Park', 'Urbandale', 'Columbus',
       'Bristol', 'Wilmington', 'Bloomington', 'Phoenix', 'Roseville',
       'Independence', 'Pasadena', 'Newark', 'Franklin', 'Scottsdale',
       'San Jose', 'Edmond', 'Carlsbad', 'San Antonio', 'Monroe',
       'Fairfield', 'Grand Prairie', 'Redlands', 'Hamilton', 'Westfield',
       'Akron', 'Denver', 'Dallas', 'Whittier', 'Saginaw', 'Medina',
       'Dublin', 'Detroit', 'Tampa', 'Santa Clara', 'Lakeville',
       'San Diego', 'Brentwood', 'Cha

Data cleaning

In [7]:
# Clean the raw frame in one chain that rebinds `data` instead of mutating it
# in place (no inplace=True, no column-by-column assignment).
data = (
    data
    .assign(**{
        # Strip leading/trailing whitespace from the category labels so that
        # e.g. 'Furniture ' and 'Furniture' are not treated as different items.
        'Category': data['Category'].str.strip(),
        'Sub-Category': data['Sub-Category'].str.strip(),
    })
    # Drop rows that have no postal code. This dataset has no invoice-number
    # column; 'Postal Code' is the key the basket-building step groups on.
    .dropna(axis=0, subset=['Postal Code'])
    # Treat postal codes as labels (strings), not as numbers.
    .astype({'Postal Code': 'str'})
)

Memisahkan data berdasarkan kota asal transaksi (diambil contoh 4 kota: Los Angeles, Fort Lauderdale, New York City, dan San Francisco)



In [8]:
def build_city_basket(transactions, city):
    """Build the basket table (postal code x category) for a single city.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame containing the 'City', 'Postal Code', 'Category' and
        'Quantity' columns.
    city : str
        Value of the 'City' column whose rows are kept.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Postal Code', one column per 'Category'; each cell is the
        summed 'Quantity' for that pair, with 0 where the pair never occurs.
    """
    return (
        transactions[transactions['City'] == city]
        .groupby(['Postal Code', 'Category'])['Quantity']
        .sum()
        # Pivot categories into columns; absent (postal code, category)
        # pairs are filled with 0 directly instead of NaN + fillna.
        .unstack(fill_value=0)
    )


# Transactions made in Los Angeles
basket_LA = build_city_basket(data, "Los Angeles")

# Transactions made in Fort Lauderdale
basket_FL = build_city_basket(data, "Fort Lauderdale")

# Transactions made in New York City
basket_NYC = build_city_basket(data, "New York City")

# Transactions made in San Francisco
basket_SF = build_city_basket(data, "San Francisco")


Data encoding

In [9]:
# Presence encoding required by the frequent-pattern library: 1 = bought, 0 = not bought.
def hot_encode(x):
    """Map a summed quantity to a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Quantity value from a basket table cell.

    Returns
    -------
    int
        1 when ``x`` is strictly positive, otherwise 0. Values the previous
        two-branch version left unhandled (fractions between 0 and 1, NaN)
        now yield a flag instead of ``None``: fractions map to 1, NaN to 0.
    """
    # NaN compares False against 0, so it falls through to 0.
    return 1 if x > 0 else 0

# Encode every basket table as presence flags: True where the summed quantity
# is positive, False otherwise. The comparison is vectorised (no per-element
# Python call, no deprecated DataFrame.applymap) and yields boolean frames,
# which is the input type apriori() expects.
basket_LA = basket_LA > 0
basket_FL = basket_FL > 0
basket_NYC = basket_NYC > 0
basket_SF = basket_SF > 0

# This cell has always left `basket_encoded` bound to the encoded San Francisco
# basket; the name is kept so anything relying on it still works.
basket_encoded = basket_SF

Membuat model dan analisa hasil

Los Angeles

In [10]:
# Bentuk model
frq_items = apriori(basket_LA, min_support = 0.05, use_colnames = True)
  
# Mengumpulkan data pada sebuah dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
print(rules.head())

                     antecedents                    consequents  \
3                    (Furniture)                   (Technology)   
8   (Office Supplies, Furniture)                   (Technology)   
11                   (Furniture)  (Office Supplies, Technology)   
1                    (Furniture)              (Office Supplies)   
4                   (Technology)              (Office Supplies)   

    antecedent support  consequent support  support  confidence  lift  \
3                  0.4                 0.8      0.4         1.0  1.25   
8                  0.4                 0.8      0.4         1.0  1.25   
11                 0.4                 0.8      0.4         1.0  1.25   
1                  0.4                 1.0      0.4         1.0  1.00   
4                  0.8                 1.0      0.8         1.0  1.00   

    leverage  conviction  
3       0.08         inf  
8       0.08         inf  
11      0.08         inf  
1       0.00         inf  
4       0.00         inf  

Fort Lauderdale

In [11]:
# Bentuk model
frq_items = apriori(basket_FL, min_support = 0.05, use_colnames = True)
  
# Mengumpulkan data pada sebuah dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
print(rules.head())

         antecedents        consequents  antecedent support  \
0  (Office Supplies)        (Furniture)                 1.0   
1        (Furniture)  (Office Supplies)                 1.0   

   consequent support  support  confidence  lift  leverage  conviction  
0                 1.0      1.0         1.0   1.0       0.0         inf  
1                 1.0      1.0         1.0   1.0       0.0         inf  


New York City

In [12]:
# Bentuk model
frq_items = apriori(basket_NYC, min_support = 0.05, use_colnames = True)
  
# Mengumpulkan data pada sebuah dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
print(rules.head())

               antecedents        consequents  antecedent support  \
1              (Furniture)  (Office Supplies)            0.666667   
3              (Furniture)       (Technology)            0.666667   
4             (Technology)  (Office Supplies)            1.000000   
5        (Office Supplies)       (Technology)            1.000000   
7  (Technology, Furniture)  (Office Supplies)            0.666667   

   consequent support   support  confidence  lift  leverage  conviction  
1                 1.0  0.666667         1.0   1.0       0.0         inf  
3                 1.0  0.666667         1.0   1.0       0.0         inf  
4                 1.0  1.000000         1.0   1.0       0.0         inf  
5                 1.0  1.000000         1.0   1.0       0.0         inf  
7                 1.0  0.666667         1.0   1.0       0.0         inf  


San Francisco

In [13]:
# Bentuk model
frq_items = apriori(basket_SF, min_support = 0.05, use_colnames = True)
  
# Mengumpulkan data pada sebuah dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
print(rules.head())

                     antecedents                   consequents  \
2                   (Technology)                   (Furniture)   
3                    (Furniture)                  (Technology)   
6  (Office Supplies, Technology)                   (Furniture)   
8   (Office Supplies, Furniture)                  (Technology)   
9                   (Technology)  (Office Supplies, Furniture)   

   antecedent support  consequent support   support  confidence  lift  \
2            0.333333            0.333333  0.333333         1.0   3.0   
3            0.333333            0.333333  0.333333         1.0   3.0   
6            0.333333            0.333333  0.333333         1.0   3.0   
8            0.333333            0.333333  0.333333         1.0   3.0   
9            0.333333            0.333333  0.333333         1.0   3.0   

   leverage  conviction  
2  0.222222         inf  
3  0.222222         inf  
6  0.222222         inf  
8  0.222222         inf  
9  0.222222         inf  


Setelah menganalisis keempat contoh model di atas, dapat disimpulkan bahwa di keempat kota tersebut jenis barang yang sering dibeli adalah kebutuhan kantor (Office Supplies), furnitur (Furniture), dan teknologi (Technology).