#### K-means 알고리즘

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

In [5]:
# Load the iris CSV from the remote dataset repository into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv"
)

In [7]:
# Replace the species labels with the integer codes produced by LabelEncoder.
species_encoder = LabelEncoder()
df['species'] = species_encoder.fit_transform(df['species'])

# Feature matrix for clustering: only the four measurement columns, no species label.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = df[feature_columns]

In [8]:
# K-means with 3 clusters; random_state is fixed so repeated runs use the same seed.
kmeans_settings = {'n_clusters': 3, 'n_init': 10, 'max_iter': 500, 'random_state': 42}
cluster1 = KMeans(**kmeans_settings)
cluster1.fit(X)

In [9]:
# Centroid coordinates (one row per cluster) and the cluster id assigned to every row of X.
cluster_center = cluster1.cluster_centers_
cluster_prediction = cluster1.predict(X)

# Print the centroid table first, then the per-row assignments.
for summary in (pd.DataFrame(cluster_center), cluster_prediction):
    print(summary)

          0         1         2         3
0  5.901613  2.748387  4.393548  1.433871
1  5.006000  3.428000  1.462000  0.246000
2  6.850000  3.073684  5.742105  2.071053
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [10]:
# Store each row's cluster id beside its measurements and encoded species for side-by-side inspection.
df['cluster'] = pd.Series(cluster_prediction, index=df.index)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,cluster
0,5.1,3.5,1.4,0.2,0,1
1,4.9,3.0,1.4,0.2,0,1
2,4.7,3.2,1.3,0.2,0,1
3,4.6,3.1,1.5,0.2,0,1
4,5.0,3.6,1.4,0.2,0,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,2
146,6.3,2.5,5.0,1.9,2,0
147,6.5,3.0,5.2,2.0,2,2
148,6.2,3.4,5.4,2.3,2,2


In [11]:
# Elbow scan: fit K-means for every k in `scope` and record the fitted model's inertia_.
scope = range(1, 10)
inertias = []

for k in scope:
    model = KMeans(n_clusters=k, n_init=10, max_iter=500, random_state=42)
    model.fit(X)
    inertias.append(model.inertia_)
    # Report the value that was just appended instead of indexing with k-1:
    # k-1 is only the right position while `scope` happens to start at 1.
    print(k, inertias[-1])

1 681.3706
2 152.3479517603579
3 78.85144142614601
4 57.228473214285714
5 46.461172672672674
6 39.03998724608725
7 34.30581529581531
8 30.132440554614476
9 28.29063524195103


#### 연관분석

In [13]:
from mlxtend.frequent_patterns import apriori, association_rules

In [15]:
# Load the retail basket CSV; note that this rebinds `df`, replacing the iris frame.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/YoungjinBD/dataset/main/retail_dataset.csv",
    sep=',',
)

In [22]:
# Gather every distinct value that occurs in any column of the basket table.
# Missing cells are not filtered here, so NaN ends up in the set as a pseudo-item.
items = set()
for column_name in df:
    items.update(df[column_name].unique())

# One-hot encode each row: 0 for every known item absent from the row, 1 for every
# item present. Absent items are written first, then present ones, so the key order
# of the first row (and therefore the column order of `result`) is unchanged.
itemset = set(items)
encoding = []
for _, basket in df.iterrows():
    bought = set(basket)
    labels = dict.fromkeys(itemset - bought, 0)
    labels.update(dict.fromkeys(itemset.intersection(bought), 1))
    encoding.append(labels)

result = pd.DataFrame(encoding)
result


Unnamed: 0,Milk,Bagel,NaN,Eggs,Meat,Wine,Cheese,Diaper,Bread,Pencil
0,0,0,0,1,1,1,1,1,1,1
1,1,0,0,0,1,1,1,1,1,1
2,1,0,1,1,1,1,1,0,0,0
3,1,0,1,1,1,1,1,0,0,0
4,0,0,1,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
310,0,0,1,1,0,0,1,0,1,0
311,1,0,1,0,1,0,0,0,0,1
312,0,0,0,1,1,1,1,1,1,1
313,0,0,1,0,1,0,1,0,0,0


In [32]:
# Remove the NaN pseudo-item by label rather than by position. The column order of
# `result` comes from set iteration, so position 2 is not guaranteed to be NaN, and a
# positional in-place drop would delete a real item if this cell were run a second time.
# Selecting the non-null column labels is order-independent and safe to re-run.
result = result.loc[:, result.columns.notna()]

In [34]:
# Frequent itemsets with support >= 0.2, reported by item name instead of column index.
# The 0/1 indicators are cast to bool because apriori expects a boolean one-hot frame;
# the integer form triggers a deprecation warning and a slower path. Supports are unchanged.
freq_items = apriori(result.astype(bool), min_support=0.2, use_colnames=True)



In [36]:
# Derive association rules from the frequent itemsets, filtered at confidence 0.6.
rule_filter = {'metric': 'confidence', 'min_threshold': 0.6}
rules = association_rules(freq_items, **rule_filter)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
1,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
2,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
3,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
4,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
