In [7]:
# mining_iris_basket.py
# Task 3: Classification and Association Rule Mining (No mlxtend install needed)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
import random
from itertools import combinations

# ================================
# Load Preprocessed Iris Data
# ================================
df = pd.read_csv(
    r"C:\Users\Ayan\Documents\DSA 2040_Practical_Exam_Ilham_152\data_mining\task1_preprocessing\iris_preprocessed.csv"
)
X = df.drop(columns=['species'])
y = df['species']  # Keep original target labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ================================
# Part A: Classification
# ================================
print("\n=== Decision Tree Classifier ===")
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt, average='weighted'))
print("Recall:", recall_score(y_test, y_pred_dt, average='weighted'))
print("F1-score:", f1_score(y_test, y_pred_dt, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))

plt.figure(figsize=(12, 8))
plot_tree(dt, feature_names=X.columns, class_names=y.unique(), filled=True)
plt.title("Decision Tree Visualization")
plt.savefig("decision_tree.png")
plt.close()

# KNN Classifier
print("\n=== KNN Classifier (k=5) ===")
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Precision:", precision_score(y_test, y_pred_knn, average='weighted'))
print("Recall:", recall_score(y_test, y_pred_knn, average='weighted'))
print("F1-score:", f1_score(y_test, y_pred_knn, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))

# Model comparison
if accuracy_score(y_test, y_pred_dt) > accuracy_score(y_test, y_pred_knn):
    print("Decision Tree performed better.")
else:
    print("KNN performed better.")

# ================================
# Part B: Association Rule Mining (Pure pandas, no mlxtend)
# ================================
print("\n=== Association Rule Mining ===")

# Generate synthetic basket data
items_pool = [
    'milk', 'bread', 'beer', 'diapers', 'eggs', 'butter', 'cheese', 'apple', 'banana', 'chicken',
    'rice', 'pasta', 'tomato', 'onion', 'carrot', 'yogurt', 'juice', 'coffee', 'tea', 'sugar'
]
transactions = [random.sample(items_pool, random.randint(3, 8)) for _ in range(30)]
df_basket = pd.DataFrame(transactions, columns=[f'item{i}' for i in range(1, 9)])
print("\nSample Transactions:\n", df_basket.head())

# One-hot encoding
all_items = sorted(set(item for basket in transactions for item in basket))
onehot_df = pd.DataFrame(0, index=range(len(transactions)), columns=all_items)
for i, basket in enumerate(transactions):
    for item in basket:
        onehot_df.loc[i, item] = 1

# Function to compute support
def support(itemset):
    return onehot_df[list(itemset)].all(axis=1).mean()

# Generate frequent itemsets
frequent_itemsets = []
min_support = 0.2
for length in range(1, 4):
    for combo in combinations(all_items, length):
        s = support(combo)
        if s >= min_support:
            frequent_itemsets.append((combo, s))

frequent_itemsets_df = pd.DataFrame(frequent_itemsets, columns=["itemset", "support"])

# Generate rules
rules_list = []
min_confidence = 0.5
for itemset, supp in frequent_itemsets:
    if len(itemset) >= 2:
        for i in range(1, len(itemset)):
            for antecedent in combinations(itemset, i):
                consequent = tuple(set(itemset) - set(antecedent))
                conf = support(itemset) / support(antecedent)
                lift = conf / support(consequent)
                if conf >= min_confidence:
                    rules_list.append((antecedent, consequent, supp, conf, lift))

rules_df = pd.DataFrame(rules_list, columns=["antecedent", "consequent", "support", "confidence", "lift"])
rules_df = rules_df.sort_values(by="lift", ascending=False)

print("\nTop 5 Rules by Lift:\n", rules_df.head())

# Rule analysis example
if not rules_df.empty:
    example_rule = rules_df.iloc[0]
    print(f"\nAnalysis of Rule: If a customer buys {list(example_rule['antecedent'])}, "
          f"they are likely to also buy {list(example_rule['consequent'])} "
          f"with a confidence of {example_rule['confidence']:.2f} and lift {example_rule['lift']:.2f}.")




=== Decision Tree Classifier ===
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


=== KNN Classifier (k=5) ===
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

KNN performed 