# Data Mining with Python - Chapter 1


This notebook contains alternative examples for Chapter 1 of the book **Learning Data Mining with Python**.
The chapter introduces basic data mining concepts, using Python for implementation.


## Example 1: Affinity Analysis

In [None]:

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Sample data: one row per transaction, one column per item (X, Y, Z).
# A 1 means the item is present in that transaction, a 0 means it is absent.
data = {
    'X': [1, 0, 1, 1, 0],
    'Y': [1, 1, 0, 0, 1],
    'Z': [0, 1, 0, 1, 1],
}

# mlxtend's apriori is designed for a boolean one-hot DataFrame. Passing 0/1
# integers still works but emits a deprecation warning and takes a slower
# code path, so convert to bool up front.
df = pd.DataFrame(data).astype(bool)

# Apply the Apriori algorithm: keep itemsets present in at least 20% of the
# transactions, then keep rules whose confidence is at least 0.7.
frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

# Display results
print("Frequent Itemsets:")
print(frequent_itemsets)
print("\nAssociation Rules:")
print(rules)


## Example 2: OneR Algorithm

In [None]:

import numpy as np
from sklearn.model_selection import train_test_split

def one_r_train(X, y):
    """Train a OneR (One Rule) classifier.

    For every feature (column of ``X``), each distinct feature value is mapped
    to the class that occurs most often among the training samples having that
    value. The feature whose mapping misclassifies the fewest training samples
    is selected; ties between features are resolved in favour of the lowest
    column index.

    Parameters
    ----------
    X : array-like, 2-D
        Training samples, one row per sample and one column per feature.
    y : array-like, 1-D
        Class label of each row of ``X``. Any label type that ``np.unique``
        can sort is accepted (non-negative ints, negative ints, strings, ...).

    Returns
    -------
    best_feature : int
        Column index of the selected feature, or -1 if ``X`` has no columns.
    best_predictors : dict or None
        Mapping ``feature value -> predicted class`` for the selected feature,
        or None if ``X`` has no columns.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    best_feature = -1
    best_error = float('inf')
    best_predictors = None

    for feature_index in range(X.shape[1]):
        column = X[:, feature_index]
        predictors = {}
        total_error = 0

        for value in np.unique(column):
            # Labels of the samples that take this value for the feature.
            labels_for_value = y[column == value]

            # np.unique returns the classes sorted, so argmax picks the
            # smallest class among equally frequent ones (the same tie-break
            # np.bincount/argmax gives) without requiring non-negative ints.
            classes, counts = np.unique(labels_for_value, return_counts=True)
            winner = np.argmax(counts)

            predictors[value] = classes[winner]
            # Every sample not belonging to the majority class is an error.
            total_error += labels_for_value.size - counts[winner]

        # Strict comparison keeps the earliest feature when errors are tied.
        if total_error < best_error:
            best_error = total_error
            best_feature = feature_index
            best_predictors = predictors

    return best_feature, best_predictors

# Toy dataset: five samples with two binary features each, plus binary labels
X = np.array([
    [1, 0],
    [0, 1],
    [1, 1],
    [0, 0],
    [1, 0],
])
y = np.array([0, 1, 1, 0, 1])

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit OneR on the training portion only
best_feature, predictors = one_r_train(X_train, y_train)

# Prediction with a trained OneR rule
def predict(X_test, predictors, feature_index):
    """Predict a class for each sample using a single-feature rule.

    Each row's value at ``feature_index`` is looked up in ``predictors``;
    values that are not keys of ``predictors`` yield -1. The predictions are
    returned as a NumPy array, in row order.
    """
    predictions = []
    for row in X_test:
        feature_value = row[feature_index]
        predictions.append(predictors.get(feature_value, -1))
    return np.array(predictions)

# Apply the learned rule to the held-out samples
y_pred = predict(X_test, predictors, best_feature)

# Share of correct predictions, expressed as a percentage
accuracy = (y_pred == y_test).mean() * 100
print("The test accuracy is {:.1f}%".format(accuracy))


## Example 3: Training and Testing Workflow

In [None]:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset and separate features from class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build and fit a 100-tree Random Forest on the training portion
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Classify the held-out samples
y_pred = clf.predict(X_test)

# Fraction of held-out samples classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Classifier Accuracy: {accuracy:.2f}")


## Example 4: Accuracy Calculation

In [None]:

from sklearn.metrics import accuracy_score

# Reuses y_test and y_pred produced by the Random Forest cell in Example 3,
# so that cell must be run first.
# Accuracy is the fraction of test samples whose prediction equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Random Forest Classifier Accuracy (Validation): {accuracy:.2f}")


## Summary
This notebook demonstrated alternative data mining techniques from Chapter 1: affinity analysis with the Apriori algorithm (filtering itemsets by support and rules by confidence), a revised implementation of the OneR algorithm, a training and testing workflow with a Random Forest classifier, and accuracy calculation on the held-out test set. All examples were implemented and evaluated in Python in a Jupyter notebook.