<a href="https://colab.research.google.com/github/Mo-null/Data-mining-I/blob/main/Copy_of_Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd

from sklearn.datasets import load_wine, load_breast_cancer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# Fetch the two bundled scikit-learn classification datasets.
wine = load_wine()
cancer = load_breast_cancer()

# Build one frame per dataset: the feature columns followed by a 'target' label column.
df1 = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)
df2 = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(target=cancer.target)

# Report the (rows, columns) shape of each frame, label column included.
print(" Classification datasets loaded successfully")
print(f"Dataset 1 (Wine): {df1.shape}")
print(f"Dataset 2 (Cancer): {df2.shape}")

 Classification datasets loaded successfully
Dataset 1 (Wine): (178, 14)
Dataset 2 (Cancer): (569, 31)


In [None]:

def preprocess_data(df, n_bins=5):
    """Split a labelled frame into features/target and build three feature views.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'target'`` column; every other column is treated as
        a feature.
    n_bins : int, default 5
        Number of equal-width bins per feature used for the discretized view.

    Returns
    -------
    X_scaled
        Result of ``StandardScaler().fit_transform`` on the features.
    X_normalized
        Result of ``MinMaxScaler().fit_transform`` on the features.
    X_discrete : pandas.DataFrame
        Integer bin codes, one column per feature, obtained by cutting each
        standardized column into ``n_bins`` bins. The frame is rebuilt from
        ``X_scaled``, so the original feature names are not carried over.
    y
        The ``'target'`` column of ``df``.

    Notes
    -----
    Both scalers and the bin edges are fit on *all* rows of ``df``. If the
    returned features are split into train/test sets afterwards, the test rows
    have already influenced the scaling statistics.
    """
    X = df.drop('target', axis=1)
    y = df['target']

    # Standardization: per-feature centering and scaling.
    X_scaled = StandardScaler().fit_transform(X)

    # Normalization: per-feature min-max rescaling, fit on the raw features.
    X_normalized = MinMaxScaler().fit_transform(X)

    # Discretization: labels=False makes pd.cut return integer bin indices
    # instead of interval labels.
    X_discrete = pd.DataFrame(X_scaled).apply(
        lambda column: pd.cut(column, bins=n_bins, labels=False)
    )

    print("Pre-processing applied: Standardization, Normalization, Discretization")
    return X_scaled, X_normalized, X_discrete, y

# Build the standardized / normalized / discretized views for each dataset.
X1_scaled, X1_norm, X1_disc, y1 = preprocess_data(df=df1)
X2_scaled, X2_norm, X2_disc, y2 = preprocess_data(df=df2)

Pre-processing applied: Standardization, Normalization, Discretization
Pre-processing applied: Standardization, Normalization, Discretization


In [None]:

# Models under comparison, keyed by the display name used in the result printouts.
# Insertion order determines the order in which they are evaluated and reported.
classifiers = dict([
    ('Naive Bayes', GaussianNB()),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
])

print(" Classifiers initialized successfully")

 Classifiers initialized successfully


In [None]:

from sklearn.base import clone

def evaluate_classifier(clf, X_train, X_test, y_train, y_test):
    """Fit a fresh copy of ``clf`` on the training data and score it on the test data.

    The estimator passed in is cloned first, so ``clf`` itself is never fitted.
    Precision, recall and F1 are computed with ``average='weighted'`` and
    ``zero_division=0``.

    Returns a dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    model = clone(clf)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    scores = {'Accuracy': accuracy_score(y_test, predicted)}

    # The remaining metrics share the same averaging options.
    weighted_metrics = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    for label, metric_fn in weighted_metrics:
        scores[label] = metric_fn(y_test, predicted, average='weighted', zero_division=0)

    return scores
print(" Evaluation function ready")

 Evaluation function ready


In [None]:

def run_holdout_80_20(X, y, dataset_name):
    """Evaluate every entry of ``classifiers`` on a single 80/20 hold-out split.

    The split uses ``test_size=0.2`` and ``random_state=42``. One line of
    metrics is printed per classifier, and the metrics are returned as a dict
    keyed by classifier name.
    """
    print(f"\n=== {dataset_name} - HOLDOUT 80/20 ===")

    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    results = {}
    for name in classifiers:
        metrics = evaluate_classifier(classifiers[name], X_train, X_test, y_train, y_test)
        results[name] = metrics
        print(f"{name}: {metrics}")

    return results

# 80/20 hold-out evaluation on the standardized features of each dataset.
results1_80 = run_holdout_80_20(X=X1_scaled, y=y1, dataset_name="WINE DATASET")
results2_80 = run_holdout_80_20(X=X2_scaled, y=y2, dataset_name="CANCER DATASET")


=== WINE DATASET - HOLDOUT 80/20 ===
Naive Bayes: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0}
K-Nearest Neighbors: {'Accuracy': 0.9444444444444444, 'Precision': 0.9493827160493827, 'Recall': 0.9444444444444444, 'F1': 0.9436036129748098}
Decision Tree: {'Accuracy': 0.9444444444444444, 'Precision': 0.9462962962962962, 'Recall': 0.9444444444444444, 'F1': 0.9439974457215836}

=== CANCER DATASET - HOLDOUT 80/20 ===
Naive Bayes: {'Accuracy': 0.9649122807017544, 'Precision': 0.9652053622194477, 'Recall': 0.9649122807017544, 'F1': 0.9647382344750765}
K-Nearest Neighbors: {'Accuracy': 0.9473684210526315, 'Precision': 0.9473684210526315, 'Recall': 0.9473684210526315, 'F1': 0.9473684210526315}
Decision Tree: {'Accuracy': 0.9473684210526315, 'Precision': 0.9473684210526315, 'Recall': 0.9473684210526315, 'F1': 0.9473684210526315}


In [None]:

def run_holdout_66_33(X, y, dataset_name):
    """Evaluate every entry of ``classifiers`` on a single 66.6/33.3 hold-out split.

    The split uses ``test_size=0.333`` and ``random_state=42``. One line of
    metrics is printed per classifier, and the metrics are returned as a dict
    keyed by classifier name.
    """
    print(f"\n=== {dataset_name} - HOLDOUT 66.6/33.3 ===")

    split = train_test_split(X, y, test_size=0.333, random_state=42)
    X_train, X_test, y_train, y_test = split

    results = {}
    for name in classifiers:
        metrics = evaluate_classifier(classifiers[name], X_train, X_test, y_train, y_test)
        results[name] = metrics
        print(f"{name}: {metrics}")

    return results

# 66.6/33.3 hold-out evaluation on the standardized features of each dataset.
results1_66 = run_holdout_66_33(X=X1_scaled, y=y1, dataset_name="WINE DATASET")
results2_66 = run_holdout_66_33(X=X2_scaled, y=y2, dataset_name="CANCER DATASET")


=== WINE DATASET - HOLDOUT 66.6/33.3 ===
Naive Bayes: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0}
K-Nearest Neighbors: {'Accuracy': 0.9666666666666667, 'Precision': 0.9695652173913043, 'Recall': 0.9666666666666667, 'F1': 0.966699604743083}
Decision Tree: {'Accuracy': 0.9333333333333333, 'Precision': 0.9371345029239766, 'Recall': 0.9333333333333333, 'F1': 0.9328498985801217}

=== CANCER DATASET - HOLDOUT 66.6/33.3 ===
Naive Bayes: {'Accuracy': 0.9368421052631579, 'Precision': 0.9368421052631579, 'Recall': 0.9368421052631579, 'F1': 0.9368421052631579}
K-Nearest Neighbors: {'Accuracy': 0.9578947368421052, 'Precision': 0.9578947368421052, 'Recall': 0.9578947368421052, 'F1': 0.9578947368421052}
Decision Tree: {'Accuracy': 0.9157894736842105, 'Precision': 0.9165288220551379, 'Recall': 0.9157894736842105, 'F1': 0.9160517175079272}


In [None]:

def run_cv_10_fold(X, y, dataset_name):
    """Run 10-fold cross-validation for every entry of ``classifiers``.

    All four metrics are computed in a single ``cross_validate`` pass per
    classifier, so each fold is fitted once rather than once per metric.

    Parameters
    ----------
    X, y
        Feature matrix and target labels passed straight to ``cross_validate``.
    dataset_name : str
        Label used in the printed section header.

    Returns
    -------
    dict
        ``{classifier name: {'Accuracy', 'Precision', 'Recall', 'F1'}}`` where
        each value is the mean fold score formatted as a string with three
        decimals (not a float).
    """
    print(f"\n=== {dataset_name} - 10-FOLD CV ===")

    # Result label -> scikit-learn scorer name.
    metric_scorers = {
        'Accuracy': 'accuracy',
        'Precision': 'precision_weighted',
        'Recall': 'recall_weighted',
        'F1': 'f1_weighted',
    }

    results = {}
    for name, clf in classifiers.items():
        # With a list of scorer names, cross_validate returns one
        # 'test_<scorer>' array of per-fold scores for each metric.
        cv_scores = cross_validate(
            clf, X, y, cv=10, scoring=list(metric_scorers.values())
        )

        results[name] = {
            label: f"{cv_scores['test_' + scorer].mean():.3f}"
            for label, scorer in metric_scorers.items()
        }
        print(f"{name}: Acc={results[name]['Accuracy']}, Prec={results[name]['Precision']}, Rec={results[name]['Recall']}, F1={results[name]['F1']}")

    return results

# 10-fold cross-validation on the standardized features of each dataset.
cv10_1 = run_cv_10_fold(X=X1_scaled, y=y1, dataset_name="WINE DATASET")
cv10_2 = run_cv_10_fold(X=X2_scaled, y=y2, dataset_name="CANCER DATASET")


=== WINE DATASET - 10-FOLD CV ===
Naive Bayes: Acc=0.978, Prec=0.981, Rec=0.978, F1=0.978
K-Nearest Neighbors: Acc=0.960, Prec=0.967, Rec=0.960, F1=0.961
Decision Tree: Acc=0.865, Prec=0.889, Rec=0.865, F1=0.858

=== CANCER DATASET - 10-FOLD CV ===
Naive Bayes: Acc=0.932, Prec=0.934, Rec=0.932, F1=0.931
K-Nearest Neighbors: Acc=0.967, Prec=0.968, Rec=0.967, F1=0.966
Decision Tree: Acc=0.928, Prec=0.930, Rec=0.928, F1=0.928


In [None]:

def run_cv_5_fold(X, y, dataset_name):
    """Run 5-fold cross-validation for every entry of ``classifiers``.

    All four metrics are computed in a single ``cross_validate`` pass per
    classifier, so each fold is fitted once rather than once per metric.

    Parameters
    ----------
    X, y
        Feature matrix and target labels passed straight to ``cross_validate``.
    dataset_name : str
        Label used in the printed section header.

    Returns
    -------
    dict
        ``{classifier name: {'Accuracy', 'Precision', 'Recall', 'F1'}}`` where
        each value is the mean fold score formatted as a string with three
        decimals (not a float).
    """
    print(f"\n=== {dataset_name} - 5-FOLD CV ===")

    # Result label -> scikit-learn scorer name.
    metric_scorers = {
        'Accuracy': 'accuracy',
        'Precision': 'precision_weighted',
        'Recall': 'recall_weighted',
        'F1': 'f1_weighted',
    }

    results = {}
    for name, clf in classifiers.items():
        # With a list of scorer names, cross_validate returns one
        # 'test_<scorer>' array of per-fold scores for each metric.
        cv_scores = cross_validate(
            clf, X, y, cv=5, scoring=list(metric_scorers.values())
        )

        results[name] = {
            label: f"{cv_scores['test_' + scorer].mean():.3f}"
            for label, scorer in metric_scorers.items()
        }
        print(f"{name}: Acc={results[name]['Accuracy']}, Prec={results[name]['Precision']}, Rec={results[name]['Recall']}, F1={results[name]['F1']}")

    return results

# 5-fold cross-validation on the standardized features of each dataset.
cv5_1 = run_cv_5_fold(X=X1_scaled, y=y1, dataset_name="WINE DATASET")
cv5_2 = run_cv_5_fold(X=X2_scaled, y=y2, dataset_name="CANCER DATASET")


=== WINE DATASET - 5-FOLD CV ===
Naive Bayes: Acc=0.966, Prec=0.970, Rec=0.966, F1=0.966
K-Nearest Neighbors: Acc=0.955, Prec=0.960, Rec=0.955, F1=0.954
Decision Tree: Acc=0.871, Prec=0.888, Rec=0.871, F1=0.869

=== CANCER DATASET - 5-FOLD CV ===
Naive Bayes: Acc=0.928, Prec=0.929, Rec=0.928, F1=0.928
K-Nearest Neighbors: Acc=0.965, Prec=0.966, Rec=0.965, F1=0.965
Decision Tree: Acc=0.917, Prec=0.921, Rec=0.917, F1=0.918
