<a href="https://colab.research.google.com/github/KarandeepSinghBedi/Applied_Machine_Learning/blob/main/AML_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Function to load datasets

In [6]:
def _read_csv_or_backup(url, columns, backup_rows):
    """Read a header-less CSV from ``url``, falling back to ``backup_rows``.

    Parameters
    ----------
    url : str
        Location passed to ``pd.read_csv``.
    columns : list of str
        Column names applied to both the downloaded and the backup frame.
    backup_rows : list of list
        Rows used to build the frame when the download fails.

    Returns
    -------
    pandas.DataFrame
    """
    try:
        return pd.read_csv(url, header=None, names=columns)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate, and report the reason instead of hiding it.
        print("Could not load from URL. Loading from backup list.")
        print(f"  Reason: {type(exc).__name__}: {exc}")
        return pd.DataFrame(backup_rows, columns=columns)


def load_datasets():
    """Load the Banknote Authentication and Haberman's Survival datasets.

    Each dataset is downloaded from its UCI URL. If the download fails, a
    hard-coded 10-row sample is used instead so the rest of the notebook can
    still run (results on that sample are not representative).

    When the Haberman ``survival_status`` column has a maximum of 2, its
    values are recoded as 1 -> 0 and 2 -> 1.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(banknote_df, haberman_df)``; the last column of each is the target.
    """
    # Banknote Authentication dataset
    banknote_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"
    banknote_cols = ['variance', 'skewness', 'curtosis', 'entropy', 'class']
    banknote_backup = [
        [3.6216,8.6661,-2.8073,-0.44699,0],
        [4.5459,8.1674,-2.4586,-1.4621,0],
        [3.866,-2.6383,1.9242,0.10645,0],
        [3.4566,9.5228,-4.0112,-3.5944,0],
        [0.32924,-4.4552,4.5718,-0.9888,0],
        [-1.3971,3.2689,-4.9101,-1.5325,1],
        [-1.6677,2.8399,-4.8663,-1.4192,1],
        [-2.2588,4.3468,-6.2708,-1.9578,1],
        [-2.7338,3.0997,-4.3117,-1.9329,1],
        [-2.5439,3.5906,-4.6242,-1.9535,1]
    ]
    banknote_df = _read_csv_or_backup(banknote_url, banknote_cols, banknote_backup)

    # Haberman's Survival dataset
    haberman_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data"
    haberman_cols = ['age', 'operation_year', 'positive_nodes', 'survival_status']
    haberman_backup = [
        [30, 64, 1, 1],
        [30, 62, 3, 1],
        [30, 65, 0, 1],
        [31, 59, 2, 1],
        [31, 65, 4, 1],
        [33, 58, 10, 1],
        [33, 60, 0, 1],
        [34, 59, 0, 2],
        [34, 66, 9, 2],
        [38, 69, 21, 2]
    ]
    haberman_df = _read_csv_or_backup(haberman_url, haberman_cols, haberman_backup)

    # Recode the 1/2 labels to 0/1 (only when the column is still in 1/2 form)
    if haberman_df['survival_status'].max() == 2:
        haberman_df['survival_status'] = haberman_df['survival_status'].map({1: 0, 2: 1})

    return banknote_df, haberman_df

# Function to explore datasets

In [7]:
def explore_dataset(df, dataset_name):
    """Print a plain-text overview of ``df`` to stdout.

    The report shows, in order: a banner with ``dataset_name``, the first five
    rows, the shape, the dtypes, ``describe()`` statistics, the value counts
    and relative frequencies of the last column (used as the target), and
    either the per-column missing-value counts or a note that none exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe; its last column is treated as the class label.
    dataset_name : str
        Name shown in the banner.

    Returns
    -------
    None
    """
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"Dataset: {dataset_name}")
    print(f"{rule}")

    # Heading and content are emitted together, one per line.
    print("\nFirst 5 rows:", df.head(), sep="\n")

    print("\nDataset shape:", df.shape)
    print("\nData types:", df.dtypes, sep="\n")

    print("\nBasic statistics:", df.describe(), sep="\n")

    # The target is whatever column comes last.
    labels = df[df.columns[-1]]
    print("\nClass distribution:", labels.value_counts(), sep="\n")
    class_shares = labels.value_counts(normalize=True).round(3)
    print(f"Class balance: {class_shares}")

    # Missing-value summary: bail out early when there is nothing to list.
    nulls_per_column = df.isnull().sum()
    if not nulls_per_column.sum() > 0:
        print("\nNo missing values found.")
        return

    print("\nMissing values:")
    print(nulls_per_column[nulls_per_column > 0])

# Function to preprocess datasets

In [8]:
def preprocess_dataset(df, test_size=0.3, random_state=42):
    """Split ``df`` into stratified train/test sets and standardize the features.

    All columns except the last are used as features; the last column is the
    target. The scaler is fitted on the training split only and then applied
    to the test split, so no test-set statistics leak into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame with the target in the last column.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` to make the split reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with the feature arrays
        standardized.
    """
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    # Stratify on y so both splits keep the class proportions of the full data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Fit on train only; reuse the same mean/std for the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

# Function to train and evaluate models

In [16]:
def train_evaluate_models(X_train, X_test, y_train, y_test, dataset_name):
    """Tune, fit and score four classifiers, printing a report for each.

    Candidates are Gaussian Naive Bayes, Logistic Regression, SVM and Random
    Forest. Every candidate with a non-empty hyperparameter grid is tuned by
    ``GridSearchCV`` (5-fold stratified CV, ``f1_macro`` scoring) on the
    training data; a candidate with an empty grid is fitted directly. The
    selected estimator is then scored on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    dataset_name : str
        Name shown in the printed banner.

    Returns
    -------
    dict
        Maps model name to a dict with keys ``best_params``, ``accuracy``,
        ``f1``, ``precision``, ``recall``, ``confusion_matrix`` and ``model``.
        ``best_params`` is the string ``"N/A"`` for models that were not tuned.
    """
    # (display name, estimator, hyperparameter grid); an empty grid means "no tuning".
    candidates = [
        ('Naive Bayes', GaussianNB(), {}),
        (
            'Logistic Regression',
            LogisticRegression(max_iter=1000, random_state=42),
            {
                'C': [0.01, 0.1, 1, 10, 100],
                'solver': ['liblinear', 'lbfgs'],
            },
        ),
        (
            'SVM',
            SVC(random_state=42),
            {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf'],
                'gamma': ['scale', 'auto', 0.1, 1],
            },
        ),
        (
            'Random Forest',
            RandomForestClassifier(random_state=42),
            {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
            },
        ),
    ]

    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Model Training and Evaluation for {dataset_name}")
    print(f"{banner}")

    # One shared splitter so every tuned model sees the same folds.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    results = {}

    for name, estimator, param_grid in candidates:
        print(f"\nTraining {name}...")

        if not param_grid:
            # Nothing to tune: fit the estimator as configured.
            best_model = estimator
            best_model.fit(X_train, y_train)
            best_params = "N/A"
        else:
            search = GridSearchCV(
                estimator,
                param_grid,
                cv=folds,
                scoring='f1_macro',
                n_jobs=-1
            )
            search.fit(X_train, y_train)
            best_model, best_params = search.best_estimator_, search.best_params_
            print(f"Best parameters: {best_params}")

        # Held-out evaluation
        y_pred = best_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1, precision, recall = (
            scorer(y_test, y_pred, average='macro')
            for scorer in (f1_score, precision_score, recall_score)
        )
        cm = confusion_matrix(y_test, y_pred)

        print(
            "\nPerformance metrics:",
            f"Accuracy: {accuracy:.4f}",
            f"Macro F1-Score: {f1:.4f}",
            f"Macro Precision: {precision:.4f}",
            f"Macro Recall: {recall:.4f}",
            sep="\n",
        )
        print("\nConfusion Matrix:", cm, sep="\n")
        print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

        results[name] = dict(
            best_params=best_params,
            accuracy=accuracy,
            f1=f1,
            precision=precision,
            recall=recall,
            confusion_matrix=cm,
            model=best_model,
        )

    return results


# Function to visualize results

In [20]:
def visualize_results(results, dataset_name):
    """Print a summary table of model metrics and name the best model by F1.

    Output is text only; no figures are drawn.

    Parameters
    ----------
    results : dict
        Maps model name to a dict containing at least the keys ``accuracy``,
        ``f1``, ``precision`` and ``recall``.
    dataset_name : str
        Name shown in the banner and in the best-model line.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model``, ``Accuracy``, ``F1-Score``,
        ``Precision`` and ``Recall``. When ``results`` is empty the frame has
        these columns and no rows.
    """
    print(f"\n{'='*50}")
    print(f"Results Visualization for {dataset_name}")
    print(f"{'='*50}")

    # Explicit column list keeps the schema stable even when there are no rows.
    columns = ['Model', 'Accuracy', 'F1-Score', 'Precision', 'Recall']
    results_df = pd.DataFrame(
        [
            {
                'Model': name,
                'Accuracy': metrics['accuracy'],
                'F1-Score': metrics['f1'],
                'Precision': metrics['precision'],
                'Recall': metrics['recall'],
            }
            for name, metrics in results.items()
        ],
        columns=columns,
    )

    print("\nSummary of Results:")
    if results_df.empty:
        # np.argmax raises ValueError on an empty sequence, so stop here.
        print("No model results to summarize.")
        return results_df

    print(results_df.to_string(index=False))

    # Best model by F1-score; np.argmax returns the first index on ties.
    models = results_df['Model'].tolist()
    f1_scores = results_df['F1-Score'].tolist()
    best_model_idx = np.argmax(f1_scores)
    print(f"\nBest model for {dataset_name} based on F1-score: {models[best_model_idx]}")
    print(f"Best F1-score: {f1_scores[best_model_idx]:.4f}")

    return results_df

# Main


In [25]:
# Driver cell: load both datasets, explore them, run the full
# preprocess -> train/evaluate -> summarize pipeline on each, then compare
# the F1-scores of each model across the two datasets.
print("======================================================")
print("UCI Machine Learning Repository - Classification Analysis")
print("======================================================")

# Load datasets (announce first, so the message precedes any fallback notice)
print("\nLoading datasets...")
banknote_df, haberman_df = load_datasets()

# short key -> (display name, frame); insertion order drives processing order
datasets = {
    "Banknote": ("Banknote Authentication", banknote_df),
    "Haberman": ("Haberman's Survival", haberman_df),
}

# Explore datasets
for display_name, frame in datasets.values():
    explore_dataset(frame, display_name)

# Dictionaries to store all results, keyed by the short dataset key
all_results = {}
summary_dfs = {}

# Same pipeline for every dataset
for key, (display_name, frame) in datasets.items():
    print(f"\nProcessing {display_name} dataset...")
    X_train, X_test, y_train, y_test = preprocess_dataset(frame)
    all_results[key] = train_evaluate_models(X_train, X_test, y_train, y_test, display_name)
    summary_dfs[key] = visualize_results(all_results[key], display_name)

# Keep the per-dataset names available for any later cells
banknote_results = all_results["Banknote"]
haberman_results = all_results["Haberman"]

# Compare performances across datasets
print(f"\n{'#'*50}")
print("\nComparison of model performances across datasets:")
# Index each summary by model name once instead of filtering per lookup
f1_by_dataset = {
    key: summary.set_index("Model")["F1-Score"]
    for key, summary in summary_dfs.items()
}
for model in summary_dfs["Banknote"]["Model"]:
    print(f"\n{model}:")
    for key, f1_scores in f1_by_dataset.items():
        print(f"  {key} F1-Score: {f1_scores[model]:.4f}")

UCI Machine Learning Repository - Classification Analysis

Loading datasets...

Dataset: Banknote Authentication

First 5 rows:
   variance  skewness  curtosis  entropy  class
0   3.62160    8.6661   -2.8073 -0.44699      0
1   4.54590    8.1674   -2.4586 -1.46210      0
2   3.86600   -2.6383    1.9242  0.10645      0
3   3.45660    9.5228   -4.0112 -3.59440      0
4   0.32924   -4.4552    4.5718 -0.98880      0

Dataset shape: (1372, 5)

Data types:
variance    float64
skewness    float64
curtosis    float64
entropy     float64
class         int64
dtype: object

Basic statistics:
          variance     skewness     curtosis      entropy        class
count  1372.000000  1372.000000  1372.000000  1372.000000  1372.000000
mean      0.433735     1.922353     1.397627    -1.191657     0.444606
std       2.842763     5.869047     4.310030     2.101013     0.497103
min      -7.042100   -13.773100    -5.286100    -8.548200     0.000000
25%      -1.773000    -1.708200    -1.574975    -2.413450