## DATA 1030 Project

- UCI SECOM Dataset

### Environment Set-up

In [18]:
# data wrangling
import numpy as np 
import pandas as pd 

# plot
import matplotlib.pyplot as plt
import seaborn as sns

# data prep
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score,recall_score,roc_auc_score


# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import Lasso
from sklearn.svm import OneClassSVM

# to avoid warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Load data

In [2]:
# reading the data
data = pd.read_csv('../data/uci-secom.csv')

# we have 1,567 rows and 592 columns
print(data.shape)

(1567, 592)


### Data Preparatinon 

- **Imbalanced data**: 93.4% (Pass) / 6.6% (Fail)

- **Univariate features**: 116 (dropped before split)

- **CV**: K-fold / Stratified-K-Fold

- **Standardization**: StandardScalar (All numerical values)

- **Missing Data Imputation**:  Since the absence of a signal (the feature value) is assumed to be no signal, here we replace the null valeus with 0.

In [15]:
# 1. Missing data: replace with 0
data = data.replace(np.NaN, 0)

# 2. Drop Unique Value Columns
unique_value_columns = data.columns[data.nunique() == 1]
data_cleaned = data.drop(columns=unique_value_columns) 
print("Data Shape after dropping univariate columns:", data_cleaned.shape)

# 3. Select X,Y data
y = data_cleaned['Pass/Fail']
X = data_cleaned.drop(columns=['Pass/Fail','Time'])
print("X shape:",X.shape)


Data Shape after dropping univariate columns: (1567, 480)
X shape: (1567, 478)


### ML pipeline Function

1. Split: Other/Test (80/20), with stratify = y
2. CV: Kfolds (K=4)
3. Evaluation Metrics: F-1 Score
    - This is due to the imblanced dataset, accuracy is not proper to use here
    - Recall, and BER (balanced error rate) will also be calculated

4. 

In [27]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, make_scorer,accuracy_score

def MLpipe_KFold_f1_score(X, y, preprocessor, ML_algo, param_grid):
    '''
    This function splits the data into other/test (80/20) and then applies KFold with 4 folds to 'other'.
    The F1 score is maximized through cross-validation during grid search.
    '''

    # Lists to be returned
    test_scores = []
    f1_scores = []
    best_models = []

    # Define random states for reproducibility
    random_states = range(5)

    for random_state in random_states:
        # Split the data while maintaining class distribution - use stratify = y
        X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)

        # Preprocess the data
        X_other_prep = preprocessor.fit_transform(X_other)
        X_test_prep = preprocessor.transform(X_test)


        # Create the model with GridSearchCV using F1-score as the scoring metric
        grid_search = GridSearchCV(
            estimator=ML_algo,
            param_grid=param_grid,
            cv=KFold(n_splits=4, shuffle=True, random_state=random_state),
            n_jobs=-1
        )

        # Fit the model using GridSearchCV
        grid_search.fit(X_other_prep, y_other)

        # Predict on the test set
        y_test_pred = grid_search.predict(X_test_prep)

        # Calculate the F1-score on the test set
        test_score = accuracy_score(y_test, y_test_pred)
        test_f1_scores = f1_score(y_test, y_test_pred, pos_label=1)

        # Append the test score and best model
        test_scores.append(test_score)
        f1_scores.append(test_f1_scores)
        best_models.append(grid_search.best_estimator_)

    return test_scores, f1_scores, best_models


In [None]:
# Example usage of the modified function
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


# Stadardize the data
preprocessor = StandardScaler()

# Test with logistic regression
ML_algo = LogisticRegression()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Call the modified function
test_scores, test_f1_scores, best_models = MLpipe_KFold_f1_score(X, y, preprocessor, ML_algo, param_grid)

# Analyze the results
print("Test Scores across different random states:", test_scores)
print("test f1 Scores across different random states:", test_f1_scores)
print("Average Test Score:", sum(test_scores) / len(test_scores))




Test Scores across different random states: [0.9299363057324841, 0.9363057324840764, 0.9235668789808917, 0.9299363057324841, 0.9299363057324841]
test f1 Scores across different random states: [0.0, 0.09090909090909091, 0.0, 0.0, 0.08333333333333333]
Average Test Score: 0.9299363057324841


In [24]:
print("Best Models:", best_models)

Best Models: [LogisticRegression(C=0.1, penalty='l1', solver='liblinear'), LogisticRegression(C=0.1, penalty='l1', solver='liblinear'), LogisticRegression(C=0.1, penalty='l1', solver='liblinear'), LogisticRegression(C=0.1, penalty='l1', solver='liblinear'), LogisticRegression(C=0.1, penalty='l1', solver='liblinear')]


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, recall_score, balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay, make_scorer
import matplotlib.pyplot as plt

def MLpipe_KFold_f1_score(X, y, preprocessor, ML_algo, param_grid, method_name):
    '''
    This function splits the data into other/test (80/20) and then applies KFold with 4 folds to 'other'.
    The F1 score is maximized through cross-validation during grid search.
    It returns test accuracy scores, f1 scores, recall scores, BER scores, and best models.
    Additionally, it plots the confusion matrix for the best model.
    '''
    # Lists to be returned
    test_scores = []
    f1_scores = []
    recall_scores = []
    ber_scores = []
    best_models = []
    best_model_index = None
    highest_test_f1_score = -np.inf

    # Define random states for reproducibility
    random_states = range(5)

    for idx, random_state in enumerate(random_states):
        # Split the data while maintaining class distribution - use stratify=y
        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state, stratify=y
        )

        # Preprocess the data
        X_other_prep = preprocessor.fit_transform(X_other)
        X_test_prep = preprocessor.transform(X_test)

        # Define the scoring metric for GridSearchCV
        f1_scorer = make_scorer(f1_score, pos_label=1)

        # Create the model with GridSearchCV using F1-score as the scoring metric
        grid_search = GridSearchCV(
            estimator=ML_algo,
            param_grid=param_grid,
            scoring=f1_scorer,
            cv=KFold(n_splits=4, shuffle=True, random_state=random_state),
            n_jobs=-1
        )

        # Fit the model using GridSearchCV
        grid_search.fit(X_other_prep, y_other)

        # Predict on the test set
        y_test_pred = grid_search.predict(X_test_prep)

        # Calculate evaluation metrics on the test set
        test_f1_score = f1_score(y_test, y_test_pred, pos_label=1)
        recall = recall_score(y_test, y_test_pred, pos_label=1)
        ber = 1 - balanced_accuracy_score(y_test, y_test_pred)  # BER = 1 - balanced accuracy

        # Append the test scores and best model
        test_scores.append(test_f1_score)
        f1_scores.append(test_f1_score)
        recall_scores.append(recall)
        ber_scores.append(ber)
        best_models.append(grid_search.best_estimator_)

        # Keep track of the best model based on test F1-score
        if test_f1_score > highest_test_f1_score:
            highest_test_f1_score = test_f1_score
            best_model_index = idx
            best_random_state = random_state
            best_y_test = y_test
            best_y_pred = y_test_pred

    # Plot confusion matrix for the best model
    cm = confusion_matrix(best_y_test, best_y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f"Confusion Matrix for {method_name} (Best Model)")
    plt.show()

    # Return the evaluation metrics and best models
    return test_scores, f1_scores, recall_scores, ber_scores, best_models


In [None]:
from sklearn.linear_model import LogisticRegression

ml_methods = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'param_grid': {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10],
            'solver': ['liblinear'],
            'class_weight': ['balanced']
        }
    },
    'SVC':{
        'model': SVC(),
        'param_grid': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto'],
            'class_weight': ['balanced']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30],
            'class_weight': ['balanced']
        }
    },
    'XGBoost': {
        'model': XGBClassifier(),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30],
            'learning_rate': [0.01, 0.1, 1],
            'scale_pos_weight': [1, 10, 100]
        }
    }
}

