In [None]:
import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgbm
import seaborn as sns
from scipy.stats import mstats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Data

In [None]:
# Load the training and test tables from CSV (training file first, then test file).
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('path_to_train_data', 'path_to_test_data')
)

### 5-FOLD Cross Validation

In [None]:
# Function to create k-fold sets from the training data
def create_kfold_sets(data_train, n_splits=5):
    """Split ``data_train`` into ``n_splits`` folds that share no image.

    The distinct values of the ``n_image`` column are shuffled with a fixed
    seed and cut into ``n_splits`` consecutive groups; each fold holds every
    row whose ``n_image`` falls in its group. Splitting by image id rather
    than by row keeps all rows of one image inside a single fold.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Training table; must contain an ``n_image`` column.
    n_splits : int, default 5
        Number of folds to build.

    Returns
    -------
    tuple of pandas.DataFrame
        ``n_splits`` row subsets of ``data_train``. The first ``n_splits - 1``
        folds each receive ``n_images // n_splits`` image ids; the last fold
        also receives the remainder.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    # Local import: the notebook's import cell does not import `random`.
    import random

    if n_splits < 1:
        raise ValueError('n_splits must be at least 1')

    # Image ids come from the data itself instead of an external global list.
    image_ids = data_train['n_image'].unique().tolist()

    # Private, seeded generator: reproducible folds without touching the
    # state of the global `random` module.
    rng = random.Random(42)
    rng.shuffle(image_ids)

    images_per_fold = len(image_ids) // n_splits

    folds = []
    for fold_idx in range(n_splits):
        start = fold_idx * images_per_fold
        # The last fold absorbs the ids left over by the integer division.
        stop = start + images_per_fold if fold_idx < n_splits - 1 else len(image_ids)
        fold_ids = image_ids[start:stop]
        folds.append(data_train[data_train['n_image'].isin(fold_ids)])

    return tuple(folds)

In [None]:
# Build the five folds once; D keeps them in a list so the cross-validation
# loops below can pick the held-out fold by position.
D = list(create_kfold_sets(data_train=data_train))
X1, X2, X3, X4, X5 = D

# 1. WITHOUT FEATURE SELECTION

## Random Forest

In [None]:
# Random Forest grid search: every (n_estimators, criterion, min_samples_leaf)
# combination is evaluated on the five folds in D and the per-fold metrics are
# averaged into one row of df_results.

# Define the columns for the results DataFrame
columns = ['Model', 'N_estimators', 'Criterion', 'Min_samples_leaf',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Define parameters for Grid Search
n_estimators = [25, 50, 75, 100]
criterion = ['gini', 'entropy']
min_samples_leaf = [1, 2, 4, 8, 16, 32]

# Class order for the confusion matrix (rows = true class, columns = predicted class)
labels = ('corrosion', 'no corrosion')

# One record per hyper-parameter combination. The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
rf_records = []

for i in n_estimators:
    for k in criterion:
        for d in min_samples_leaf:
            recalls = []
            precisions = []
            f1s = []
            accuracies = []
            specificities = []
            times = []
            for j in range(5):
                # Fold j is held out; the other four folds form the training set
                d_test = D[j]
                y_test = d_test['label_binary']
                X_test = d_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
                d_train = pd.concat([D[fold] for fold in range(5) if fold != j], ignore_index=True)
                y_train = d_train['label_binary']
                X_train = d_train.drop(columns=['label_binary', 'n_image', 'label_multi'])

                # Initialize and fit the model
                model = RandomForestClassifier(
                    n_estimators=i,
                    criterion=k,
                    min_samples_leaf=d,
                    bootstrap=True,
                    random_state=42)
                model.fit(X_train, y_train)

                # Time only the prediction step (as the chosen-model cell does),
                # so metric computation and printing do not inflate the figure
                t0 = time.time()
                y_pred = model.predict(X_test)
                t1 = time.time()
                time_taken = round(t1 - t0, 3)
                times.append(time_taken)

                print(f'Model with {i} estimators:')
                cm = confusion_matrix(y_test, y_pred, labels=labels)
                print(f'Confusion matrix for {i} estimators: \n {cm}')

                # Calculate metrics ('corrosion' is the positive class;
                # specificity is the recall of the 'no corrosion' class)
                accuracy = accuracy_score(y_test, y_pred)
                accuracies.append(accuracy)
                recall = recall_score(y_test, y_pred, average='binary', pos_label='corrosion')
                recall = round(recall, 3)
                print(f'Recall for {i} estimators: {recall}')
                recalls.append(recall)
                specificity = recall_score(y_test, y_pred, average='binary', pos_label='no corrosion')
                specificity = round(specificity, 3)
                specificities.append(specificity)
                print(f'Specificity: {specificity}')
                precision = precision_score(y_test, y_pred, average='binary', pos_label='corrosion')
                precision = round(precision, 3)
                precisions.append(precision)
                f1 = f1_score(y_test, y_pred, average='binary', pos_label='corrosion')
                f1 = round(f1, 3)
                f1s.append(f1)
                print(f'Execution time: {time_taken} seconds')
                print('\n')

            # Calculate mean metrics across folds
            recall_mean = np.mean(recalls)
            specificity_mean = np.mean(specificities)
            precision_mean = np.mean(precisions)
            f1_mean = np.mean(f1s)
            accuracy_mean = np.mean(accuracies)
            time_mean = np.mean(times)

            # Store the averaged result of this combination
            result_i = {'Model': 'Random Forest', 'Accuracy': accuracy_mean, 'N_estimators': i, 'Criterion': k, 'Min_samples_leaf': d,
                        'Recall': recall_mean, 'Specificity': specificity_mean, 'Precision': precision_mean, 'F1': f1_mean,
                        'Time': time_mean}
            rf_records.append(result_i)

# Assemble all grid-search rows in one go, with the column order defined above
df_results = pd.DataFrame(rf_records, columns=columns)


In [None]:
# Rank the grid-search results by mean recall, best first.
# (The results frame is named df_results; df_resultados is never defined.)
df_results.sort_values(by='Recall', ascending=False)

Chosen model

In [None]:
# Final Random Forest: refit on the full training set with the selected
# hyper-parameters and evaluate once on the test set.

# Parameters
min_samples = 1
n_estimators = 50
criterion = 'entropy'

# Features are every column except the two label columns and the image id
X_train = data_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_train = data_train['label_binary']
X_test = data_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_test = data_test['label_binary']

# Initialize and train the model
model = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    min_samples_leaf=min_samples,
    bootstrap=True,
    random_state=42
)
model.fit(X_train, y_train)

# Measure execution time (prediction only)
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()

# Metrics calculation ('corrosion' is the positive class)
labels = ('corrosion', 'no corrosion')
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Model with {n_estimators} estimators:')
print(f'Confusion matrix for {n_estimators} estimators: \n {cm}')

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy for {n_estimators} estimators: {accuracy}')
recall = recall_score(y_test, y_pred, average='binary', pos_label='corrosion')
recall = round(recall, 3)
print(f'Recall for {n_estimators} estimators: {recall}')
# Specificity is the recall of the negative ('no corrosion') class
specificity = recall_score(y_test, y_pred, average='binary', pos_label='no corrosion')
specificity = round(specificity, 3)
print(f'Specificity: {specificity}')
precision = precision_score(y_test, y_pred, average='binary', pos_label='corrosion')
precision = round(precision, 3)
print(f'Precision for {n_estimators} estimators: {precision}')
f1 = f1_score(y_test, y_pred, average='binary', pos_label='corrosion')
f1 = round(f1, 3)
print(f'F1 for {n_estimators} estimators: {f1}')
time_taken = t1 - t0
time_taken = round(time_taken, 3)
print(f'Execution time: {time_taken} seconds')

# Column order of the results DataFrame
columns = ['Model', 'N_estimators', 'Criterion', 'Min_samples_leaf',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Build the one-row results table directly; concatenating onto an empty
# DataFrame is deprecated in recent pandas.
result_rf = {'Model': 'Random Forest', 'Accuracy': accuracy, 'N_estimators': n_estimators, 'Criterion': criterion, 'Min_samples_leaf': min_samples,
             'Recall': recall, 'Specificity': specificity, 'Precision': precision, 'F1': f1,
             'Time': time_taken}
df_results = pd.DataFrame([result_rf], columns=columns)


In [None]:
# Importance score of each feature, read from the fitted model
importancias = model.feature_importances_

# Bar chart with one bar per training column
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(X_train.columns, importancias, color='skyblue')
# Rotate the feature names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=10)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Importance', fontsize=12)
ax.set_title('Feature Importances from Random Forest', fontsize=14)
fig.tight_layout()
plt.show()


In [None]:
# Tabulate the importances: one row per feature, indexed by column name
importancias_df = pd.DataFrame(
    {'Importance': importancias},
    index=X_train.columns,
)

# Most important features first
sorted_importances_df = importancias_df.sort_values('Importance', ascending=False)
print(sorted_importances_df)


## XGBOOST

In [None]:
# XGBoost grid search: every (n_estimators, learning_rate, min_child_weight)
# combination is evaluated on the five folds in D and the per-fold metrics are
# averaged into one row of df_results.

# Define the columns for the results DataFrame
columns = ['Model', 'N_estimators', 'Learning_rate', 'Min_child_weight',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Define parameters for Grid Search
n_estimators = [25, 50, 75, 100, 125, 150]
metric = ['logloss']
learning_rates = [0.01, 0.05, 0.1, 0.5]
min_child_weight = [1, 2, 4, 8, 16, 32]

# Encode the string labels as integers: 1 = corrosion (positive class), 0 = no corrosion
mapping = {'no corrosion': 0, 'corrosion': 1}

# Class order for the confusion matrix (rows = true class, columns = predicted class)
labels = (1, 0)

# One record per hyper-parameter combination. The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
xgb_records = []

for i in n_estimators:
    for d in learning_rates:
        for k in min_child_weight:
            for m in metric:
                recalls = []
                specificities = []
                precisions = []
                f1s = []
                accuracies = []
                times = []
                for j in range(5):
                    # Fold j is held out; the other four folds form the training set
                    d_test = D[j]
                    y_test = d_test['label_binary']
                    X_test = d_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
                    d_train = pd.concat([D[fold] for fold in range(5) if fold != j], ignore_index=True)
                    y_train = d_train['label_binary']
                    X_train = d_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
                    y_train = y_train.map(mapping)
                    y_test = y_test.map(mapping)

                    # Initialize and train the XGBoost model
                    model = xgb(
                        objective='binary:logistic',  # Binary classification
                        n_estimators=i,             # Number of trees (boosting rounds)
                        seed=42,                       # For reproducibility
                        learning_rate=d,
                        min_child_weight=k,
                        eval_metric=m
                    )
                    model.fit(X_train, y_train)

                    # Time only the prediction step (as the chosen-model cell does),
                    # so metric computation and printing do not inflate the figure
                    t0 = time.time()
                    y_pred = model.predict(X_test)
                    t1 = time.time()
                    time_taken = round(t1 - t0, 3)
                    times.append(time_taken)

                    print(f'Model with {i} estimators, {d} learning rate, and {k} min child weight:')
                    cm = confusion_matrix(y_test, y_pred, labels=labels)
                    print(f'Confusion matrix for {i} estimators: \n {cm}')

                    # Calculate metrics (label 1 is the positive class;
                    # specificity is the recall of label 0)
                    accuracy = accuracy_score(y_test, y_pred)
                    accuracies.append(accuracy)
                    recall = recall_score(y_test, y_pred, average='binary')
                    recall = round(recall, 3)
                    print(f'Recall for {i} estimators: {recall}')
                    recalls.append(recall)
                    specificity = recall_score(y_test, y_pred, average='binary', pos_label=0)
                    specificity = round(specificity, 3)
                    specificities.append(specificity)
                    print(f'Specificity: {specificity}')
                    precision = precision_score(y_test, y_pred, average='binary')
                    precision = round(precision, 3)
                    precisions.append(precision)
                    f1 = f1_score(y_test, y_pred, average='binary')
                    f1 = round(f1, 3)
                    f1s.append(f1)
                    print(f'Execution time: {time_taken} seconds')
                    print('\n')

                # Calculate mean metrics across folds
                recall_mean = np.mean(recalls)
                specificity_mean = np.mean(specificities)
                precision_mean = np.mean(precisions)
                f1_mean = np.mean(f1s)
                accuracy_mean = np.mean(accuracies)
                time_mean = np.mean(times)

                # Store the averaged result of this combination
                result_i = {'Model': 'XGBoost', 'Accuracy': accuracy_mean, 'N_estimators': i, 'Learning_rate': d, 'Min_child_weight': k,
                            'Recall': recall_mean, 'Specificity': specificity_mean,
                            'Precision': precision_mean, 'F1': f1_mean,
                            'Time': time_mean}
                xgb_records.append(result_i)

# Assemble all grid-search rows in one go, with the column order defined above
df_results = pd.DataFrame(xgb_records, columns=columns)


In [None]:
# Grid-search results ordered from highest to lowest mean recall
df_results.sort_values('Recall', ascending=False)

Chosen model

In [None]:
# Final XGBoost model: refit on the full training set with the selected
# hyper-parameters and evaluate once on the test set.

# Parameters
n_estimators = 75
learning_rate = 0.01
min_child_weight = 32

# Initialize the XGBoost model
model = xgb(
    objective='binary:logistic',  # Binary classification
    n_estimators=n_estimators,     # Number of trees (boosting rounds)
    seed=42,                       # For reproducibility
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    eval_metric='logloss'
)

# Prepare data: features are every column except the two label columns and the image id
X_train = data_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_train = data_train['label_binary']
X_test = data_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_test = data_test['label_binary']

# Map labels to numeric values: 1 = corrosion (positive class), 0 = no corrosion
label_mapping = {'no corrosion': 0, 'corrosion': 1}
y_train = y_train.map(label_mapping)
y_test = y_test.map(label_mapping)

# Train the model
model.fit(X_train, y_train)

# Measure execution time (prediction only)
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()

# Print model details and metrics
print(f'Model with {n_estimators} estimators, {learning_rate} learning rate, and {min_child_weight} min child weight:')
labels = (1, 0)
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Confusion matrix for {n_estimators} estimators: \n {cm}')

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy for {n_estimators} estimators: {accuracy}')
recall = recall_score(y_test, y_pred, average='binary')
recall = round(recall, 3)
print(f'Recall for {n_estimators} estimators: {recall}')
# Specificity is the recall of the negative class (label 0)
specificity = recall_score(y_test, y_pred, average='binary', pos_label=0)
specificity = round(specificity, 3)
print(f'Specificity: {specificity}')
precision = precision_score(y_test, y_pred, average='binary')
precision = round(precision, 3)
print(f'Precision for {n_estimators} estimators: {precision}')
f1 = f1_score(y_test, y_pred, average='binary')
f1 = round(f1, 3)
print(f'F1 for {n_estimators} estimators: {f1}')
time_taken = t1 - t0
time_taken = round(time_taken, 3)
print(f'Execution time: {time_taken} seconds')

# Column order of the results DataFrame
columns = ['Model', 'N_estimators', 'Learning_rate', 'Min_child_weight', 'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Build the one-row results table directly; concatenating onto an empty
# DataFrame is deprecated in recent pandas.
result_xgb = {'Model': 'XGBoost', 'Accuracy': accuracy, 'N_estimators': n_estimators, 'Learning_rate': learning_rate, 'Min_child_weight': min_child_weight,
              'Recall': recall, 'Specificity': specificity, 'Precision': precision, 'F1': f1,
              'Time': time_taken}
df_results = pd.DataFrame([result_xgb], columns=columns)
print(df_results)


## LIGHTGBM GBDT

In [None]:
# LightGBM (GBDT) grid search: every (n_estimators, learning_rate, n_bins,
# num_leaves) combination is evaluated on the five folds in D and the per-fold
# metrics are averaged into one row of df_results.

# Define the columns for the results DataFrame
columns = ['Model', 'N_estimators', 'Learning_rate', 'N_bins', 'Num_leaves', 'Accuracy',
           'Metric',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Define parameters for Grid Search
n_estimators = [25, 50, 75, 100, 125, 150]
metric = ['logloss']
learning_rates = [0.025, 0.05, 0.1, 0.2, 0.4]
n_bins = [255]
num_leaves = [10, 30, 50]

# Encode the string labels as integers: 1 = corrosion (positive class), 0 = no corrosion
label_mapping = {'no corrosion': 0, 'corrosion': 1}

# Class order for the confusion matrix (rows = true class, columns = predicted class)
labels = (1, 0)

# One record per hyper-parameter combination. The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
lgbm_records = []

for i in n_estimators:
    for d in learning_rates:
        for k in n_bins:
            for l in num_leaves:
                for m in metric:
                    recalls = []
                    specificities = []
                    precisions = []
                    f1s = []
                    accuracies = []
                    times = []
                    for j in range(5):
                        # Fold j is held out; the other four folds form the training set
                        d_test = D[j]
                        y_test = d_test['label_binary']
                        X_test = d_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
                        d_train = pd.concat([D[fold] for fold in range(5) if fold != j], ignore_index=True)
                        y_train = d_train['label_binary']
                        X_train = d_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
                        y_train = y_train.map(label_mapping)
                        y_test = y_test.map(label_mapping)

                        # Initialize and train the LightGBM model
                        model = lgbm(
                            objective='binary',  # Binary classification
                            boosting_type='gbdt',  # Boosting type
                            n_estimators=i,             # Number of trees (boosting rounds)
                            seed=42,                       # For reproducibility
                            learning_rate=d,
                            # LightGBM's histogram-bin parameter is `max_bin`;
                            # `n_bins` is not a recognised name and was ignored.
                            max_bin=k,
                            num_leaves=l,
                            # NOTE(review): for LGBMClassifier `eval_metric` is an argument
                            # of fit(), not of the constructor, and no eval_set is used
                            # here - confirm whether this setting is needed at all.
                            eval_metric=m,
                            verbose=-1
                        )
                        model.fit(X_train, y_train)

                        # Time only the prediction step (as the chosen-model cell does),
                        # so metric computation and printing do not inflate the figure
                        t0 = time.time()
                        y_pred = model.predict(X_test)
                        t1 = time.time()
                        time_taken = round(t1 - t0, 3)
                        times.append(time_taken)

                        print(f'Model with {i} estimators, {d} learning rate, and {l} leaves:')
                        cm = confusion_matrix(y_test, y_pred, labels=labels)
                        print(f'Confusion matrix for {i} estimators: \n {cm}')

                        # Calculate metrics (label 1 is the positive class;
                        # specificity is the recall of label 0)
                        accuracy = accuracy_score(y_test, y_pred)
                        accuracies.append(accuracy)
                        recall = recall_score(y_test, y_pred, average='binary')
                        recall = round(recall, 3)
                        print(f'Recall for {i} estimators: {recall}')
                        recalls.append(recall)
                        specificity = recall_score(y_test, y_pred, average='binary', pos_label=0)
                        specificity = round(specificity, 3)
                        specificities.append(specificity)
                        print(f'Specificity: {specificity}')
                        precision = precision_score(y_test, y_pred, average='binary')
                        precision = round(precision, 3)
                        precisions.append(precision)
                        f1 = f1_score(y_test, y_pred, average='binary')
                        f1 = round(f1, 3)
                        f1s.append(f1)
                        print(f'Execution time: {time_taken} seconds')
                        print('\n')

                    # Calculate mean metrics across folds
                    recall_mean = np.mean(recalls)
                    specificity_mean = np.mean(specificities)
                    precision_mean = np.mean(precisions)
                    f1_mean = np.mean(f1s)
                    accuracy_mean = np.mean(accuracies)
                    time_mean = np.mean(times)

                    # Store the averaged result of this combination
                    result_i = {'Model': 'LGBM', 'Accuracy': accuracy_mean, 'N_estimators': i, 'Learning_rate': d, 'N_bins': k, 'Num_leaves': l,
                                'Metric': m,
                                'Recall': recall_mean, 'Specificity': specificity_mean,
                                'Precision': precision_mean, 'F1': f1_mean,
                                'Time': time_mean}
                    lgbm_records.append(result_i)

# Assemble all grid-search rows in one go, with the column order defined above
df_results = pd.DataFrame(lgbm_records, columns=columns)


In [None]:
# Rank the grid-search results by mean recall, best first.
# (The results frame is named df_results; df_resultados is never defined.)
df_results.sort_values(by='Recall', ascending=False)

Chosen model

In [None]:
# Final LightGBM (GBDT) model: refit on the full training set with the
# selected hyper-parameters and evaluate once on the test set.

# Parameters
n_estimators = 25
learning_rate = 0.025
n_bins = 255
num_leaves = 10

# Prepare data: features are every column except the two label columns and
# the image id; labels become 1 = corrosion (positive class), 0 = no corrosion
X_train = data_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_train = data_train['label_binary']
label_mapping = {'no corrosion': 0, 'corrosion': 1}
y_train = y_train.map(label_mapping)
X_test = data_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_test = data_test['label_binary']
y_test = y_test.map(label_mapping)

# Initialize and train the LightGBM model
model = lgbm(
    objective='binary',  # Binary classification
    boosting_type='gbdt',  # Boosting type
    n_estimators=n_estimators,             # Number of trees (boosting rounds)
    seed=42,                       # For reproducibility
    learning_rate=learning_rate,
    # NOTE(review): for LGBMClassifier `eval_metric` is an argument of fit(),
    # not of the constructor - confirm whether this setting is needed.
    eval_metric='logloss',
    verbose=-1,
    # LightGBM's histogram-bin parameter is `max_bin`; `n_bins` is not a
    # recognised name and was ignored.
    max_bin=n_bins,
    num_leaves=num_leaves
)
model.fit(X_train, y_train)

# Measure execution time (prediction only)
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()
time_taken = t1 - t0
time_taken = round(time_taken, 3)

# Print model details and metrics
print(f'Model with {n_estimators} estimators:')
labels = (1, 0)
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Confusion matrix for estimators: \n {cm}')
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='binary')
recall = round(recall, 3)
print(f'Recall for estimators: {recall}')
print(f'Execution time: {time_taken} seconds')
precision = precision_score(y_test, y_pred, average='binary')
precision = round(precision, 3)
# Specificity is the recall of the negative class (label 0)
specificity = recall_score(y_test, y_pred, average='binary', pos_label=0)
specificity = round(specificity, 3)
f1 = f1_score(y_test, y_pred, average='binary')
f1 = round(f1, 3)
print(f'Specificity: {specificity}')
print(f'Precision: {precision}')
print(f'F1: {f1}')

# Column order of the results DataFrame
columns = ['Model', 'N_estimators', 'Accuracy', 'Learning_rate', 'N_bins', 'Num_leaves',
           'Metric',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Build the one-row results table directly; concatenating onto an empty
# DataFrame is deprecated in recent pandas.
result_i = {'Model': 'LightGBM', 'N_estimators': n_estimators, 'Learning_rate': learning_rate, 'N_bins': n_bins, 'Num_leaves': num_leaves,
            'Metric': 'logloss', 'Accuracy': accuracy,
            'Recall': recall, 'Specificity': specificity,
            'Precision': precision, 'F1': f1,
            'Time': time_taken}
df_results = pd.DataFrame([result_i], columns=columns)

print(df_results)


## LIGHTGBM GOSS

In [None]:
# LightGBM (GOSS) grid search: every (top_rate, learning_rate, other_rate,
# num_leaves) combination is evaluated on the five folds in D and the per-fold
# metrics are averaged into one row of df_results.

# Define the columns for the results DataFrame
columns = ['Model', 'Top_rate', 'Other_rate', 'Learning_rate', 'Num_leaves',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Define parameters for Grid Search
top_rate = [0.2, 0.4, 0.6]
other_rate = [0.05, 0.1, 0.3]
learning_rates = [0.025, 0.05, 0.1, 0.2]
num_leaves = [10, 30, 50]

# Encode the string labels as integers: 1 = corrosion (positive class), 0 = no corrosion
label_mapping = {'no corrosion': 0, 'corrosion': 1}

# Class order for the confusion matrix (rows = true class, columns = predicted class)
labels = (1, 0)

# One record per hyper-parameter combination. The DataFrame is built once
# after the loops instead of being re-concatenated on every iteration.
goss_records = []

for i in top_rate:
    for d in learning_rates:
        for k in other_rate:
            for l in num_leaves:
                recalls = []
                specificities = []
                precisions = []
                f1s = []
                accuracies = []
                times = []
                for j in range(5):
                    # Fold j is held out; the other four folds form the training set
                    d_test = D[j]
                    y_test = d_test['label_binary']
                    X_test = d_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
                    d_train = pd.concat([D[fold] for fold in range(5) if fold != j], ignore_index=True)
                    y_train = d_train['label_binary']
                    X_train = d_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
                    y_train = y_train.map(label_mapping)
                    y_test = y_test.map(label_mapping)

                    # Initialize and train the LightGBM model
                    model = lgbm(
                        objective='binary',  # Binary classification
                        boosting_type='goss',  # Boosting type
                        top_rate=i,             # Top rate
                        seed=42,                       # For reproducibility
                        learning_rate=d,
                        other_rate=k,
                        num_leaves=l,
                        verbose=-1
                    )
                    model.fit(X_train, y_train)

                    # Time only the prediction step (as the chosen-model cell does),
                    # so metric computation and printing do not inflate the figure
                    t0 = time.time()
                    y_pred = model.predict(X_test)
                    t1 = time.time()
                    time_taken = round(t1 - t0, 3)
                    times.append(time_taken)

                    print(f'Model with {i} top rate, {d} learning rate, and {l} leaves:')
                    cm = confusion_matrix(y_test, y_pred, labels=labels)
                    print(f'Confusion matrix for {i} top rate: \n {cm}')

                    # Calculate metrics (label 1 is the positive class;
                    # specificity is the recall of label 0)
                    accuracy = accuracy_score(y_test, y_pred)
                    accuracies.append(accuracy)
                    recall = recall_score(y_test, y_pred, average='binary')
                    recall = round(recall, 3)
                    print(f'Recall for {i} top rate: {recall}')
                    recalls.append(recall)
                    specificity = recall_score(y_test, y_pred, average='binary', pos_label=0)
                    specificity = round(specificity, 3)
                    specificities.append(specificity)
                    print(f'Specificity: {specificity}')
                    precision = precision_score(y_test, y_pred, average='binary')
                    precision = round(precision, 3)
                    precisions.append(precision)
                    f1 = f1_score(y_test, y_pred, average='binary')
                    f1 = round(f1, 3)
                    f1s.append(f1)
                    print(f'Execution time: {time_taken} seconds')
                    print('\n')

                # Calculate mean metrics across folds
                recall_mean = np.mean(recalls)
                specificity_mean = np.mean(specificities)
                precision_mean = np.mean(precisions)
                f1_mean = np.mean(f1s)
                accuracy_mean = np.mean(accuracies)
                time_mean = np.mean(times)

                # Store the averaged result of this combination
                result_i = {'Model': 'LightGBM', 'Accuracy': accuracy_mean, 'Top_rate': i, 'Learning_rate': d, 'Other_rate': k, 'Num_leaves': l,
                            'Recall': recall_mean, 'Specificity': specificity_mean,
                            'Precision': precision_mean, 'F1': f1_mean,
                            'Time': time_mean}
                goss_records.append(result_i)

# Assemble all grid-search rows in one go, with the column order defined above
df_results = pd.DataFrame(goss_records, columns=columns)


In [None]:
# Rank the grid-search results by mean recall, best first.
# (The results frame is named df_results; df_resultados is never defined.)
df_results.sort_values(by='Recall', ascending=False)

Chosen model

In [None]:
# Final LightGBM (GOSS) model: refit on the full training set with the
# selected hyper-parameters and evaluate once on the test set.

# Model parameters
top_rate = 0.2
learning_rate = 0.05
other_rate = 0.05
num_leaves = 10

# Prepare training and testing data: features are every column except the two
# label columns and the image id; labels become 1 = corrosion, 0 = no corrosion
X_train = data_train.drop(columns=['label_binary', 'label_multi', 'n_image'])
y_train = data_train['label_binary']
label_mapping = {'no corrosion': 0, 'corrosion': 1}
y_train = y_train.map(label_mapping)

X_test = data_test.drop(columns=['label_binary', 'label_multi', 'n_image'])
y_test = data_test['label_binary']
y_test = y_test.map(label_mapping)

# Initialize and train the LightGBM model
model = lgbm(
    objective='binary',  # Binary classification
    boosting_type='goss',  # Boosting type
    top_rate=top_rate,             # Top rate
    seed=42,                       # For reproducibility
    learning_rate=learning_rate,
    other_rate=other_rate,
    num_leaves=num_leaves,
    verbose=-1
)
model.fit(X_train, y_train)

# Measure execution time (prediction only)
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()

# Print model details and metrics
print(f'Model with {num_leaves} leaves, {learning_rate} learning rate, {top_rate} top rate, and {other_rate} other rate:')
labels = (1, 0)
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Confusion matrix: \n {cm}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
recall = recall_score(y_test, y_pred, average='binary')
recall = round(recall, 3)
print(f'Recall: {recall}')
# Specificity is the recall of the negative class (label 0)
specificity = recall_score(y_test, y_pred, average='binary', pos_label=0)
specificity = round(specificity, 3)
print(f'Specificity: {specificity}')
precision = precision_score(y_test, y_pred, average='binary')
precision = round(precision, 3)
print(f'Precision: {precision}')
f1 = f1_score(y_test, y_pred, average='binary')
f1 = round(f1, 3)
print(f'F1: {f1}')
time_taken = t1 - t0
time_taken = round(time_taken, 3)
print(f'Execution time: {time_taken} seconds')
print('\n')

# Column order of the results DataFrame
columns = ['Model', 'Learning_rate', 'Num_leaves', 'Top_rate', 'Other_rate',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Build the one-row results table directly; concatenating onto an empty
# DataFrame is deprecated in recent pandas.
result_lgbm = {'Model': 'LightGBM', 'Accuracy': accuracy, 'Top_rate': top_rate, 'Other_rate': other_rate, 'Learning_rate': learning_rate, 'Num_leaves': num_leaves,
               'Recall': recall, 'Specificity': specificity, 'Precision': precision, 'F1': f1,
               'Time': time_taken}
df_results = pd.DataFrame([result_lgbm], columns=columns)
print(df_results)


## KNN

In [None]:
# Grid search for KNN: every (n_neighbors, distance) pair is scored with the
# five pre-built folds in D and summarised by the mean of its fold metrics.

# Define the columns for the results DataFrame
columns = ['Model', 'N_neighbors',  'Distance', 'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# List of neighbors to test
N_neighbors = [1, 3, 5, 11, 21, 41, 61]

# List of distance metrics to use
distance_metrics = ['euclidean', 'manhattan']

# Label / identifier columns that are removed before fitting
non_feature_cols = ['label_binary', 'n_image', 'label_multi']

# Class order used for the confusion matrix ('corrosion' is the positive class)
labels = ('corrosion', 'no corrosion')

# One summary record per configuration. The DataFrame is built once after the
# loops: growing it with pd.concat on every iteration is quadratic and, when
# the starting frame is empty, raises a pandas FutureWarning.
result_rows = []

# Iterate over each number of neighbors and distance metric
for i in N_neighbors:
    for d in distance_metrics:
        # Per-fold metrics for this configuration
        recalls = []
        specificities = []
        precisions = []
        f1s = []
        accuracies = []
        times = []

        # Perform cross-validation by iterating over each fold
        for j in range(5):

            # Fold j is the held-out set for this iteration
            d_test = D[j]
            y_test = d_test['label_binary']
            X_test = d_test.drop(columns=non_feature_cols)

            # The remaining four folds form the training set
            d_train = pd.concat([D[k] for k in range(5) if k != j], ignore_index=True)
            y_train = d_train['label_binary']
            X_train = d_train.drop(columns=non_feature_cols)

            # Fit the scaler on the training folds only, then apply it to both sets
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Initialize and fit the KNeighborsClassifier model
            model = KNeighborsClassifier(n_neighbors=i, metric=d)
            model.fit(X_train, y_train)

            # Measure the time taken to make predictions
            t0 = time.time()
            y_pred = model.predict(X_test)
            t1 = time.time()

            # Print model details
            print(f'Model with {i} neighbors using {d} distance:')
            cm = confusion_matrix(y_test, y_pred, labels=labels)
            print(f'Confusion matrix for {i} neighbors:\n {cm}')

            # Calculate and store accuracy
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

            # Calculate and store recall
            recall = recall_score(y_test, y_pred, average='binary', pos_label='corrosion')
            recall = round(recall, 3)
            print(f'Recall for {i} neighbors: {recall}')
            recalls.append(recall)

            # Specificity is the recall of the negative class
            specificity = recall_score(y_test, y_pred, average='binary', pos_label='no corrosion')
            specificity = round(specificity, 3)
            print(f'Specificity: {specificity}')
            specificities.append(specificity)

            # Calculate and store precision
            precision = precision_score(y_test, y_pred, average='binary', pos_label='corrosion')
            precision = round(precision, 3)
            precisions.append(precision)

            # Calculate and store F1 score
            f1 = f1_score(y_test, y_pred, average='binary', pos_label='corrosion')
            f1 = round(f1, 3)
            f1s.append(f1)

            # Calculate and store time taken
            time_taken = t1 - t0
            time_taken = round(time_taken, 3)
            times.append(time_taken)
            print(f'Execution time: {time_taken} seconds')
            print('\n')

        # Mean of each metric across the five folds
        result_rows.append({
            'Model': 'KNN',
            'Accuracy': np.mean(accuracies),
            'N_neighbors': i,
            'Distance': d,
            'Recall': np.mean(recalls),
            'Specificity': np.mean(specificities),
            'Precision': np.mean(precisions),
            'F1': np.mean(f1s),
            'Time': np.mean(times)
        })

# Assemble the summary table in a single step
df_results = pd.DataFrame(result_rows, columns=columns)


In [None]:
# Final KNN model: trained on the full training set, scored on the test set.
K = 61                        # Number of neighbors
DISTANCE_METRIC = 'manhattan' # Distance metric for KNN
LABELS = ('corrosion', 'no corrosion')  # Class labels

# --- Data preparation -------------------------------------------------
# Everything except the label / identifier columns is used as a feature.
dropped_cols = ['label_binary', 'n_image', 'label_multi']
X_train, y_train = data_train.drop(columns=dropped_cols), data_train['label_binary']
X_test, y_test = data_test.drop(columns=dropped_cols), data_test['label_binary']

# --- Feature scaling --------------------------------------------------
# Scaling statistics come from the training data only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model training ---------------------------------------------------
model = KNeighborsClassifier(n_neighbors=K, metric=DISTANCE_METRIC)
model.fit(X_train_scaled, y_train)

# --- Model evaluation -------------------------------------------------
# Only the predict call is timed.
start_time = time.time()
y_pred = model.predict(X_test_scaled)
execution_time = round(time.time() - start_time, 3)

positive_label, negative_label = LABELS
cm = confusion_matrix(y_test, y_pred, labels=LABELS)
accuracy = accuracy_score(y_test, y_pred)
recall = round(recall_score(y_test, y_pred, average='binary', pos_label=positive_label), 3)
# Specificity is the recall of the negative class
specificity = round(recall_score(y_test, y_pred, average='binary', pos_label=negative_label), 3)
precision = round(precision_score(y_test, y_pred, average='binary', pos_label=positive_label), 3)
f1 = round(f1_score(y_test, y_pred, average='binary', pos_label=positive_label), 3)

# --- Results display --------------------------------------------------
report_lines = [
    f'\nModel with {K} neighbors using {DISTANCE_METRIC} distance:',
    f'Confusion Matrix:\n{cm}',
    f'Accuracy: {accuracy:.3f}',
    f'Recall (Corrosion Detection): {recall}',
    f'Specificity (No Corrosion): {specificity}',
    f'Precision: {precision}',
    f'F1 Score: {f1}',
    f'Execution Time: {execution_time} seconds',
]
for report_line in report_lines:
    print(report_line)

# --- Results storage --------------------------------------------------
results_columns = ['Model', 'N_neighbors', 'Distance', 'Accuracy',
                  'Recall', 'Specificity', 'Precision', 'F1', 'Time']

results_data = dict(
    zip(
        ['Model', 'N_neighbors', 'Distance', 'Accuracy',
         'Recall', 'Specificity', 'Precision', 'F1', 'Time'],
        ['KNN', K, DISTANCE_METRIC, accuracy,
         recall, specificity, precision, f1, execution_time],
    )
)

df_final_results = pd.DataFrame([results_data])
print('\nFinal Results DataFrame:')
print(df_final_results)

## Logistic regression

In [None]:
# Define results DataFrame structure (one row per fold is added by the loop below)
results_columns = ['Model', 'Accuracy', 'Recall', 'Specificity',
                  'Precision', 'F1', 'Time']
df_results = pd.DataFrame(columns=results_columns)

# Configuration parameters read by perform_feature_selection
VIF_THRESHOLD = 10     # VIF elimination stops once the largest VIF is below this value
PVAL_THRESHOLD = 0.01  # p-value elimination stops once the largest p-value is below this value
MAX_ITERATIONS = 100   # passed as `maxiter` to the statsmodels Logit fit

def perform_feature_selection(X_train, y_train):
    """
    Perform iterative feature selection using p-values and VIF analysis.

    Phase 1 repeatedly fits a statsmodels Logit and drops the feature with the
    largest p-value until every p-value is below PVAL_THRESHOLD. Phase 2
    repeatedly drops the feature with the largest variance inflation factor
    until every VIF is below VIF_THRESHOLD. If phase 2 removed anything, the
    model is refitted so that the returned model and the returned feature list
    always describe the same set of columns.

    NOTE(review): the model is fitted on the columns of X_train as given; no
    intercept column is added here. Confirm that a no-intercept logit is
    intended.

    Args:
        X_train: Training features DataFrame
        y_train: Training labels Series (same index as X_train)

    Returns:
        Tuple: (Logit results fitted on the selected features,
                List of selected feature names)

    Raises:
        ValueError: if every feature is eliminated in the p-value phase.
    """
    current_features = X_train.columns.tolist()

    def fit_logit(features):
        # Single place where the Logit is fitted, so both phases use the same settings
        return sm.Logit(y_train, X_train[features]).fit(
            disp=0,
            maxiter=MAX_ITERATIONS
        )

    # First phase: Remove features with high p-values
    while True:
        if not current_features:
            raise ValueError("No features left after p-value elimination")

        model = fit_logit(current_features)

        # Get highest p-value feature
        p_values = model.pvalues
        max_p_feature = p_values.idxmax()
        max_p_value = p_values.max()

        if max_p_value < PVAL_THRESHOLD:
            break

        print(f"Removing feature: {max_p_feature} (p-value: {max_p_value:.3f})")
        current_features.remove(max_p_feature)

    # Second phase: Remove features with high multicollinearity.
    # VIF needs at least two columns, so the loop stops at a single feature.
    removed_by_vif = False
    while len(current_features) > 1:
        # Calculate VIF for remaining features (design matrix extracted once per pass)
        design = X_train[current_features].values
        vif_data = pd.DataFrame({
            'Feature': current_features,
            'VIF': [variance_inflation_factor(design, i)
                    for i in range(len(current_features))]
        })

        max_vif = vif_data['VIF'].max()
        max_vif_feature = vif_data.loc[vif_data['VIF'].idxmax(), 'Feature']

        if max_vif < VIF_THRESHOLD:
            print("All VIF values below threshold")
            break

        print(f"Removing feature: {max_vif_feature} (VIF: {max_vif:.1f})")
        current_features.remove(max_vif_feature)
        removed_by_vif = True

    # The phase-1 model still has coefficients for any feature removed by VIF;
    # predicting with it on `current_features` would fail on the column count.
    if removed_by_vif:
        model = fit_logit(current_features)

    return model, current_features

# Main cross-validation loop: each of the five folds in D is held out once.
label_mapping = {'no corrosion': 0, 'corrosion': 1}
non_feature_cols = ['label_binary', 'n_image', 'label_multi']

# One record per fold; the results table is built after the loop instead of
# being grown with pd.concat from an empty frame.
fold_rows = []

# A plain range is used here: tqdm is not among the notebook's imports.
for fold_idx in range(5):
    # Split data into train/test sets
    test_data = D[fold_idx]
    train_data = pd.concat([d for i, d in enumerate(D) if i != fold_idx])

    # Prepare datasets, encoding the labels to 0/1 as required by sm.Logit
    y_test = test_data['label_binary'].map(label_mapping)
    X_test = test_data.drop(columns=non_feature_cols)

    y_train = train_data['label_binary'].map(label_mapping)
    X_train = train_data.drop(columns=non_feature_cols)

    # Standardize features (statistics from the training folds only)
    scaler = StandardScaler()

    # Convert back to DataFrames with the original column names AND the
    # original index. statsmodels refuses to fit when the endog and exog
    # indices differ, which is what a fresh RangeIndex would cause here.
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )

    # Feature selection process
    final_model, selected_features = perform_feature_selection(X_train_scaled, y_train)

    # Make predictions (probabilities thresholded at 0.5)
    start_time = time.time()
    X_test_final = X_test_scaled[selected_features]
    y_pred_proba = final_model.predict(X_test_final)
    y_pred = (y_pred_proba >= 0.5).astype(int)
    elapsed_time = time.time() - start_time

    # Calculate metrics
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    print(f"\nConfusion matrix for fold {fold_idx+1}:\n{cm}")

    fold_rows.append({
        'Model': 'Logistic Regression',
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred, pos_label=1),
        # Specificity is the recall of the negative class
        'Specificity': recall_score(y_test, y_pred, pos_label=0),
        'Precision': precision_score(y_test, y_pred, pos_label=1),
        'F1': f1_score(y_test, y_pred, pos_label=1),
        'Time': elapsed_time
    })

# Per-fold results table
df_results = pd.DataFrame(fold_rows)

# Calculate mean metrics across all folds
mean_results = {
    'Model': 'Logistic Regression',
    'Accuracy': df_results['Accuracy'].mean(),
    'Recall': df_results['Recall'].mean(),
    'Specificity': df_results['Specificity'].mean(),
    'Precision': df_results['Precision'].mean(),
    'F1': df_results['F1'].mean(),
    'Time': df_results['Time'].mean()
}

# Add mean results to DataFrame (the last row is the mean over the folds)
df_results = pd.concat([
    df_results,
    pd.DataFrame([mean_results])
], ignore_index=True)

print("\nFinal Results:")
print(df_results.round(3))

## SVM

In [None]:
# Grid search for SVM: every (C, kernel) pair is scored with the five
# pre-built folds in D and summarised by the mean of its fold metrics.

# Define the columns for the results DataFrame
columns = ['Model', 'C', 'Kernel', 'Accuracy', 'Recall', 'Specificity', 'Precision', 'F1', 'Time']

# Hyperparameter values for SVM
C_values = [0.001, 0.01, 0.1, 1]  # Regularization parameter values
kernels = ['linear', 'rbf']  # Kernel types to test

# Label / identifier columns that are removed before fitting
non_feature_cols = ['label_binary', 'n_image', 'label_multi']

# Class order used for the confusion matrix ('corrosion' is the positive class)
labels = ('corrosion', 'no corrosion')

# One summary record per configuration. The DataFrame is built once after the
# loops rather than grown with pd.concat from an empty frame on each pass.
result_rows = []

# Iterate over each combination of C and kernel
for c in C_values:
    for kernel in kernels:
        # Per-fold metrics for this configuration
        recalls = []
        specificities = []
        precisions = []
        f1s = []
        accuracies = []
        times = []

        # Perform 5-fold cross-validation
        for fold in range(5):
            # The current fold is held out for testing
            test_data = D[fold]
            y_test = test_data['label_binary']
            X_test = test_data.drop(columns=non_feature_cols)

            # The remaining four folds form the training set
            train_data = pd.concat([D[i] for i in range(5) if i != fold], ignore_index=True)
            y_train = train_data['label_binary']
            X_train = train_data.drop(columns=non_feature_cols)

            # Fit the scaler on the training folds only, then apply it to both sets
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train the SVM model with the current hyperparameters
            model = SVC(C=c, kernel=kernel, verbose=False)
            model.fit(X_train_scaled, y_train)

            # Measure prediction time
            t0 = time.time()
            y_pred = model.predict(X_test_scaled)
            t1 = time.time()

            # Print model details and confusion matrix
            print(f'Model with C={c} and kernel={kernel}:')
            cm = confusion_matrix(y_test, y_pred, labels=labels)
            print(f'Confusion matrix for C={c}:\n{cm}')

            # Calculate metrics and store them
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

            recall = recall_score(y_test, y_pred, average='binary', pos_label='corrosion')
            recall = round(recall, 3)
            print(f'Recall for C={c}: {recall}')
            recalls.append(recall)

            # Specificity is the recall of the negative class
            specificity = recall_score(y_test, y_pred, average='binary', pos_label='no corrosion')
            specificity = round(specificity, 3)
            specificities.append(specificity)
            print(f'Specificity: {specificity}')

            precision = precision_score(y_test, y_pred, average='binary', pos_label='corrosion')
            precision = round(precision, 3)
            precisions.append(precision)

            f1 = f1_score(y_test, y_pred, average='binary', pos_label='corrosion')
            f1 = round(f1, 3)
            f1s.append(f1)

            time_taken = t1 - t0
            time_taken = round(time_taken, 3)
            times.append(time_taken)
            print(f'Execution time: {time_taken} seconds\n')

        # Mean of each metric across the five folds
        result_rows.append({
            'Model': 'SVM',
            'C': c,
            'Kernel': kernel,
            'Accuracy': np.mean(accuracies),
            'Recall': np.mean(recalls),
            'Specificity': np.mean(specificities),
            'Precision': np.mean(precisions),
            'F1': np.mean(f1s),
            'Time': np.mean(times)
        })

# Assemble the summary table in a single step
df_results = pd.DataFrame(result_rows, columns=columns)

# Display final results summary
print("\nFinal Results:")
print(df_results.round(3))

In [None]:
# Final SVM model: trained on the full training set, scored on the test set.
C_VALUE = 0.1                # Regularization parameter
KERNEL_TYPE = 'rbf'          # Kernel type for SVM
LABEL_MAPPING = {'no corrosion': 0, 'corrosion': 1}  # Label encoding dictionary


def _features_and_labels(frame):
    """Return (feature columns, raw 'label_binary' column) of `frame`."""
    features = frame.drop(columns=['label_binary', 'label_multi', 'n_image'])
    return features, frame['label_binary']


# --- Data preparation -------------------------------------------------
X_train, y_train = _features_and_labels(data_train)
X_test, y_test = _features_and_labels(data_test)

# --- Feature scaling (statistics from the training data only) ---------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Label encoding: text labels -> 0/1 -------------------------------
y_train_encoded = y_train.map(LABEL_MAPPING)
y_test_encoded = y_test.map(LABEL_MAPPING)

# --- Model training ---------------------------------------------------
model = SVC(C=C_VALUE, kernel=KERNEL_TYPE, verbose=True)
model.fit(X_train_scaled, y_train_encoded)

# --- Model evaluation (only the predict call is timed) ----------------
start_time = time.time()
y_pred = model.predict(X_test_scaled)
execution_time = round(time.time() - start_time, 3)


def _rounded_score(metric_fn, positive):
    """Evaluate `metric_fn` on the test predictions for class `positive`, rounded to 3 decimals."""
    return round(metric_fn(y_test_encoded, y_pred, pos_label=positive), 3)


cm = confusion_matrix(y_test_encoded, y_pred, labels=[1, 0])
accuracy = accuracy_score(y_test_encoded, y_pred)
recall = _rounded_score(recall_score, 1)
specificity = _rounded_score(recall_score, 0)  # recall of the negative class
precision = _rounded_score(precision_score, 1)
f1 = _rounded_score(f1_score, 1)

# --- Results display --------------------------------------------------
for report_line in (
    f'\nModel with C={C_VALUE} and {KERNEL_TYPE} kernel:',
    f'Confusion Matrix:\n{cm}',
    f'Accuracy: {accuracy:.3f}',
    f'Recall (Corrosion): {recall}',
    f'Specificity (No Corrosion): {specificity}',
    f'Precision: {precision}',
    f'F1 Score: {f1}',
    f'Execution Time: {execution_time} seconds',
):
    print(report_line)

# --- Results storage --------------------------------------------------
results_columns = ['Model', 'C', 'Kernel', 'Accuracy',
                  'Recall', 'Specificity', 'Precision', 'F1', 'Time']
results_data = dict(
    zip(
        ['Model', 'C', 'Kernel', 'Accuracy',
         'Recall', 'Specificity', 'Precision', 'F1', 'Time'],
        ['SVM', C_VALUE, KERNEL_TYPE, accuracy,
         recall, specificity, precision, f1, execution_time],
    )
)

df_results = pd.DataFrame([results_data])
print('\nResults DataFrame:')
print(df_results)


[LibSVM]Modelo con 0.1 C y con kernelrbf:
Matriz de confusion para 0.1 estimadores: 
 [[18477  4450]
 [18153 49032]]
Recall para 0.1 estimadores: 0.806
Tiempo de ejecución: 1045.665 segundos
Especificidad: 0.73
Precision: 0.504
F1: 0.62


  df_resultados=pd.concat([df_resultados, pd.DataFrame([resultado_i])], ignore_index=True)


Unnamed: 0,Modelo,C,Kernel,Accuracy,Recall,Especificidad,Precision,F1,Tiempo
0,SVM,0.1,rbf,0.749168,0.806,0.73,0.504,0.62,1045.665


# WRAPPER

In [None]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the cells below; confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [None]:
 =============================================
# Model Configuration & Feature Selection Setup
# =============================================
# Initialize LightGBM classifier with binary classification settings
lgbm_model = LGBMClassifier(
    objective='binary',        # Binary classification task
    boosting_type='gbdt',      # Gradient Boosting Decision Tree
    learning_rate=0.05,        # Shrinkage rate for updates
    num_leaves=10,             # Maximum number of leaves per tree
    n_estimators=50,           # Number of boosting rounds
    random_state=42,           # Seed for reproducibility
    n_jobs=-1,                 # Use all available cores
    eval_metric='logloss',     # Evaluation metric during training
    verbose=-1                 # Silence output
)

# Configure sequential feature selection
feature_selector = SFS(
    estimator=lgbm_model,
    k_features=(5, 25),        # Target feature range (min, max)
    forward=True,               # Forward selection approach
    floating=False,             # No floating selection
    scoring='f1',               # Optimization metric
    cv=5,                       # 5-fold cross-validation
    n_jobs=-1,                  # Use all available cores
    verbose=2                   # Medium verbosity
)

# ======================
# Data Preparation
# ======================
# Prepare training data (exclude non-feature columns)
X_train = data_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_train = data_train['label_binary']

# Encode target labels (0: no corrosion, 1: corrosion)
label_mapping = {'no corrosion': 0, 'corrosion': 1}
y_train_encoded = y_train.map(label_mapping)

# ======================
# Feature Selection
# ======================
# Perform feature selection using training data
feature_selector.fit(X_train, y_train_encoded)

# ======================
# Results Extraction
# ======================
# Get selected feature names and convert to list
selected_features = list(feature_selector.k_feature_names_)
print("Selected features:", selected_features)

In [None]:
# Restrict both datasets to the wrapper-selected features plus the label and
# image-id columns. 'label_multi' is kept as well: the modelling cells drop
# ['label_binary', 'n_image', 'label_multi'] from every fold, and that drop
# raises KeyError if the column is missing.
wrapper_features = ['label_binary', 'n_image', 'label_multi'] + selected_features
data_train_wrapper = data_train[wrapper_features]
data_test_wrapper = data_test[wrapper_features]

# Rebuild the five cross-validation folds from the reduced training set
data_train_wrapper_kf = data_train_wrapper.copy()
X1, X2, X3, X4, X5 = create_kfold_sets(data_train=data_train_wrapper_kf)
D = [X1, X2, X3, X4, X5]

## Random Forest

In [None]:
# Grid search for Random Forest: every (n_estimators, criterion,
# min_samples_leaf) triple is scored with the five folds in D and summarised
# by the mean of its fold metrics.

# Define the columns for the results DataFrame
columns = ['Model', 'N_estimators', 'Criterion', 'Min_samples_leaf',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Define parameters for Grid Search
n_estimators = [25, 50, 75, 100]
criterion = ['gini', 'entropy']
min_samples_leaf = [1, 2, 4, 8, 16, 32]

# Label / identifier columns that are removed before fitting
non_feature_cols = ['label_binary', 'n_image', 'label_multi']

# Class order used for the confusion matrix ('corrosion' is the positive class)
labels = ('corrosion', 'no corrosion')

# One summary record per configuration. The DataFrame is built once after the
# loops rather than grown with pd.concat from an empty frame on each pass.
result_rows = []

for i in n_estimators:
    for k in criterion:
        for d in min_samples_leaf:
            # Per-fold metrics for this configuration
            recalls = []
            precisions = []
            f1s = []
            accuracies = []
            specificities = []
            times = []
            for j in range(5):
                # Fold j is held out; the other four folds form the training set.
                # The comprehension variable is named `fold` so it cannot be
                # confused with the criterion loop variable `k`.
                d_test = D[j]
                y_test = d_test['label_binary']
                X_test = d_test.drop(columns=non_feature_cols)
                d_train = pd.concat([D[fold] for fold in range(5) if fold != j], ignore_index=True)
                y_train = d_train['label_binary']
                X_train = d_train.drop(columns=non_feature_cols)

                # Initialize and fit the model
                model = RandomForestClassifier(
                    n_estimators=i,
                    criterion=k,
                    min_samples_leaf=d,
                    bootstrap=True,
                    random_state=42)
                model.fit(X_train, y_train)

                # Time the prediction only: the stop timestamp is taken right
                # after predict, before any metric is computed or printed.
                t0 = time.time()
                y_pred = model.predict(X_test)
                t1 = time.time()

                print(f'Model with {i} estimators:')
                cm = confusion_matrix(y_test, y_pred, labels=labels)
                print(f'Confusion matrix for {i} estimators: \n {cm}')

                # Calculate metrics
                accuracy = accuracy_score(y_test, y_pred)
                accuracies.append(accuracy)
                recall = recall_score(y_test, y_pred, average='binary', pos_label='corrosion')
                recall = round(recall, 3)
                print(f'Recall for {i} estimators: {recall}')
                recalls.append(recall)
                # Specificity is the recall of the negative class
                specificity = recall_score(y_test, y_pred, average='binary', pos_label='no corrosion')
                specificity = round(specificity, 3)
                specificities.append(specificity)
                print(f'Specificity: {specificity}')
                precision = precision_score(y_test, y_pred, average='binary', pos_label='corrosion')
                precision = round(precision, 3)
                precisions.append(precision)
                f1 = f1_score(y_test, y_pred, average='binary', pos_label='corrosion')
                f1 = round(f1, 3)
                f1s.append(f1)
                time_taken = t1 - t0
                time_taken = round(time_taken, 3)
                times.append(time_taken)
                print(f'Execution time: {time_taken} seconds')
                print('\n')

            # Mean of each metric across the five folds
            result_rows.append({'Model': 'Random Forest', 'Accuracy': np.mean(accuracies), 'N_estimators': i, 'Criterion': k, 'Min_samples_leaf': d,
                                'Recall': np.mean(recalls), 'Specificity': np.mean(specificities), 'Precision': np.mean(precisions), 'F1': np.mean(f1s),
                                'Time': np.mean(times)})

# Assemble the summary table in a single step
df_results = pd.DataFrame(result_rows, columns=columns)


In [None]:
# Rank the Random Forest grid-search configurations by mean recall, best first.
# The grid-search cell stores its table in `df_results`; `df_resultados` is not
# defined anywhere in the cells shown.
df_results.sort_values(by='Recall', ascending=False)

Chosen model

In [None]:
# Final Random Forest model: trained on the training set, scored on the test set.

# Parameters
min_samples = 1
n_estimators = 50
criterion = 'entropy'

# Assuming data_train and data_test are already defined
# NOTE(review): this cell sits in the WRAPPER section but trains on every
# feature of data_train / data_test rather than on data_train_wrapper /
# data_test_wrapper; confirm which is intended.
X_train = data_train.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_train = data_train['label_binary']
X_test = data_test.drop(columns=['label_binary', 'n_image', 'label_multi'])
y_test = data_test['label_binary']

# Initialize and train the model
model = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    min_samples_leaf=min_samples,
    bootstrap=True,
    random_state=42
)
model.fit(X_train, y_train)

# Measure execution time (only the predict call is timed)
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()

# Metrics calculation ('corrosion' is the positive class)
labels = ('corrosion', 'no corrosion')
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Model with {n_estimators} estimators:')
print(f'Confusion matrix for {n_estimators} estimators: \n {cm}')

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy for {n_estimators} estimators: {accuracy}')
recall = recall_score(y_test, y_pred, average='binary', pos_label='corrosion')
recall = round(recall, 3)
print(f'Recall for {n_estimators} estimators: {recall}')
# Specificity is the recall of the negative class
specificity = recall_score(y_test, y_pred, average='binary', pos_label='no corrosion')
specificity = round(specificity, 3)
print(f'Specificity: {specificity}')
precision = precision_score(y_test, y_pred, average='binary', pos_label='corrosion')
precision = round(precision, 3)
print(f'Precision for {n_estimators} estimators: {precision}')
f1 = f1_score(y_test, y_pred, average='binary', pos_label='corrosion')
f1 = round(f1, 3)
print(f'F1 for {n_estimators} estimators: {f1}')
time_taken = t1 - t0
time_taken = round(time_taken, 3)
print(f'Execution time: {time_taken} seconds')

# Column order of the one-row results table
columns = ['Model', 'N_estimators', 'Criterion', 'Min_samples_leaf',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Hyperparameters and metrics of this run, keyed by column name
result_rf = {'Model': 'Random Forest', 'Accuracy': accuracy, 'N_estimators': n_estimators, 'Criterion': criterion, 'Min_samples_leaf': min_samples,
             'Recall': recall, 'Specificity': specificity, 'Precision': precision, 'F1': f1,
             'Time': time_taken}

# Build the frame directly from the record. Concatenating onto an empty
# DataFrame is deprecated in recent pandas and leaves object-dtype columns.
df_results = pd.DataFrame([result_rf], columns=columns)


In [None]:
# Importance score of every input feature, as reported by the fitted model
importancias = model.feature_importances_

# Bar chart of the importances, one bar per feature column
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(X_train.columns, importancias, color='skyblue')
# Feature names are long, so the tick labels are drawn vertically
ax.tick_params(axis='x', labelrotation=90, labelsize=10)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Importance', fontsize=12)
ax.set_title('Feature Importances from Random Forest', fontsize=14)
fig.tight_layout()
plt.show()


In [None]:
# Tabulate the importances with the feature names as the row index
importancias_df = pd.DataFrame({'Importance': importancias}, index=X_train.columns)

# Most important features first
sorted_importances_df = importancias_df.sort_values('Importance', ascending=False)
print(sorted_importances_df)


## XGBOOST

In [None]:
# Grid search for XGBoost: every (n_estimators, learning_rate,
# min_child_weight, eval_metric) combination is scored with the five folds in
# D and summarised by the mean of its fold metrics.

# Define the columns for the results DataFrame
columns = ['Model', 'N_estimators', 'Learning_rate', 'Min_child_weight',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Define parameters for Grid Search
n_estimators = [25, 50, 75, 100, 125, 150]
metric = ['logloss']
learning_rates = [0.01, 0.05, 0.1, 0.5]
min_child_weight = [1, 2, 4, 8, 16, 32]

# Label / identifier columns that are removed before fitting
non_feature_cols = ['label_binary', 'n_image', 'label_multi']

# Text labels are encoded to 0/1 before being passed to XGBoost
mapping = {'no corrosion': 0, 'corrosion': 1}

# Confusion-matrix class order: positive class (1) first
labels = (1, 0)

# One summary record per configuration. The DataFrame is built once after the
# loops rather than grown with pd.concat from an empty frame on each pass.
result_rows = []

for i in n_estimators:
    for d in learning_rates:
        for k in min_child_weight:
            for m in metric:
                # Per-fold metrics for this configuration
                recalls = []
                specificities = []
                precisions = []
                f1s = []
                accuracies = []
                times = []
                for j in range(5):
                    # Fold j is held out; the other four folds form the training set.
                    # The comprehension variable is named `fold` so it cannot be
                    # confused with the min_child_weight loop variable `k`.
                    d_test = D[j]
                    y_test = d_test['label_binary'].map(mapping)
                    X_test = d_test.drop(columns=non_feature_cols)
                    d_train = pd.concat([D[fold] for fold in range(5) if fold != j], ignore_index=True)
                    y_train = d_train['label_binary'].map(mapping)
                    X_train = d_train.drop(columns=non_feature_cols)

                    # Initialize and train the XGBoost model
                    model = xgb(
                        objective='binary:logistic',  # Binary classification
                        n_estimators=i,             # Number of trees (boosting rounds)
                        seed=42,                       # For reproducibility
                        learning_rate=d,
                        min_child_weight=k,
                        eval_metric=m
                    )
                    model.fit(X_train, y_train)

                    # Time the prediction only: the stop timestamp is taken right
                    # after predict, before any metric is computed or printed.
                    t0 = time.time()
                    y_pred = model.predict(X_test)
                    t1 = time.time()

                    print(f'Model with {i} estimators, {d} learning rate, and {k} min child weight:')
                    cm = confusion_matrix(y_test, y_pred, labels=labels)
                    print(f'Confusion matrix for {i} estimators: \n {cm}')

                    # Calculate metrics
                    accuracy = accuracy_score(y_test, y_pred)
                    accuracies.append(accuracy)
                    recall = recall_score(y_test, y_pred, average='binary')
                    recall = round(recall, 3)
                    print(f'Recall for {i} estimators: {recall}')
                    recalls.append(recall)
                    # Specificity is the recall of label 0
                    specificity = recall_score(y_test, y_pred, average='binary', pos_label=0)
                    specificity = round(specificity, 3)
                    specificities.append(specificity)
                    print(f'Specificity: {specificity}')
                    precision = precision_score(y_test, y_pred, average='binary')
                    precision = round(precision, 3)
                    precisions.append(precision)
                    f1 = f1_score(y_test, y_pred, average='binary')
                    f1 = round(f1, 3)
                    f1s.append(f1)
                    time_taken = t1 - t0
                    time_taken = round(time_taken, 3)
                    times.append(time_taken)
                    print(f'Execution time: {time_taken} seconds')
                    print('\n')

                # Mean of each metric across the five folds
                result_rows.append({'Model': 'XGBoost', 'Accuracy': np.mean(accuracies), 'N_estimators': i, 'Learning_rate': d, 'Min_child_weight': k,
                                    'Recall': np.mean(recalls), 'Specificity': np.mean(specificities),
                                    'Precision': np.mean(precisions), 'F1': np.mean(f1s),
                                    'Time': np.mean(times)})

# Assemble the summary table in a single step
df_results = pd.DataFrame(result_rows, columns=columns)


In [None]:
# Show the grid-search summary ordered by recall, highest first
df_results.sort_values(by=['Recall'], ascending=[False])

Chosen model

In [None]:
# Hyperparameters of the selected XGBoost configuration
n_estimators = 75
learning_rate = 0.01
min_child_weight = 32

# Initialize the XGBoost model
model = xgb(
    objective='binary:logistic',  # Binary classification
    n_estimators=n_estimators,     # Number of trees (boosting rounds)
    seed=42,                       # For reproducibility
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    eval_metric='logloss'
)

# Prepare data: both label columns and the image identifier are not model inputs
non_feature_columns = ['label_binary', 'n_image', 'label_multi']
X_train = data_train.drop(columns=non_feature_columns)
X_test = data_test.drop(columns=non_feature_columns)

# Map the string labels to the numeric 0/1 targets used for training and scoring
label_mapping = {'no corrosion': 0, 'corrosion': 1}
y_train = data_train['label_binary'].map(label_mapping)
y_test = data_test['label_binary'].map(label_mapping)

# Train on the full training set
model.fit(X_train, y_train)

# Time the prediction step only
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()
time_taken = round(t1 - t0, 3)

# Print model details and metrics
print(f'Model with {n_estimators} estimators, {learning_rate} learning rate, and {min_child_weight} min child weight:')
labels = (1, 0)  # positive class (corrosion) first in the confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Confusion matrix for {n_estimators} estimators: \n {cm}')

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy for {n_estimators} estimators: {accuracy}')
recall = round(recall_score(y_test, y_pred, average='binary'), 3)
print(f'Recall for {n_estimators} estimators: {recall}')
# Specificity is the recall of the negative class
specificity = round(recall_score(y_test, y_pred, average='binary', pos_label=0), 3)
print(f'Specificity: {specificity}')
precision = round(precision_score(y_test, y_pred, average='binary'), 3)
print(f'Precision for {n_estimators} estimators: {precision}')
f1 = round(f1_score(y_test, y_pred, average='binary'), 3)
print(f'F1 for {n_estimators} estimators: {f1}')
print(f'Execution time: {time_taken} seconds')

# Column order of the one-row results table
columns = ['Model', 'N_estimators', 'Learning_rate', 'Min_child_weight', 'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Build the table directly from the record; concatenating onto an empty
# DataFrame is deprecated in recent pandas and would degrade the column dtypes.
result_xgb = {'Model': 'XGBoost', 'Accuracy': accuracy, 'N_estimators': n_estimators, 'Learning_rate': learning_rate, 'Min_child_weight': min_child_weight,
              'Recall': recall, 'Specificity': specificity, 'Precision': precision, 'F1': f1,
              'Time': time_taken}
df_results = pd.DataFrame([result_xgb], columns=columns)
print(df_results)


## LIGHTGBM GBDT

In [None]:
# Define the columns for the results DataFrame
columns = ['Model', 'N_estimators', 'Learning_rate', 'N_bins', 'Num_leaves', 'Accuracy',
           'Metric',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']
# Start from an empty table so that results left over from an earlier section
# cannot be mistaken for this search if the loop is interrupted.
df_results = pd.DataFrame(columns=columns)

# Define parameters for Grid Search
n_estimators = [25, 50, 75, 100, 125, 150]
metric = ['logloss']
learning_rates = [0.025, 0.05, 0.1, 0.2, 0.4]
n_bins = [255]
num_leaves = [10, 30, 50]

# Both label columns and the image identifier are not model inputs
non_feature_columns = ['label_binary', 'n_image', 'label_multi']
label_mapping = {'no corrosion': 0, 'corrosion': 1}

# One summary row per hyperparameter combination; the DataFrame is built once
# at the end instead of being re-concatenated on every iteration.
grid_rows = []

for i in n_estimators:
    for d in learning_rates:
        for k in n_bins:
            for l in num_leaves:
                for m in metric:
                    recalls = []
                    specificities = []
                    precisions = []
                    f1s = []
                    accuracies = []
                    times = []
                    for j in range(5):
                        # Fold j is held out; the other four folds form the training set
                        d_test = D[j]
                        d_train = pd.concat([D[f] for f in range(5) if f != j], ignore_index=True)
                        X_test = d_test.drop(columns=non_feature_columns)
                        y_test = d_test['label_binary'].map(label_mapping)
                        X_train = d_train.drop(columns=non_feature_columns)
                        y_train = d_train['label_binary'].map(label_mapping)

                        # Initialize and train the LightGBM model
                        model = lgbm(
                            objective='binary',  # Binary classification
                            boosting_type='gbdt',  # Boosting type
                            n_estimators=i,             # Number of trees (boosting rounds)
                            seed=42,                       # For reproducibility
                            learning_rate=d,
                            max_bin=k,  # LightGBM's histogram-bin parameter ('n_bins' is not a LightGBM parameter)
                            num_leaves=l,
                            # NOTE(review): eval_metric is normally an argument of fit() in
                            # LightGBM's scikit-learn API; it may be ignored here - confirm.
                            eval_metric=m,
                            verbose=-1
                        )
                        model.fit(X_train, y_train)

                        # Time the prediction step only (not the metric computation or printing)
                        t0 = time.time()
                        y_pred = model.predict(X_test)
                        t1 = time.time()
                        time_taken = round(t1 - t0, 3)

                        print(f'Model with {i} estimators, {d} learning rate, and {l} leaves:')
                        labels = (1, 0)  # positive class (corrosion) first
                        cm = confusion_matrix(y_test, y_pred, labels=labels)
                        print(f'Confusion matrix for {i} estimators: \n {cm}')

                        # Calculate metrics
                        accuracy = accuracy_score(y_test, y_pred)
                        accuracies.append(accuracy)
                        recall = round(recall_score(y_test, y_pred, average='binary'), 3)
                        print(f'Recall for {i} estimators: {recall}')
                        recalls.append(recall)
                        # Specificity is the recall of the negative class
                        specificity = round(recall_score(y_test, y_pred, average='binary', pos_label=0), 3)
                        specificities.append(specificity)
                        print(f'Specificity: {specificity}')
                        precision = round(precision_score(y_test, y_pred, average='binary'), 3)
                        precisions.append(precision)
                        f1 = round(f1_score(y_test, y_pred, average='binary'), 3)
                        f1s.append(f1)
                        times.append(time_taken)
                        print(f'Execution time: {time_taken} seconds')
                        print('\n')

                    # Calculate mean metrics across folds
                    recall_mean = np.mean(recalls)
                    specificity_mean = np.mean(specificities)
                    precision_mean = np.mean(precisions)
                    f1_mean = np.mean(f1s)
                    accuracy_mean = np.mean(accuracies)
                    time_mean = np.mean(times)

                    # Record the summary row for this combination
                    result_i = {'Model': 'LGBM', 'Accuracy': accuracy_mean, 'N_estimators': i, 'Learning_rate': d, 'N_bins': k, 'Num_leaves': l,
                                'Metric': m,
                                'Recall': recall_mean, 'Specificity': specificity_mean,
                                'Precision': precision_mean, 'F1': f1_mean,
                                'Time': time_mean}
                    grid_rows.append(result_i)

# Assemble the summary table once, in the declared column order
df_results = pd.DataFrame(grid_rows, columns=columns)


In [None]:
# Show the grid-search summary ordered by recall, highest first.
# The grid-search cell stores its output in df_results.
df_results.sort_values(by='Recall', ascending=False)

Chosen model

In [None]:
# Hyperparameters of the selected LightGBM (GBDT) configuration
n_estimators = 25
learning_rate = 0.025
n_bins = 255
num_leaves = 10

# Prepare data: both label columns and the image identifier are not model inputs
non_feature_columns = ['label_binary', 'n_image', 'label_multi']
label_mapping = {'no corrosion': 0, 'corrosion': 1}
X_train = data_train.drop(columns=non_feature_columns)
y_train = data_train['label_binary'].map(label_mapping)
X_test = data_test.drop(columns=non_feature_columns)
y_test = data_test['label_binary'].map(label_mapping)

# Initialize and train the LightGBM model
model = lgbm(
    objective='binary',  # Binary classification
    boosting_type='gbdt',  # Boosting type
    n_estimators=n_estimators,             # Number of trees (boosting rounds)
    seed=42,                       # For reproducibility
    learning_rate=learning_rate,
    # NOTE(review): eval_metric is normally an argument of fit() in LightGBM's
    # scikit-learn API; it may be ignored here - confirm.
    eval_metric='logloss',
    verbose=-1,
    max_bin=n_bins,  # LightGBM's histogram-bin parameter ('n_bins' is not a LightGBM parameter)
    num_leaves=num_leaves
)
model.fit(X_train, y_train)

# Time the prediction step only
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()
time_taken = round(t1 - t0, 3)

# Print model details and metrics
print(f'Model with {n_estimators} estimators:')
labels = (1, 0)  # positive class (corrosion) first in the confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Confusion matrix for estimators: \n {cm}')
accuracy = accuracy_score(y_test, y_pred)
recall = round(recall_score(y_test, y_pred, average='binary'), 3)
print(f'Recall for estimators: {recall}')
print(f'Execution time: {time_taken} seconds')
precision = round(precision_score(y_test, y_pred, average='binary'), 3)
# Specificity is the recall of the negative class
specificity = round(recall_score(y_test, y_pred, average='binary', pos_label=0), 3)
f1 = round(f1_score(y_test, y_pred, average='binary'), 3)
print(f'Specificity: {specificity}')
print(f'Precision: {precision}')
print(f'F1: {f1}')

# Column order of the one-row results table
columns = ['Model', 'N_estimators', 'Accuracy', 'Learning_rate', 'N_bins', 'Num_leaves',
           'Metric',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Build the table directly from the record; concatenating onto an empty
# DataFrame is deprecated in recent pandas and would degrade the column dtypes.
result_i = {'Model': 'LightGBM', 'N_estimators': n_estimators, 'Learning_rate': learning_rate, 'N_bins': n_bins, 'Num_leaves': num_leaves,
            'Metric': 'logloss', 'Accuracy': accuracy,
            'Recall': recall, 'Specificity': specificity,
            'Precision': precision, 'F1': f1,
            'Time': time_taken}
df_results = pd.DataFrame([result_i], columns=columns)

print(df_results)


## LIGHTGBM GOSS

In [None]:
# Define the columns for the results DataFrame
columns = ['Model', 'Top_rate', 'Other_rate', 'Learning_rate', 'Num_leaves',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']
# Start from an empty table so that results left over from an earlier section
# cannot be mistaken for this search if the loop is interrupted.
df_results = pd.DataFrame(columns=columns)

# Define parameters for Grid Search
top_rate = [0.2, 0.4, 0.6]
other_rate = [0.05, 0.1, 0.3]
learning_rates = [0.025, 0.05, 0.1, 0.2]
num_leaves = [10, 30, 50]

# Both label columns and the image identifier are not model inputs
non_feature_columns = ['label_binary', 'n_image', 'label_multi']
label_mapping = {'no corrosion': 0, 'corrosion': 1}

# One summary row per hyperparameter combination; the DataFrame is built once
# at the end instead of being re-concatenated on every iteration.
grid_rows = []

for i in top_rate:
    for d in learning_rates:
        for k in other_rate:
            for l in num_leaves:
                recalls = []
                specificities = []
                precisions = []
                f1s = []
                accuracies = []
                times = []
                for j in range(5):
                    # Fold j is held out; the other four folds form the training set
                    d_test = D[j]
                    d_train = pd.concat([D[f] for f in range(5) if f != j], ignore_index=True)
                    X_test = d_test.drop(columns=non_feature_columns)
                    y_test = d_test['label_binary'].map(label_mapping)
                    X_train = d_train.drop(columns=non_feature_columns)
                    y_train = d_train['label_binary'].map(label_mapping)

                    # Initialize and train the LightGBM model
                    model = lgbm(
                        objective='binary',  # Binary classification
                        boosting_type='goss',  # Boosting type
                        top_rate=i,             # Top rate
                        seed=42,                       # For reproducibility
                        learning_rate=d,
                        other_rate=k,
                        num_leaves=l,
                        verbose=-1
                    )
                    model.fit(X_train, y_train)

                    # Time the prediction step only (not the metric computation or printing)
                    t0 = time.time()
                    y_pred = model.predict(X_test)
                    t1 = time.time()
                    time_taken = round(t1 - t0, 3)

                    print(f'Model with {i} top rate, {d} learning rate, and {l} leaves:')
                    labels = (1, 0)  # positive class (corrosion) first
                    cm = confusion_matrix(y_test, y_pred, labels=labels)
                    print(f'Confusion matrix for {i} top rate: \n {cm}')

                    # Calculate metrics
                    accuracy = accuracy_score(y_test, y_pred)
                    accuracies.append(accuracy)
                    recall = round(recall_score(y_test, y_pred, average='binary'), 3)
                    print(f'Recall for {i} top rate: {recall}')
                    recalls.append(recall)
                    # Specificity is the recall of the negative class
                    specificity = round(recall_score(y_test, y_pred, average='binary', pos_label=0), 3)
                    specificities.append(specificity)
                    print(f'Specificity: {specificity}')
                    precision = round(precision_score(y_test, y_pred, average='binary'), 3)
                    precisions.append(precision)
                    f1 = round(f1_score(y_test, y_pred, average='binary'), 3)
                    f1s.append(f1)
                    times.append(time_taken)
                    print(f'Execution time: {time_taken} seconds')
                    print('\n')

                # Calculate mean metrics across folds
                recall_mean = np.mean(recalls)
                specificity_mean = np.mean(specificities)
                precision_mean = np.mean(precisions)
                f1_mean = np.mean(f1s)
                accuracy_mean = np.mean(accuracies)
                time_mean = np.mean(times)

                # Record the summary row for this combination
                result_i = {'Model': 'LightGBM', 'Accuracy': accuracy_mean, 'Top_rate': i, 'Learning_rate': d, 'Other_rate': k, 'Num_leaves': l,
                            'Recall': recall_mean, 'Specificity': specificity_mean,
                            'Precision': precision_mean, 'F1': f1_mean,
                            'Time': time_mean}
                grid_rows.append(result_i)

# Assemble the summary table once, in the declared column order
df_results = pd.DataFrame(grid_rows, columns=columns)


In [None]:
# Show the grid-search summary ordered by recall, highest first.
# The grid-search cell stores its output in df_results.
df_results.sort_values(by='Recall', ascending=False)

Chosen model

In [None]:
# Hyperparameters of the selected LightGBM (GOSS) configuration
top_rate = 0.2
learning_rate = 0.05
other_rate = 0.05
num_leaves = 10

# Prepare training and testing data: both label columns and the image
# identifier are not model inputs
non_feature_columns = ['label_binary', 'label_multi', 'n_image']
label_mapping = {'no corrosion': 0, 'corrosion': 1}
X_train = data_train.drop(columns=non_feature_columns)
y_train = data_train['label_binary'].map(label_mapping)

X_test = data_test.drop(columns=non_feature_columns)
y_test = data_test['label_binary'].map(label_mapping)

# Initialize and train the LightGBM model
model = lgbm(
    objective='binary',  # Binary classification
    boosting_type='goss',  # Boosting type
    top_rate=top_rate,             # Top rate
    seed=42,                       # For reproducibility
    learning_rate=learning_rate,
    other_rate=other_rate,
    num_leaves=num_leaves,
    verbose=-1
)
model.fit(X_train, y_train)

# Time the prediction step only
t0 = time.time()
y_pred = model.predict(X_test)
t1 = time.time()
time_taken = round(t1 - t0, 3)

# Print model details and metrics
print(f'Model with {num_leaves} leaves, {learning_rate} learning rate, {top_rate} top rate, and {other_rate} other rate:')
labels = (1, 0)  # positive class (corrosion) first in the confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(f'Confusion matrix: \n {cm}')
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
recall = round(recall_score(y_test, y_pred, average='binary'), 3)
print(f'Recall: {recall}')
# Specificity is the recall of the negative class
specificity = round(recall_score(y_test, y_pred, average='binary', pos_label=0), 3)
print(f'Specificity: {specificity}')
precision = round(precision_score(y_test, y_pred, average='binary'), 3)
print(f'Precision: {precision}')
f1 = round(f1_score(y_test, y_pred, average='binary'), 3)
print(f'F1: {f1}')
print(f'Execution time: {time_taken} seconds')
print('\n')

# Column order of the one-row results table
columns = ['Model', 'Learning_rate', 'Num_leaves', 'Top_rate', 'Other_rate',
           'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']

# Build the table directly from the record; concatenating onto an empty
# DataFrame is deprecated in recent pandas and would degrade the column dtypes.
result_lgbm = {'Model': 'LightGBM', 'Accuracy': accuracy, 'Top_rate': top_rate, 'Other_rate': other_rate, 'Learning_rate': learning_rate, 'Num_leaves': num_leaves,
               'Recall': recall, 'Specificity': specificity, 'Precision': precision, 'F1': f1,
               'Time': time_taken}
df_results = pd.DataFrame([result_lgbm], columns=columns)
print(df_results)


## KNN

In [None]:
# Define the columns for the results DataFrame
columns = ['Model', 'N_neighbors',  'Distance', 'Accuracy',
           'Recall',
           'Specificity',
           'Precision',
           'F1',
           'Time']
# Start from an empty table so that results left over from an earlier section
# cannot be mistaken for this search if the loop is interrupted.
df_results = pd.DataFrame(columns=columns)

# List of neighbors to test
N_neighbors = [1, 3, 5, 11, 21, 41, 61]

# List of distance metrics to use
distance_metrics = ['euclidean', 'manhattan']

# Both label columns and the image identifier are not model inputs
non_feature_columns = ['label_binary', 'n_image', 'label_multi']

# One summary row per (neighbors, distance) combination; the DataFrame is built
# once at the end instead of being re-concatenated on every iteration.
grid_rows = []

# Iterate over each number of neighbors and distance metric
for i in N_neighbors:
    for d in distance_metrics:
        # Per-fold metrics for this configuration
        recalls = []
        specificities = []
        precisions = []
        f1s = []
        accuracies = []
        times = []

        # Perform cross-validation by iterating over each fold
        for j in range(5):

            # Fold j is held out; the other four folds form the training set
            d_test = D[j]
            y_test = d_test['label_binary']
            X_test = d_test.drop(columns=non_feature_columns)

            d_train = pd.concat([D[f] for f in range(5) if f != j], ignore_index=True)
            y_train = d_train['label_binary']
            X_train = d_train.drop(columns=non_feature_columns)

            # Fit the scaler on the training folds only, then apply it to the held-out fold
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Initialize and fit the KNeighborsClassifier model
            model = KNeighborsClassifier(n_neighbors=i, metric=d)
            model.fit(X_train, y_train)

            # Time the prediction step only
            t0 = time.time()
            y_pred = model.predict(X_test)
            t1 = time.time()

            # Print model details (labels are the raw strings in this section)
            print(f'Model with {i} neighbors using {d} distance:')
            labels = ('corrosion', 'no corrosion')
            cm = confusion_matrix(y_test, y_pred, labels=labels)
            print(f'Confusion matrix for {i} neighbors:\n {cm}')

            # Calculate and store accuracy
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

            # Calculate and store recall
            recall = round(recall_score(y_test, y_pred, average='binary', pos_label='corrosion'), 3)
            print(f'Recall for {i} neighbors: {recall}')
            recalls.append(recall)

            # Specificity is the recall of the negative class
            specificity = round(recall_score(y_test, y_pred, average='binary', pos_label='no corrosion'), 3)
            print(f'Specificity: {specificity}')
            specificities.append(specificity)

            # Calculate and store precision
            precision = round(precision_score(y_test, y_pred, average='binary', pos_label='corrosion'), 3)
            precisions.append(precision)

            # Calculate and store F1 score
            f1 = round(f1_score(y_test, y_pred, average='binary', pos_label='corrosion'), 3)
            f1s.append(f1)

            # Calculate and store time taken
            time_taken = round(t1 - t0, 3)
            times.append(time_taken)
            print(f'Execution time: {time_taken} seconds')
            print('\n')

        # Calculate mean metrics across all folds
        recall_mean = np.mean(recalls)
        specificity_mean = np.mean(specificities)
        precision_mean = np.mean(precisions)
        f1_mean = np.mean(f1s)
        accuracy_mean = np.mean(accuracies)
        time_mean = np.mean(times)

        # Record the summary row for this model configuration
        result_i = {
            'Model': 'KNN',
            'Accuracy': accuracy_mean,
            'N_neighbors': i,
            'Distance': d,
            'Recall': recall_mean,
            'Specificity': specificity_mean,
            'Precision': precision_mean,
            'F1': f1_mean,
            'Time': time_mean
        }
        grid_rows.append(result_i)

# Assemble the summary table once, in the declared column order
df_results = pd.DataFrame(grid_rows, columns=columns)


In [None]:
# Configuration of the final KNN model
K = 61                                  # neighbours used for voting
DISTANCE_METRIC = 'manhattan'           # metric passed to KNeighborsClassifier
LABELS = ('corrosion', 'no corrosion')  # row/column order of the confusion matrix

# ------------------------------------------------------------------
# Data: features are every column except the two labels and the image id;
# the target is the raw string label.
# ------------------------------------------------------------------
X_train, X_test = (
    frame.drop(columns=['label_binary', 'n_image', 'label_multi'])
    for frame in (data_train, data_test)
)
y_train, y_test = data_train['label_binary'], data_test['label_binary']

# ------------------------------------------------------------------
# Scaling: statistics are learned on the training set and reused on the test set
# ------------------------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------------------------------------
# Fit
# ------------------------------------------------------------------
model = KNeighborsClassifier(n_neighbors=K, metric=DISTANCE_METRIC)
model.fit(X_train_scaled, y_train)

# ------------------------------------------------------------------
# Predict (only this step is timed)
# ------------------------------------------------------------------
start_time = time.time()
y_pred = model.predict(X_test_scaled)
execution_time = round(time.time() - start_time, 3)

# ------------------------------------------------------------------
# Metrics
# ------------------------------------------------------------------
cm = confusion_matrix(y_test, y_pred, labels=LABELS)
accuracy = accuracy_score(y_test, y_pred)
# Scores that treat 'corrosion' as the positive class, rounded to 3 decimals
recall, precision, f1 = (
    round(score_fn(y_test, y_pred, average='binary', pos_label='corrosion'), 3)
    for score_fn in (recall_score, precision_score, f1_score)
)
# Specificity is the recall of the 'no corrosion' class
specificity = round(recall_score(y_test, y_pred, average='binary', pos_label='no corrosion'), 3)

# ------------------------------------------------------------------
# Report
# ------------------------------------------------------------------
print(
    f'\nModel with {K} neighbors using {DISTANCE_METRIC} distance:',
    f'Confusion Matrix:\n{cm}',
    f'Accuracy: {accuracy:.3f}',
    f'Recall (Corrosion Detection): {recall}',
    f'Specificity (No Corrosion): {specificity}',
    f'Precision: {precision}',
    f'F1 Score: {f1}',
    f'Execution Time: {execution_time} seconds',
    sep='\n',
)

# ------------------------------------------------------------------
# Store the single result row
# ------------------------------------------------------------------
results_columns = ['Model', 'N_neighbors', 'Distance', 'Accuracy',
                   'Recall', 'Specificity', 'Precision', 'F1', 'Time']

# Pair each column name with its value, in the same order as results_columns
results_data = dict(zip(
    results_columns,
    ['KNN', K, DISTANCE_METRIC, accuracy, recall, specificity, precision, f1, execution_time],
))

df_final_results = pd.DataFrame([results_data])
print('\nFinal Results DataFrame:', df_final_results, sep='\n')

## Logistic regression

In [None]:
# Column order of the logistic-regression results table
results_columns = [
    'Model',
    'Accuracy',
    'Recall',
    'Specificity',
    'Precision',
    'F1',
    'Time',
]
df_results = pd.DataFrame(columns=results_columns)

# Settings read by the feature-selection routine below
VIF_THRESHOLD = 10      # elimination stops once the largest VIF is below this
PVAL_THRESHOLD = 0.01   # elimination stops once the largest p-value is below this
MAX_ITERATIONS = 100    # passed as maxiter to the Logit fit

def perform_feature_selection(X_train, y_train):
    """
    Perform iterative feature selection using p-values and VIF analysis.

    Phase 1 repeatedly fits a statsmodels Logit on the current features and
    drops the feature with the largest p-value until every p-value is below
    PVAL_THRESHOLD. Phase 2 repeatedly drops the feature with the largest
    variance inflation factor until every VIF is below VIF_THRESHOLD.

    If phase 2 removed anything, the model is refitted on the surviving
    features so that the returned model and the returned feature list always
    match (otherwise predicting on the selected columns would fail with a
    column-count mismatch). P-values are not re-checked after that refit.

    Args:
        X_train: Training features DataFrame
        y_train: Training labels Series

    Returns:
        Tuple: (Final model, List of selected features)

    Raises:
        ValueError: If there are no features to start with, or if the p-value
            phase eliminates every feature.
    """
    def _fit_logit(features):
        """Fit a Logit model on the given subset of feature columns."""
        return sm.Logit(y_train, X_train[features]).fit(
            disp=0,
            maxiter=MAX_ITERATIONS
        )

    current_features = X_train.columns.tolist()
    if not current_features:
        raise ValueError("X_train has no feature columns to select from")

    # First phase: Remove features with high p-values
    while True:
        model = _fit_logit(current_features)

        # Get highest p-value feature
        p_values = model.pvalues
        max_p_feature = p_values.idxmax()
        max_p_value = p_values.max()

        if max_p_value < PVAL_THRESHOLD:
            break

        print(f"Removing feature: {max_p_feature} (p-value: {max_p_value:.3f})")
        current_features.remove(max_p_feature)

        if not current_features:
            # Fail with a clear message instead of fitting a model with no columns
            raise ValueError(
                "Feature selection removed every feature: none has a p-value "
                f"below {PVAL_THRESHOLD}"
            )

    # Second phase: Remove features with high multicollinearity
    removed_for_vif = False
    while True:
        if len(current_features) < 2:
            # A VIF needs at least one other regressor to compare against
            break

        # Calculate VIF for remaining features
        design = X_train[current_features].values
        vif_data = pd.DataFrame()
        vif_data['Feature'] = current_features
        vif_data['VIF'] = [variance_inflation_factor(design, i)
                           for i in range(len(current_features))]

        max_vif = vif_data['VIF'].max()
        max_vif_feature = vif_data.loc[vif_data['VIF'].idxmax(), 'Feature']

        if max_vif < VIF_THRESHOLD:
            print("All VIF values below threshold")
            break

        print(f"Removing feature: {max_vif_feature} (VIF: {max_vif:.1f})")
        current_features.remove(max_vif_feature)
        removed_for_vif = True

    # The model from phase 1 was fitted before any VIF removals; refit so its
    # coefficients correspond exactly to the features being returned.
    if removed_for_vif:
        model = _fit_logit(current_features)

    return model, current_features

# Main cross-validation loop
# Both label columns and the image identifier are not model inputs
non_feature_columns = ['label_binary', 'n_image', 'label_multi']
label_mapping = {'no corrosion': 0, 'corrosion': 1}

# One metrics row per fold; the DataFrame is built once after the loop
fold_rows = []

# Plain range(): tqdm is not imported in the notebook's import cell, and each
# fold already prints its own progress.
for fold_idx in range(5):
    # Fold fold_idx is held out; the other folds form the training set
    test_data = D[fold_idx]
    train_data = pd.concat([d for i, d in enumerate(D) if i != fold_idx])

    # Prepare datasets, encoding labels to numerical values
    y_test = test_data['label_binary'].map(label_mapping)
    X_test = test_data.drop(columns=non_feature_columns)

    y_train = train_data['label_binary'].map(label_mapping)
    X_train = train_data.drop(columns=non_feature_columns)

    # Standardize features (statistics learned on the training folds only)
    scaler = StandardScaler()

    # Convert back to DataFrames with the original column names AND the original
    # row index: statsmodels refuses to fit when the label and feature indices
    # differ, and y_train keeps the index of the concatenated folds.
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    # Feature selection process
    final_model, selected_features = perform_feature_selection(X_train_scaled, y_train)

    # Make predictions: the model outputs probabilities, thresholded at 0.5
    start_time = time.time()
    X_test_final = X_test_scaled[selected_features]
    y_pred_proba = final_model.predict(X_test_final)
    y_pred = (y_pred_proba >= 0.5).astype(int)
    elapsed_time = time.time() - start_time

    # Calculate metrics
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    print(f"\nConfusion matrix for fold {fold_idx+1}:\n{cm}")

    # Store metrics for this fold (specificity is the recall of class 0)
    fold_rows.append({
        'Model': 'Logistic Regression',
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred, pos_label=1),
        'Specificity': recall_score(y_test, y_pred, pos_label=0),
        'Precision': precision_score(y_test, y_pred, pos_label=1),
        'F1': f1_score(y_test, y_pred, pos_label=1),
        'Time': elapsed_time
    })

# Per-fold table, in the declared column order
df_results = pd.DataFrame(fold_rows, columns=results_columns)

# Calculate mean metrics across all folds
metric_columns = ['Accuracy', 'Recall', 'Specificity', 'Precision', 'F1', 'Time']
mean_results = {
    'Model': 'Logistic Regression',
    **df_results[metric_columns].mean().to_dict()
}

# Add mean results to DataFrame as the last row
df_results = pd.concat([
    df_results,
    pd.DataFrame([mean_results])
], ignore_index=True)

print("\nFinal Results:")
print(df_results.round(3))

## SVM

In [None]:
# Grid search over SVM hyperparameters (C x kernel) with 5-fold cross-validation.
# For every configuration the model is trained on four folds and evaluated on the
# held-out fold; the per-fold metrics are averaged into one summary row.
# NOTE(review): D is defined in an earlier cell and is assumed to be an indexable
# collection of five fold DataFrames (D[0] .. D[4]) - confirm against that cell.

# Define the columns for the results DataFrame
columns = ['Model', 'C', 'Kernel', 'Accuracy', 'Recall', 'Specificity', 'Precision', 'F1', 'Time']

# Hyperparameter values for SVM
C_values = [0.001, 0.01, 0.1, 1]  # Regularization parameter values
kernels = ['linear', 'rbf']  # Kernel types to test

# Columns removed before fitting: the two target columns and the 'n_image' key
non_feature_columns = ['label_binary', 'n_image', 'label_multi']

# Row/column order of the confusion matrix (positive class 'corrosion' first)
labels = ('corrosion', 'no corrosion')

# Summary rows are collected in a plain list and converted to a DataFrame once,
# after the loops. Growing a DataFrame with pd.concat on every iteration is
# quadratic, and concatenating onto an initially empty frame is deprecated in
# recent pandas (it can leave the metric columns with object dtype, which
# .round() would then silently skip).
result_rows = []

# Iterate over each combination of C and kernel
for c in C_values:
    for kernel in kernels:
        # Per-fold metrics for this configuration. Values are stored unrounded so
        # the fold averages are not distorted by intermediate rounding; rounding
        # is applied only when printing.
        recalls = []
        specificities = []
        precisions = []
        f1s = []
        accuracies = []
        times = []

        # Perform 5-fold cross-validation
        for fold in range(5):
            # Held-out fold is the test set ...
            test_data = D[fold]
            y_test = test_data['label_binary']
            X_test = test_data.drop(columns=non_feature_columns)

            # ... and the remaining four folds form the training set
            train_data = pd.concat([D[i] for i in range(5) if i != fold], ignore_index=True)
            y_train = train_data['label_binary']
            X_train = train_data.drop(columns=non_feature_columns)

            # Fit the scaler on the training folds only, then apply the same
            # transformation to the held-out fold (no test data leaks into scaling)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train the SVM model with the current hyperparameters
            model = SVC(C=c, kernel=kernel, verbose=False)
            model.fit(X_train_scaled, y_train)

            # Measure prediction time only (training time is excluded).
            # perf_counter is a monotonic, high-resolution clock intended for
            # measuring short intervals.
            t0 = time.perf_counter()
            y_pred = model.predict(X_test_scaled)
            t1 = time.perf_counter()

            # Print model details and confusion matrix
            print(f'Model with C={c} and kernel={kernel}:')
            cm = confusion_matrix(y_test, y_pred, labels=labels)
            print(f'Confusion matrix for C={c}:\n{cm}')

            # Calculate metrics and store them
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

            recall = recall_score(y_test, y_pred, average='binary', pos_label='corrosion')
            print(f'Recall for C={c}: {round(recall, 3)}')
            recalls.append(recall)

            # Specificity is the recall of the negative class ('no corrosion')
            specificity = recall_score(y_test, y_pred, average='binary', pos_label='no corrosion')
            specificities.append(specificity)
            print(f'Specificity: {round(specificity, 3)}')

            precision = precision_score(y_test, y_pred, average='binary', pos_label='corrosion')
            precisions.append(precision)

            f1 = f1_score(y_test, y_pred, average='binary', pos_label='corrosion')
            f1s.append(f1)

            time_taken = t1 - t0
            times.append(time_taken)
            print(f'Execution time: {round(time_taken, 3)} seconds\n')

        # Calculate mean metrics across all folds
        mean_recall = np.mean(recalls)
        mean_specificity = np.mean(specificities)
        mean_precision = np.mean(precisions)
        mean_f1 = np.mean(f1s)
        mean_accuracy = np.mean(accuracies)
        mean_time = np.mean(times)

        # Store the averaged results for this configuration
        result_rows.append({
            'Model': 'SVM',
            'C': c,
            'Kernel': kernel,
            'Accuracy': mean_accuracy,
            'Recall': mean_recall,
            'Specificity': mean_specificity,
            'Precision': mean_precision,
            'F1': mean_f1,
            'Time': mean_time
        })

# Build the results table once; `columns` fixes the column order
df_results = pd.DataFrame(result_rows, columns=columns)

# Display final results summary
print("\nFinal Results:")
print(df_results.round(3))

In [None]:
# Train a single SVM on the full training set with fixed hyperparameters and
# evaluate it on the separate test set (data_train / data_test are loaded in an
# earlier cell).
C_VALUE = 0.1                # Regularization parameter
KERNEL_TYPE = 'rbf'          # Kernel type for SVM
LABEL_MAPPING = {            # Label encoding dictionary
    'no corrosion': 0,
    'corrosion': 1
}


def _encode_binary_labels(labels, split_name):
    """Encode string class labels as integers using LABEL_MAPPING.

    Series.map() silently produces NaN for any value that is not a key of the
    mapping, which would only surface later as an unclear error or distorted
    metrics. This helper fails fast instead.

    Parameters
    ----------
    labels : pandas.Series
        Raw class labels.
    split_name : str
        Name of the data split, used only in the error message.

    Returns
    -------
    pandas.Series
        The labels mapped through LABEL_MAPPING.

    Raises
    ------
    ValueError
        If any label (including a missing value) has no entry in LABEL_MAPPING.
    """
    encoded = labels.map(LABEL_MAPPING)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown_values = labels[unmapped].unique().tolist()
        raise ValueError(
            f'{split_name} labels contain values missing from LABEL_MAPPING: {unknown_values}'
        )
    return encoded


# ========================
# Data Preparation
# ========================
# Separate features and labels for training data
X_train = data_train.drop(columns=['label_binary', 'label_multi', 'n_image'])
y_train = data_train['label_binary']

# Separate features and labels for test data
X_test = data_test.drop(columns=['label_binary', 'label_multi', 'n_image'])
y_test = data_test['label_binary']

# ========================
# Feature Scaling
# ========================
# Scaling statistics come from the training data only and are reused for the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ========================
# Label Encoding
# ========================
y_train_encoded = _encode_binary_labels(y_train, 'Training')
y_test_encoded = _encode_binary_labels(y_test, 'Test')

# ========================
# Model Training
# ========================
model = SVC(C=C_VALUE, kernel=KERNEL_TYPE, verbose=True)
model.fit(X_train_scaled, y_train_encoded)

# ========================
# Model Evaluation
# ========================
# Time prediction only (training is excluded); perf_counter is a monotonic,
# high-resolution clock intended for measuring short intervals
start_time = time.perf_counter()
y_pred = model.predict(X_test_scaled)
execution_time = round(time.perf_counter() - start_time, 3)

# Calculate evaluation metrics.
# labels=[1, 0] puts the positive class (corrosion) in the first row/column.
cm = confusion_matrix(y_test_encoded, y_pred, labels=[1, 0])
accuracy = accuracy_score(y_test_encoded, y_pred)
recall = round(recall_score(y_test_encoded, y_pred, pos_label=1), 3)
# Specificity is the recall of the negative class (no corrosion)
specificity = round(recall_score(y_test_encoded, y_pred, pos_label=0), 3)
precision = round(precision_score(y_test_encoded, y_pred, pos_label=1), 3)
f1 = round(f1_score(y_test_encoded, y_pred, pos_label=1), 3)

# ========================
# Results Display
# ========================
print(f'\nModel with C={C_VALUE} and {KERNEL_TYPE} kernel:')
print(f'Confusion Matrix:\n{cm}')
print(f'Accuracy: {accuracy:.3f}')
print(f'Recall (Corrosion): {recall}')
print(f'Specificity (No Corrosion): {specificity}')
print(f'Precision: {precision}')
print(f'F1 Score: {f1}')
print(f'Execution Time: {execution_time} seconds')

# ========================
# Results Storage
# ========================
results_columns = ['Model', 'C', 'Kernel', 'Accuracy',
                  'Recall', 'Specificity', 'Precision', 'F1', 'Time']
results_data = {
    'Model': 'SVM',
    'C': C_VALUE,
    'Kernel': KERNEL_TYPE,
    'Accuracy': accuracy,
    'Recall': recall,
    'Specificity': specificity,
    'Precision': precision,
    'F1': f1,
    'Time': execution_time
}

# results_columns pins the column order of the single-row results table
df_results = pd.DataFrame([results_data], columns=results_columns)
print('\nResults DataFrame:')
print(df_results)
