### Libraries

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import time
import pandas as pd
import seaborn as sns
from scipy.stats import mstats
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from scipy import stats
import random



In [None]:
# Load the full feature table from disk
data = pd.read_csv('path_to_the_data')

### NaN analysis

In [None]:

# Names of the columns that contain at least one NaN
has_nan = data.isna().any()
columnas_con_nans = has_nan[has_nan].index.tolist()
columnas_con_nans

In [None]:
# Whether the dataset contains any NaN at all (True/False, not a count)
data.isna().sum().sum() > 0

## Train-test division

In [None]:
# Hold out 20% of the images for testing and keep the other 80% for training:
# 88 image numbers are drawn from 0..439, the remaining 352 form the training set.
N_IMAGES = 440
N_TEST = 88

random.seed(42)  # Set the random seed for reproducibility
test_indices = random.sample(range(N_IMAGES), N_TEST)  # Image numbers reserved for testing

# Complementary set for training. Membership is tested against a set so each
# lookup is O(1) instead of re-scanning the test list for every candidate.
_test_index_set = set(test_indices)
train_indices = [i for i in range(N_IMAGES) if i not in _test_index_set]


In [None]:
# Split the rows by image number so every row of a given image ends up in the
# same subset. The explicit .copy() makes each split an independent DataFrame,
# so modifying the splits afterwards does not raise SettingWithCopyWarning.
data_test = data[data['n_imagen'].isin(test_indices)].copy()
data_train = data[data['n_imagen'].isin(train_indices)].copy()


In [None]:
# (rows, columns) of the training split
data_train.shape

(360448, 88)

In [None]:
# Delete the ID column.
# The splits were obtained by filtering `data`; dropping in place on such a
# selection raises SettingWithCopyWarning, so the result is reassigned instead.
data_train = data_train.drop(columns=['ID'])
data_test = data_test.drop(columns=['ID'])

In [None]:
# Distribution of the target variable over the whole dataset
label_and_bookkeeping = ['etiqueta', 'n_imagen', 'etiqueta_multi']
target_variable = data['etiqueta']  # label column
features = data.drop(label_and_bookkeeping, axis=1)  # everything except the label/bookkeeping columns

# Histogram of the label values
plt.hist(target_variable)
plt.title('Distribution of Target Variable')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.show()


## Outlier detection: One Class SVM

In [None]:
# Feature-only view of the training split used for outlier detection
data_train_outliers = data_train.drop(['etiqueta', 'n_imagen', 'etiqueta_multi'], axis=1)

In [None]:
# One-Class SVM for outlier detection

from sklearn.preprocessing import StandardScaler

# Parameters
nu = 0.01  # Passed to OneClassSVM; roughly the expected fraction of outliers
random.seed(42)  # Seeds Python's `random` module only

# Select the feature columns. A previously added 'outlier' column is excluded
# so that re-running this cell does not feed old predictions back in as a feature.
df = data_train_outliers
features = df.columns.drop('outlier', errors='ignore')

# Deliberately NOT named `data`: that name holds the full dataset loaded from
# the CSV and must not be overwritten by the training feature subset.
outlier_features = df[features]

# Standardize the data (important for SVM)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(outlier_features)

# Train the One-Class SVM model
# Adjust 'nu' based on the expected percentage of outliers
model = OneClassSVM(kernel='rbf', gamma='scale', nu=nu, verbose=True)
model.fit(data_scaled)

# Prediction (1: inliers, -1: outliers), stored next to the features
df['outlier'] = model.predict(data_scaled)

# Identify and save outliers
outliers = df[df['outlier'] == -1]
print(f"Total outliers detected: {len(outliers)}")
outliers.to_csv('detected_outliers.csv', index=False)

# Display some outliers
print(outliers.head())


In [None]:
# Share of the training rows that were flagged as outliers, in percent
n_flagged = len(outliers)
n_training_rows = len(data_train)
percentage_outliers = (n_flagged / n_training_rows) * 100
print(f"Percentage of outliers: {percentage_outliers:.2f}%")


In [None]:
# Row labels of the detected outliers
outlier_indices = outliers.index

# Keep only the training rows whose label is not among the outliers
is_outlier_row = data_train.index.isin(outlier_indices)
data_train = data_train[~is_outlier_row]


In [None]:
# (rows, columns) of the training split at this point
data_train.shape

## Correlation analysis

In [None]:
# Separate the training features from the label
target = data_train['etiqueta']
features = data_train.drop(['etiqueta', 'n_imagen', 'etiqueta_multi'], axis=1)

# Pairwise correlation between all feature columns
correlation_matrix = features.corr()

# Heatmap helper
def plot_correlation_heatmap(matrix):
    """Draw `matrix` as a heatmap on a fixed [-1, 1] colour scale and show it.

    Sets the global Matplotlib font family to serif as a side effect.
    """
    plt.rcParams["font.family"] = 'serif'
    plt.figure(figsize=(16, 14))

    heatmap_options = dict(
        annot=False,       # no numbers inside the cells
        cmap="coolwarm",
        cbar=True,
        vmin=-1,           # colour bar spans -1 .. 1, centred on 0
        vmax=1,
        center=0,
        xticklabels=True,  # label every column
        yticklabels=True,  # label every row
        square=True,       # square cells
    )
    sns.heatmap(matrix, **heatmap_options)
    plt.show()

# Usage: draw the heatmap of the feature correlation matrix
plot_correlation_heatmap(correlation_matrix)


In [None]:
# Set the correlation threshold
threshold = 0.80

# Function to identify highly correlated variable pairs
def find_high_correlations(matrix, threshold):
    """Return each pair of distinct variables whose correlation exceeds `threshold`.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Square, symmetric correlation matrix (same labels on rows and columns).
    threshold : float
        Pairs are kept when their correlation is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable1', 'Variable2', 'Correlation', sorted by the two
        variable names. Each unordered pair appears exactly once.
    """
    # Only the strict upper triangle is considered: this removes the diagonal
    # (self-correlations) and the mirrored duplicate of every pair in one step.
    upper_only = np.triu(np.ones(matrix.shape, dtype=bool), k=1)
    above_threshold = (matrix > threshold).to_numpy()
    selected = matrix.where(upper_only & above_threshold)

    # Long format; dropna() keeps the result identical whether or not stack()
    # itself discards the masked (NaN) cells.
    pairs = selected.stack().dropna()
    pairs = (pairs.rename_axis(['Variable1', 'Variable2'])
                  .rename('Correlation')
                  .reset_index())

    return pairs.sort_values(by=['Variable1', 'Variable2'])

# Usage
high_correlation = find_high_correlations(correlation_matrix, threshold)

# Print one line per highly correlated pair
for pair in high_correlation.itertuples(index=False):
    print(f"Variables: {pair.Variable1} and {pair.Variable2}, Correlation: {pair.Correlation:.2f}")


In [None]:
# Columns to drop because they are highly correlated with an earlier column.
# Only the strict lower triangle is inspected, so each pair (i, j) with j < i is
# looked at once and it is always the later column (i) that is marked for removal.
lower_only = np.tril(np.ones(correlation_matrix.shape, dtype=bool), k=-1)
exceeds_threshold = (correlation_matrix.to_numpy() > threshold) & lower_only

# A column is removed when any earlier column correlates with it above the
# threshold. Names come out once each, in column order (deterministic).
columns_to_remove = correlation_matrix.columns[exceeds_threshold.any(axis=1)].tolist()
print(f"Number of columns to remove: {len(columns_to_remove)}")


In [None]:
# Drop the highly correlated columns from both splits
data_train = data_train.drop(columns_to_remove, axis=1)
data_test = data_test.drop(columns_to_remove, axis=1)

In [None]:
# Columns of the training split that hold a single distinct value
constant_columns = [
    column for column in data_train.columns
    if data_train[column].nunique() == 1
]
print(constant_columns)


In [None]:
# Remove constant columns from both splits.
# The list built in the previous cell is called `constant_columns`
# (the old name `columnas_constantes` is never defined and raised NameError).
data_train = data_train.drop(columns=constant_columns)
data_test = data_test.drop(columns=constant_columns)

In [None]:
# Columns remaining in the training split
data_train.columns

In [None]:
# (rows, columns) of the training split at this point
data_train.shape

## Undersampling

In [None]:
# Count the number of samples with 'corrosion' in the 'etiqueta' column
t_corrosion = data_train[data_train['etiqueta'] == 'corrosion'].shape[0]

# Count the number of samples with 'no corrosion' in the 'etiqueta' column
t_no_corrosion = data_train[data_train['etiqueta'] == 'no corrosion'].shape[0]

# Surplus of 'no corrosion' rows to discard. Clamped at 0 so that a training set
# where 'no corrosion' is not the majority is left untouched instead of making
# sample() fail on a negative n.
data_to_remove = max(t_no_corrosion - t_corrosion, 0)

print(f"Data to remove: {data_to_remove}")

# Randomly remove data to balance classes.
# DataFrame.sample does not use Python's `random` module, so random.seed() alone
# does not make the draw repeatable; random_state does.
random.seed(42)  # kept so the state of the `random` module is unchanged for later cells
indices_to_remove = (
    data_train[data_train['etiqueta'] == 'no corrosion']
    .sample(n=data_to_remove, random_state=42)
    .index
)
data_train = data_train.drop(indices_to_remove)
print(data_train.shape)


In [None]:
# Verify class balance: number of rows per label after undersampling
labels = data_train['etiqueta']
corrosion_count = (labels == 'corrosion').sum()
no_corrosion_count = (labels == 'no corrosion').sum()
print(f"Corrosion count: {corrosion_count}, No corrosion count: {no_corrosion_count}")


## Homogeneity test

In [None]:
# Function to classify features as homogeneous or non-homogeneous
def classify_features(data, features, alpha=0.01):
    """Split `features` by whether their distribution differs between the two classes.

    For every feature a two-sided Mann-Whitney U test compares its values in the
    'no corrosion' rows against its values in the 'corrosion' rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the label column 'etiqueta' and every column in `features`.
    features : iterable of str
        Column names to test.
    alpha : float, default 0.01
        Significance level; p-values below it mark the feature as non-homogeneous.

    Returns
    -------
    (list, list)
        `(homogeneous_features, non_homogeneous_features)`, each in input order.
    """
    # The class split does not depend on the feature, so it is done once here
    # instead of re-filtering the whole frame on every iteration.
    labels = data['etiqueta']
    no_corrosion_rows = data[labels == 'no corrosion']
    corrosion_rows = data[labels == 'corrosion']

    homogeneous_features = []
    non_homogeneous_features = []

    for feature in features:
        # Only the p-value is needed; the U statistic itself is discarded.
        _, p_value = stats.mannwhitneyu(
            no_corrosion_rows[feature],
            corrosion_rows[feature],
            alternative='two-sided',
        )

        if p_value < alpha:
            non_homogeneous_features.append(feature)
        else:
            homogeneous_features.append(feature)

    return homogeneous_features, non_homogeneous_features

# Usage
# The feature columns are derived from data_train here; `X_train` is only
# created in a later cell, so it cannot be used at this point.
feature_columns = data_train.columns.drop(['etiqueta', 'n_imagen', 'etiqueta_multi'])
homogeneous_features, non_homogeneous_features = classify_features(data_train, feature_columns)
print(f"Homogeneous features: {homogeneous_features}")
print(f"Non-homogeneous features: {non_homogeneous_features}")


In [None]:
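# Optional step (currently disabled): drop the features classified as homogeneous.
# data_train.drop(columns=homogeneous_features, inplace=True)
# data_test.drop(columns=homogeneous_features, inplace=True)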

## Data analysis

In [None]:

# Overlaid density histograms of one feature for the two classes
sns.set(style="whitegrid")

# New figure for the plot
plt.figure(figsize=(10, 6))

# NOTE(review): red_std_yes / red_std_no are not assigned anywhere in the visible
# part of this file; they must already exist when this cell runs. Presumably they
# are the red standard deviation values of the corrosion / no-corrosion rows — confirm.
for values, colour, class_label in (
    (red_std_yes, 'gray', 'Corrosion'),
    (red_std_no, 'black', 'No Corrosion'),
):
    sns.histplot(values, bins=30, color=colour, label=class_label, stat='density', alpha=0.7)

# Axis labels (title intentionally left disabled)
#plt.title('Distribution of Homogeneity GLCM 0º for Corrosion and No Corrosion', fontsize=14)
plt.xlabel('Red Standard Deviation', fontsize=12)
plt.ylabel('Density', fontsize=12)

# Legend and dashed grid
plt.legend(title='Category', fontsize=11, title_fontsize='13')
plt.grid(True, linestyle='--', alpha=0.7)

# Render
plt.tight_layout()
plt.show()



In [None]:
# Function to create a comparative histogram plot for multiple variables
def create_comparative_histograms(data, variables, descriptions):
    """Plot per-class density histograms for several variables in a 3-column grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the label column 'etiqueta' and every column in `variables`.
    variables : sequence of str
        Columns to plot, one subplot each.
    descriptions : mapping
        Maps each variable name to the text used in its title and x-axis label.

    Returns
    -------
    None
        The figure is shown with plt.show(). Nothing is drawn for empty `variables`.
    """
    if len(variables) == 0:
        # plt.subplots cannot build a grid with zero rows.
        return

    sns.set(style="whitegrid")
    n_cols = 3
    n_rows = (len(variables) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
    axes = axes.flatten()

    # The class split is the same for every variable, so do it once.
    corrosion_data = data[data['etiqueta'] == 'corrosion']
    no_corrosion_data = data[data['etiqueta'] == 'no corrosion']

    for ax, var in zip(axes, variables):
        sns.histplot(corrosion_data[var], bins=30, color='red', label='Corrosion', stat='density', alpha=0.6, ax=ax)
        sns.histplot(no_corrosion_data[var], bins=30, color='green', label='No Corrosion', stat='density', alpha=0.6, ax=ax)

        description = descriptions[var]
        ax.set_title(f'Distribution of "{description}"', fontsize=14)
        ax.set_xlabel(description, fontsize=12)
        ax.set_ylabel('Density', fontsize=12)
        ax.legend(title='Category', fontsize=11, title_fontsize='13')

    # Hide the grid cells left over when len(variables) is not a multiple of 3,
    # otherwise they show up as empty axes.
    for unused_ax in axes[len(variables):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Usage: variable name -> human-readable label, in plotting order
descriptions = {
    'red_mean': 'Red Mean',
    'saturation_mean': 'Saturation Mean',
    'glcm_contrast_0': 'GLCM Contrast',
    'lbp_features_1': 'LBP',
    'hue_std': 'Hue Standard Deviation',
    'blue_energy': 'Blue Energy',
}
variables = list(descriptions)  # same names, same order as the dictionary keys

create_comparative_histograms(data_train, variables, descriptions)


In [None]:
# PCA Plot
# Scale the data
scaler = StandardScaler()
X_train = data_train.drop(columns=['etiqueta', 'n_imagen', 'etiqueta_multi'])
X_train_scaled = scaler.fit_transform(X_train)

# Apply PCA.
# At most 20 components, but never more than the data allows (PCA rejects
# n_components larger than min(n_samples, n_features)).
# random_state fixes the result when scikit-learn picks a randomized SVD solver.
n_components = min(20, *X_train_scaled.shape)
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)

# Variance explained by each component
variance_per_component = pca.explained_variance_ratio_

# Plot configuration
sns.set_context("talk")  # Larger font size
sns.set_style("whitegrid")  # Style with light grid

# Create the figure and axis
plt.figure(figsize=(10, 6))

# Plot points
x = range(1, len(variance_per_component) + 1)
plt.scatter(x, variance_per_component, marker='o', color='blue', s=100, label='Explained Variance')

# Connect points with a line
plt.plot(x, variance_per_component, linestyle='-', color='blue', linewidth=2)

# Axis labels
plt.xlabel('Principal Components', fontsize=14, labelpad=10)
plt.ylabel('Explained Variance', fontsize=14, labelpad=10)

# Adjust limits and ticks
plt.ylim(0, max(variance_per_component) + 0.1)  # Leave some space above
plt.yticks(fontsize=12)

# Add legend
plt.legend(fontsize=12, loc='best')

# Improved grid
plt.grid(True, linestyle='--', alpha=0.7)

# Tighten the layout (the figure is only displayed, not written to disk)
plt.tight_layout()

# Display the plot
plt.show()


In [None]:
# Cumulative explained variance of the principal components
cumulative_variance = np.cumsum(variance_per_component)

# Seaborn look: larger fonts on a light grid
sns.set_context("talk")
sns.set_style("whitegrid")

plt.figure(figsize=(10, 6))

# Component numbers start at 1
x = range(1, len(cumulative_variance) + 1)

# Line first, markers second; both drawn in blue over the same points
plt.plot(x, cumulative_variance, linestyle='-', color='blue', linewidth=2)
plt.scatter(x, cumulative_variance, marker='o', color='blue', s=100, label='Cumulative Explained Variance')

# Axis labels
plt.xlabel('Principal Components', fontsize=14, labelpad=10)
plt.ylabel('Cumulative Explained Variance', fontsize=14, labelpad=10)

# Fixed y-range of exactly 0..1 (no extra headroom)
plt.ylim(0, 1)
plt.yticks(fontsize=12)

# Legend and dashed grid
plt.legend(fontsize=12, loc='best')
plt.grid(True, linestyle='--', alpha=0.7)

# Tighten the layout and display (nothing is written to disk)
plt.tight_layout()
plt.show()
