In [1]:
# Packages
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
import warnings




URV                                                                            MESIIA

Neural and Evolutionary Computation (NEC)

Assignment 2: Classification with SVM, BP and MLR

Teachers: Dr. Jordi Duch, Dr. Sergio Gomez

Student: Natzaret Gálvez Rísquez

Part 1: Selecting and analyzing the datasets

We will perform the classification in the following three datasets:

In [2]:
# We upload the datasets

# First dataset: File: A2-ring.txt
    # Training set 1 : ring-separable.txt
    # Training set 2 : ring-merged.txt
    # Two different training sets, one easy (separable) and one more difficult (merged)

    # Test (valid for set1 and set2): ring-test.txt (Only one test set for both training sets)
    # 2 input features + 1 class identifier (0 / 1)
    # All data files have 10000 patterns
    
A2_ring_merged=pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A2/A2-ring/A2-ring-merged.txt', sep='\t', header=None)
A2_ring_separable=pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A2/A2-ring/A2-ring-separable.txt', sep='\t', header=None)
A2_ring_test=pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A2/A2-ring/A2-ring-test.txt', sep='\t', header=None)

df_A2_ring_merged=pd.DataFrame(A2_ring_merged)
df_A2_ring_separable=pd.DataFrame(A2_ring_separable)
df_A2_ring_test=pd.DataFrame(A2_ring_test)

# We plot the two input features

In [3]:
# Second dataset: File: A2-bank.txt
    # Data: bank-additional.csv (4119 patterns) or bank-additional-full.csv (41188 patterns), we choose one of them (the first is a subset of the second)
    # Training: select the first 80% patterns for training
    # Test: select the last 20% patterns for test
    # Features: 20 features, most of them categorical, you will have to properly represent them as numerical data before training
    # Input features: features that refer to the bank client, last contact in the current campaign, other attributes, and social and economic context attributes
    # Prediction feature: the last one (yes/no), which corresponds to whether the client has subscribed a term deposit or not
    # Observation: missing information is tagged as “unknown”

bank_additional=pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A2/A2-bank/bank-additional.csv', sep=';', header=None)
#bank_additional_full=pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A2/A2-bank/bank-additional-full.csv', sep=';', header=None)
#bank_additional_names= pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A2/A2-bank/bank-additional-names.txt', sep="\t", header=None)

df_bank_additional=pd.DataFrame(bank_additional)
#df_bank_additional_full=pd.DataFrame(bank_additional_full)

In [4]:
# Third dataset: from "https://www.kaggle.com/datasets/fatemehmehrparvar/liver-disorders?resource=download"
    # At least 6 features, one of them used for classification
    # he classification feature can be binary or multivariate
    # At least 400 patterns
    # Select randomly 80% of the patterns for training and validation, and the remaining 20% for test; it is important to shuffle the original data, to destroy any kind of sorting it could have

# Indian liver patient dataset [584 rows x 11 columns]
liver_Disorder=pd.read_csv('C:/Users/Gari/Desktop/Assignments_NEC/A2/Indian Liver Patient Dataset (ILPD).csv', sep=',', header=None)

# data (as pandas dataframes) 
df_liver_Disorder=pd.DataFrame(liver_Disorder)

#We drop the header
# Drop the first row
df_liver_Disorder = df_liver_Disorder.drop(df_liver_Disorder.index[0])

Now, we will do the data preprocessing to later do the data splitting.

In [5]:
# Handling missing values, we check for and handle any missing values in our datasets
# Categorical values, if there are categorical variables, we encode them appropriately
# Outliers, we identify and handle the outliers in the data
# Normalization, in case is needed

# Data Preprocessing for Dataset 1 and 2
# - Normalize input and output variables
# - No need to preprocess (datasets already cleaned)

# Data Preprocessing for Dataset 3
# - Link to the source webpage to the documentation: ""https://www.kaggle.com/datasets/fatemehmehrparvar/liver-disorders?resource=download"
# - Check for missing values, represent categorical values, look for outliers
# - Normalize input/output variables if needed

A2-ring dataset

In [6]:
# A2-ring
# We normalize the data that has been already splitted
X_A2_ring_separable = df_A2_ring_separable.iloc[:, :-1]  # Features separable (all columns except the last one)
y_A2_ring_separable = df_A2_ring_separable.iloc[:, -1]  # Target variable

X_A2_ring_merged = df_A2_ring_merged.iloc[:, :-1]  # Features merged
y_A2_ring_merged = df_A2_ring_merged.iloc[:, -1]  # Target variable

X_A2_ring_test = df_A2_ring_test.iloc[:, :-1]  # Features
y_A2_ring_test = df_A2_ring_test.iloc[:, -1]  # Target variable

scaler_ring = MinMaxScaler()
X_train_ring_separable = scaler_ring.fit_transform(X_A2_ring_separable) # Training set 1
# Reshape the array to a 2D shape (required by MinMaxScaler)
y_train_ring_separable = scaler_ring.fit_transform(y_A2_ring_separable.values.reshape(-1, 1))

X_train_ring_merged = scaler_ring.fit_transform(X_A2_ring_merged) # Trainig set 2
y_train_ring_merged = scaler_ring.fit_transform(y_A2_ring_merged.values.reshape(-1, 1))

X_test_ring_normalized = scaler_ring.fit_transform(X_A2_ring_test) # Test
y_test_ring_normalized = scaler_ring.fit_transform(y_A2_ring_test.values.reshape(-1, 1))

A2-bank dataset

In [7]:
# A2-bank

# Before normalizing the second dataset, it needs to be treated.
# We have to treat the categorical data and the "unknown" values.

# Treatment of the categorical data and the "unknown" values
# Replace "unknown" values with NaN
df_bank_additional.replace("unknown", np.nan, inplace=True)

# Handling missing values
df_bank_additional.fillna(df_bank_additional.mode().iloc[0], inplace=True)

# Extract the column names from the first row
df_bank_additional.columns = df_bank_additional.iloc[0]
df_bank_additional = df_bank_additional[1:]

# Apply label encoding to categorical columns
label_encoder = LabelEncoder()
categorical_columns = df_bank_additional.select_dtypes(include="object").columns

for column in categorical_columns:
    df_bank_additional[column] = label_encoder.fit_transform(df_bank_additional[column])

# Separate the target variable
X_bank_additional = df_bank_additional.drop("y", axis=1)  # Features
y_bank_additional = df_bank_additional["y"]  # Target variable

In [8]:
# A2-bank
# Now, we normalize the data
scaler_bank = MinMaxScaler()
X_train_bank_additional = scaler_bank.fit_transform(X_bank_additional) # bank additional
# Reshape the array to a 2D shape (required by MinMaxScaler)
y_train_bank_additional = scaler_bank.fit_transform(y_bank_additional.values.reshape(-1, 1))

In [9]:
# A2-bank
# Split the data into validation-training and testing sets
# Extract the first 80% for training
# Extract the remaining 20% for testing
# Splitting A2-bank dataset
X_train_bank, X_test_bank, y_train_bank, y_test_bank = train_test_split(
    X_train_bank_additional, y_train_bank_additional, test_size=0.20, shuffle=False
)

Liver disorder dataset

In [10]:
# Liver disorder
# We check if this dataset have missing values:
missing_values_count = df_liver_Disorder.isnull().sum().sum()
print(f"Number of missing values in Liver disorder dataset: {missing_values_count}")

Number of missing values in Liver disorder dataset: 4


In [11]:
# We convert the categorical data into numerical
# Observing the dataset, there is a column with categorical data, which is the "Gender" with the name 1
gender_mapping = {'Female': 0, 'Male': 1}
df_liver_Disorder[1] = df_liver_Disorder[1].replace(gender_mapping)

In [12]:
# We handle missing values using median imputation
imputer = SimpleImputer(strategy='median')
df_liver_Disorder_imputed = pd.DataFrame(imputer.fit_transform(df_liver_Disorder), columns=df_liver_Disorder.columns)

In [13]:
missing_values_count = df_liver_Disorder_imputed.isnull().sum().sum()
print(f"Number of missing values in Liver disorder dataset: {missing_values_count}")

Number of missing values in Liver disorder dataset: 0


In [14]:
# Identify and handle outliers using IQR method
def handle_outliers_iqr(data, threshold=1.5):
    data_copy = data.copy()  # Create a copy to avoid SettingWithCopyWarning
    Q1 = data_copy.quantile(0.25)
    Q3 = data_copy.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    data_copy[(data_copy < lower_bound) | (data_copy > upper_bound)] = np.nan

    # Handle missing values using median imputation
    imputer = SimpleImputer(strategy='median')
    data_imputed = imputer.fit_transform(data_copy)
    
    # Convert back to DataFrame with original column names
    data_imputed = pd.DataFrame(data_imputed, columns=data.columns)

    return data_imputed

# Handle outliers in all feature variables (columns) of df_liver_Disorder
df_liver_Disorder_no_outliers = handle_outliers_iqr(df_liver_Disorder_imputed)

# Shuffle
df_liver_Disorder_shuffled = df_liver_Disorder_no_outliers.sample(frac=1, random_state=42)

In [15]:
missing_values_count = df_liver_Disorder_shuffled.isnull().sum().sum()
print(f"Number of missing values in Liver disorder dataset: {missing_values_count}")

Number of missing values in Liver disorder dataset: 0


In [16]:
X_liver_Disorder = df_liver_Disorder_shuffled.iloc[:, :-1]
y_liver_Disorder = df_liver_Disorder_shuffled.iloc[:, -1]

# Normalize input variables
scaler_liver_Disorder = MinMaxScaler()
X_liver_Disorder_normalized_no_outliers = scaler_liver_Disorder.fit_transform(X_liver_Disorder)
y_liver_Disorder_normalized_no_outliers = scaler_liver_Disorder.fit_transform(y_liver_Disorder.values.reshape(-1, 1))

In [17]:
# Third dataset, Liver disorder
# Split the data into validation-training and testing sets
# Extract the first 80% for training
# Extract the remaining 20% for testing
# Splitting liver_Disorder dataset
X_train_wineQuality, X_test_wineQuality, y_train_wineQuality, y_test_wineQuality = train_test_split(
    X_liver_Disorder_normalized_no_outliers,
    y_liver_Disorder_normalized_no_outliers,
    test_size=0.2,
    random_state=42,
)

Part 2: Classification problem

We are going to perform supervised training of 3 classification models.

SVM (support vector machine)

In [18]:
# Parameters of SVM: kernel, and parameters
def evaluate_svm(X_train, y_train, X_test, y_test, kernel, C):
    # Create an SVM classifier with the desired parameters
    svm_classifier = SVC(kernel=kernel, C=C)
    
    # Perform cross-validation
    cv_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5)  # 5-fold cross-validation
    expected_error = 1 - np.mean(cv_scores)  # Expected classification error
    
    # Train the SVM classifier on the entire training set
    svm_classifier.fit(X_train, y_train)
    
    # Predict the labels for the test set
    y_pred = svm_classifier.predict(X_test)
    
    # Calculate the classification error on the test set
    test_error = 1 - accuracy_score(y_test, y_pred)
    
    return expected_error, test_error

BP (back propagation)

In [19]:
# Parameters of BP: architecture of the network, learning rate and momentum, activation function, and number of epochs

def evaluate_bp(X_train, y_train, X_test, y_test, learning_rate, momentum, activation, epochs):
    import tensorflow as tf
    
    # Define the neural network model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation=activation),
        tf.keras.layers.Dense(10, activation=activation),
        tf.keras.layers.Dense(5, activation=activation),
        tf.keras.layers.Dense(1)
    ])
    
    # Define the optimizer with custom learning rate and momentum
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum)
    
    # Compile the model
    model.compile(optimizer=optimizer, loss='mse', metrics=['accuracy'])
    
    # Perform cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    expected_error = 1 - np.mean(cv_scores)  # Expected classification error
    
    # Fit the model to the entire training set
    model.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=0)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate the classification error on the test set
    test_error = 1 - accuracy_score(y_test, y_pred.round())
    
    return expected_error, test_error

MLR (multi-linear regression)

In [20]:
def evaluate_mlr(X_train, y_train, X_test, y_test, C, solver):
    from sklearn.linear_model import LogisticRegression
    
    # Create a logistic regression classifier with the current parameters
    logistic_regression = LogisticRegression(C, solver)
    
    # Perform cross-validation
    cv_scores = cross_val_score(logistic_regression, X_train, y_train, cv=5)  # 5-fold cross-validation
    expected_error = 1 - np.mean(cv_scores)  # Expected classification error
    
    # Train the logistic regression classifier on the entire training set
    logistic_regression.fit(X_train, y_train)
    
    # Predict the labels for the test set
    y_pred = logistic_regression.predict(X_test)
    
    # Calculate the classification error on the test set
    test_error = 1 - accuracy_score(y_test, y_pred)
    
    return expected_error, test_error

Automatic process to find the best parameters (kernel, C) for the SVM model

In [21]:
def tune_svm(X_train, y_train, X_test, y_test):
    # Set initial parameters
    C_values = [0.1, 1, 10, 100, 1000]  # Values for C
    kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']  # Values for kernel
    best_error = float('inf')
    best_params = {'kernel': None, 'C': None}

    # Tune C
    for C in C_values:
        # Calculate the classification error
        expected_error, test_error = evaluate_svm(X_train, y_train, X_test, y_test, kernel=kernel_values[0], C=C)
        
        # Update best error and parameters if current error is lower
        if test_error < best_error:
            best_error = test_error
            best_params['C'] = C
        
    # Tune kernel
    for kernel in kernel_values:
        expected_error, test_error = evaluate_svm(X_train, y_train, X_test, y_test, kernel=kernel_values[0], C=C)
        
        # Update best error and parameters if current error is lower
        if test_error < best_error:
            best_error = test_error
            best_params['kernel'] = kernel
    
    return best_params, best_error

Automatic process to find the best parameters (learning_rate, momentum, activation, epochs) for the BP model

In [22]:
def tune_backpropagation(X_train, y_train, X_test, y_test):
    # Set initial parameters
    learning_rates = [0.2, 0.15, 0.1, 0.015, 0.01]  # Values for learning rate
    momentums = [0.0, 0.3, 0.6, 0.9]  # Values for momentum
    activations = ['sigmoid', 'relu', 'linear', 'tanh']  # Values for activation
    epochs_values = [50, 100, 200, 500, 1000] # Number of epochs

    best_error  = float('inf')  # Initialize with a large value
    best_hyperparameters = {'learning_rate': None, 'momentum': None, 'activation': None, 'epochs': None}

 # Tune learning rate
    for learning_rate in learning_rates:
        test_error = evaluate_bp(X_train, y_train, X_test, y_test, learning_rate, momentums[0], activations[0], epochs_values[0])
        if test_error < best_error:
            best_error = test_error
            best_hyperparameters['learning_rate'] = learning_rate

    # Tune momentum
    for momentum in momentums:
        test_error = evaluate_bp(X_train, y_train, X_test, y_test, best_hyperparameters['learning_rate'], momentum, activations[0], epochs_values[0])
        if test_error < best_error:
            best_error = test_error
            best_hyperparameters['momentum'] = momentum

    # Tune activation
    for activation in activations:
        test_error = evaluate_bp(X_train, y_train, X_test, y_test, best_hyperparameters['learning_rate'], best_hyperparameters['momentum'], activations, epochs_values[0])
        if test_error < best_error:
            best_error = test_error
            best_hyperparameters['activation'] = activation

    # Tune epochs
    for epochs in epochs_values:
        test_error = evaluate_bp(X_train, y_train, X_test, y_test, best_hyperparameters['learning_rate'], best_hyperparameters['momentum'], best_hyperparameters['activation'], epochs_values)
        if test_error < best_error:
            best_error = test_error
            best_hyperparameters['epochs'] = epochs

    return best_hyperparameters, best_error

Automatic process to find the best parameters (C, solver) for the MLR model

In [23]:
def tune_logistic_regression(X_train, y_train, X_test, y_test):
    # Set initial parameters
    C_values = [0.01, 0.1, 1, 10, 100]  # Values for regularization parameter C
    solver_values = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']  # Values for the optimization algorithm

    best_error = float('inf')  # Initialize with a large value
    best_params = {'C': None, 'solver': None}

# Tune C
    for C in C_values:
        # Create a logistic regression classifier with the current C value
        logistic_regression = LogisticRegression(C=C, solver=solver_values[1])

        # Train the logistic regression classifier
        logistic_regression.fit(X_train, y_train)

        # Predict the labels for the test set
        y_pred = logistic_regression.predict(X_test)

        # Calculate the classification error on the test set
        test_error = 1 - accuracy_score(y_test, y_pred)

        # Update best error and C if current error is lower
        if test_error < best_error:
            best_error = test_error
            best_params['C'] = C

    # Tune solver
    for solver in solver_values:
        # Create a logistic regression classifier with the current solver
        logistic_regression = LogisticRegression(C=best_params['C'], solver=solver)

        # Train the logistic regression classifier
        logistic_regression.fit(X_train, y_train)

        # Predict the labels for the test set
        y_pred = logistic_regression.predict(X_test)

        # Calculate the classification error on the test set
        test_error = 1 - accuracy_score(y_test, y_pred)

        # Update best error and solver if current error is lower
        if test_error < best_error:
            best_error = test_error
            best_params['solver'] = solver

    return best_params, best_error

We call the before functions to finde the best parameters for our datasets for the three models (SVM, BP, MLR):

Dataset 1: A2-ring

In [24]:
X_train = X_train_ring_merged
y_train = y_train_ring_merged
X_test = X_test_ring_normalized
y_test = y_test_ring_normalized

warnings.filterwarnings("ignore")

#SVM
best_params_svm, best_error_svm = tune_svm(X_train, y_train, X_test, y_test)
print("Best parameters for SVM:", best_params_svm)
print("Best classification error for SVM:", best_error_svm)

# BP
best_params_bp, best_error_bp = tune_backpropagation(X_train, y_train, X_test, y_test)
print("Best parameters for BP:", best_params_bp)
print("Best classification error for BP:", best_error_bp)

#MLR
best_params_mlr, best_error_mlr = tune_logistic_regression(X_train, y_train, X_test, y_test)
print("Best parameters for MLR:", best_params_mlr)
print("Best classification error for MLR:", best_error_mlr)

Best parameters for SVM: {'kernel': 'rbf', 'C': 0.1}
Best classification error for SVM: 0.04300000000000004



TypeError: Cannot clone object '<keras.src.engine.sequential.Sequential object at 0x00000128DAC59410>' (type <class 'keras.src.engine.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

Dataset 2: A2-bank

In [None]:
X_train = X_train_bank
y_train = y_train_bank
X_test = X_test_bank
y_test = y_test_bank

warnings.filterwarnings("ignore")

#SVM
best_params_svm, best_error_svm = tune_svm(X_train, y_train, X_test, y_test)
print("Best parameters for SVM:", best_params_svm)
print("Best classification error for SVM:", best_error_svm)

# BP
best_params_bp, best_error_bp = tune_backpropagation(X_train, y_train, X_test, y_test)
print("Best parameters for BP:", best_params_bp)
print("Best classification error for BP:", best_error_bp)

#MLR
best_params_mlr, best_error_mlr = tune_logistic_regression(X_train, y_train, X_test, y_test)
print("Best parameters for MLR:", best_params_mlr)
print("Best classification error for MLR:", best_error_mlr)

Dataset 3: liver disease

In [None]:
X_train = X_train_wineQuality
y_train = y_train_wineQuality
X_test = X_test_wineQuality
y_test = y_test_wineQuality

warnings.filterwarnings("ignore")

#SVM
best_params_svm, best_error_svm = tune_svm(X_train, y_train, X_test, y_test)
print("Best parameters for SVM:", best_params_svm)
print("Best classification error for SVM:", best_error_svm)

# BP
best_params_bp, best_error_bp = tune_backpropagation(X_train, y_train, X_test, y_test)
print("Best parameters for BP:", best_params_bp)
print("Best classification error for BP:", best_error_bp)

#MLR
best_params_mlr, best_error_mlr = tune_logistic_regression(X_train, y_train, X_test, y_test)
print("Best parameters for MLR:", best_params_mlr)
print("Best classification error for MLR:", best_error_mlr)