In [1]:
#pip install ucimlrepo

# Packages
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split

URV                                                                            MESIIA

Neural and Evolutionary Computation (NEC)
Assignment 1: Prediction with Back-Propagation and Linear Regression

Teachers: Dr. Jordi Duch, Dr. Sergio Gomez
Student: Natzaret Gálvez Rísquez

Part 1: Selecting and analyzing the datasets

We perform the predictions on  three datasets: 

In [2]:
# We upload the datasets

# First dataset: File: A1-turbine.txt
    # 5 features: the first 4 are the input variables, the last one is the value to predict
    # 451 patterns: use the first 85% for training and validation, and the remaining 15% for test
df_turbine=pd.read_csv('C:/Users/Gari/Desktop/NEC/A1-turbine.txt', sep='\t', header=None)
header_vector_turbine = df_turbine.iloc[0, :].tolist() #header
df_turbine=df_turbine.iloc[1:,:]
df_turbine=pd.DataFrame(df_turbine)

# Second dataset: File: A1-synthetic.txt
    # 10 features: the first 9 are the input variables, the last one is the value to predict
    # 1000 patterns: use the first 80% for training and validation, and the remaining 20% for test
df_synthetic=pd.read_csv('C:/Users/Gari/Desktop/NEC/A1-synthetic.txt', sep='\t', header=None)
header_vector_synthetic = df_synthetic.iloc[0, :].tolist() #header
df_synthetic=df_synthetic.iloc[1:,:]
df_synthetic=pd.DataFrame(df_synthetic)

# Third dataset: from "https://archive.ics.uci.edu/dataset/186/wine+quality"
    # At least 6 features, one of them used for prediction
    # The prediction variable must take real (float or double) values; it should not represent a categorical value (that would correspond to a classification task)
    # At least 400 patterns
    # Select randomly 80% of the patterns for training and validation, and the remaining 20% for test; it is important to shuffle the original data, to destroy any kind of sorting it could have

# Wine Quality dataset [6496 rows x 11 columns]
# fetch dataset 
wine_quality = fetch_ucirepo(id=186) 
  
# data (as pandas dataframes) 
df_wineQuality = wine_quality.data.features 
y = wine_quality.data.targets #quality of wine, an integer
  
# metadata 
#print(wine_quality.metadata) 
# variable information 
#print(wine_quality.variables) 

header_vector_wineQuality = df_wineQuality.columns.tolist() #header

In [3]:
# As we can observe by the following header of the wine quality, alcohol level is the last feature
# We will use it as the value to predict
print(header_vector_wineQuality)

['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol']


Now, we will do the data preprocessing to later do the data splitting.

In [4]:
# Handling missing values, we check for and handle any missing values in our datasets
# Categorical values, if there are categorical variables, we encode them appropriately
# Outliers, we identify and handle the outliers in the data
# Normalization, in case is needed

# Data Preprocessing for Dataset 1 and 2
# - Normalize input and output variables
# - No need to preprocess (datasets already cleaned)

# Data Preprocessing for Dataset 3
# - Link to the source webpage to the documentation: "https://archive.ics.uci.edu/dataset/186/wine+quality"
# - Check for missing values, represent categorical values, look for outliers
# - Normalize input/output variables if needed

In [5]:
##Turbine dataset
X_turbine = df_turbine.iloc[:, :-1]  # Features (all columns except the last one)
y_turbine = df_turbine.iloc[:, -1]   # Target variable (last column)

scaler_turbine = MinMaxScaler()
X_turbine_normalized = scaler_turbine.fit_transform(X_turbine)
#y_turbine_normalized = scaler_turbine.fit_transform(y_turbine.values.reshape(-1, 1))
# Because the prediction column has all NaN values, it is not necessary to reshape

In [6]:
##Synthetic dataset
X_synthetic = df_synthetic.iloc[:, :-1]
y_synthetic = df_synthetic.iloc[:, -1]

# Normalize input and output variables
scaler_synthetic = MinMaxScaler()
X_synthetic_normalized = scaler_synthetic.fit_transform(X_synthetic)
y_synthetic_normalized = scaler_synthetic.fit_transform(y_synthetic.values.reshape(-1, 1))

In [7]:
##Wine Quality dataset
#By the owners we know that this dataset has not missing values, we can check by:
missing_values_count = df_wineQuality.isnull().sum().sum()
print(f"Number of missing values in Wine Quality dataset: {missing_values_count}")

Number of missing values in Wine Quality dataset: 0


In [8]:
##Wine Quality dataset
# No categorical variables in this dataset
# Identify and handle outliers using IQR method
def handle_outliers_iqr(data, threshold=1.5):
    data_copy = data.copy()  # Create a copy to avoid SettingWithCopyWarning
    Q1 = data_copy.quantile(0.25)
    Q3 = data_copy.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    data_copy[(data_copy < lower_bound) | (data_copy > upper_bound)] = np.nan
    return data_copy

# Handle outliers in all feature variables (columns) of df_wineQuality
df_wineQuality_no_outliers = handle_outliers_iqr(df_wineQuality)

#Shuffle
df_wineQuality_shuffled = df_wineQuality_no_outliers.sample(frac=1, random_state=42)

X_wineQuality = df_wineQuality_shuffled.iloc[:, :-1]
y_wineQuality = df_wineQuality_shuffled.iloc[:, -1]

# Normalize input and output variables
scaler_wineQuality = StandardScaler()
X_wineQuality_normalized_no_outliers = scaler_wineQuality.fit_transform(X_wineQuality)
y_wineQuality_normalized_no_outliers = scaler_wineQuality.fit_transform(y_wineQuality.values.reshape(-1, 1))

Now, we divide the datasets into validation & training and test.

In [13]:
#First dataset, turbine
# Split the data into validation-training and testing sets
# Calculate the index to split at
split_index1 = int(0.85 * len(df_turbine))
# Extract the first 85% for training
# Because we normalized with "MinMaxScaler", the result is a NumPy array, we conver it to a Pandas DataFrame
X_turbine_normalized_df = pd.DataFrame(X_turbine_normalized, columns=df_turbine.columns[:-1])
X_turbine_training_validation = X_turbine_normalized_df.iloc[:split_index1, :]
y_turbine_training_validation = y_turbine.iloc[:split_index1]
# Extract the remaining 15% for testing
X_turbine_testing = X_turbine_normalized_df.iloc[split_index1:, :]
y_turbine_testing = y_turbine.iloc[split_index1:]

#Second dataset, synthetic
split_index2 = int(0.80 * len(df_synthetic)) # split 80% for training and validation and 20% for testing
X_synthetic_normalized_df = pd.DataFrame(X_synthetic_normalized, columns=df_synthetic.columns[:-1])
y_synthetic_normalized_df = pd.DataFrame(y_synthetic_normalized, columns=[df_synthetic.columns[-1]])
X_synthetic_training_validation = X_synthetic_normalized_df.iloc[:split_index2, :]
y_synthetic_training_validation = y_synthetic_normalized_df.iloc[:split_index2]
X_synthetic_testing = X_synthetic_normalized_df.iloc[split_index2:, :]
y_synthetic_testing = y_synthetic_normalized_df.iloc[split_index2:]

#Third dataset, wineQuality
# Calculate the split indices
split_index3 = int(0.8 * len(df_wineQuality))
# Split the data into training + validation and test sets
X_wineQuality_normalized_df = pd.DataFrame(X_wineQuality_normalized_no_outliers, columns=df_wineQuality.columns[:-1])
y_wineQuality_normalized_df = pd.DataFrame(y_wineQuality_normalized_no_outliers, columns=[df_wineQuality.columns[-1]])
X_wineQuality_training_validation = X_wineQuality_normalized_df.iloc[:split_index3, :]
y_wineQuality_training_validation = y_wineQuality_normalized_no_outliers.iloc[:split_index3]
X_wineQuality_testing = X_wineQuality_normalized_df.iloc[split_index3:, :]
y_wineQuality_testing = y_wineQuality_normalized_no_outliers.iloc[split_index3:]

# Print the sizes of the datasets
#print("Total data size:", len(df_wineQuality))
#print("Training data size:", len(df_wineQualityTrainingValidation))
#print("Test data size:", len(df_wineQualityTesting))

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

Part 2: Implementation of BP

Part 3: Obtaining and comparing predictions using the three models (BP, BP-F, MLR-F)

Part 3.1: Parameter comparison and selection

Part 3.2: Model result comparison