In [168]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.optimizers import Adam, RMSprop
import scikeras
from sklearn.metrics import classification_report

# Data Preparation

## Load Data

In [169]:
# Load the datasets
red_wine_data = pd.read_csv('winequality-red.csv', delimiter=';')
white_wine_data = pd.read_csv('winequality-white.csv', delimiter=';')

In [170]:
# Add a 'type' column to indicate the wine type
red_wine_data['type'] = 'red'
white_wine_data['type'] = 'white'

In [171]:
# Merge the datasets
wine_data = pd.concat([red_wine_data, white_wine_data], axis=0)
# axis=0 means that the concatenation is done vertically, stacking the rows of red_wine_data below the rows of white_wine_data

In [172]:
# Encode the 'type' column
label_encoder = LabelEncoder()
wine_data['type'] = label_encoder.fit_transform(wine_data['type'])

In [173]:
#Encoding the 'type' column is necessary because machine learning algorithms typically operate on numerical data, and the 'type' column contains categorical data (red wine / white wine) which is non-numeric.
#To train a neural network model to classify the type of wine, we need to convert the categorical values into numerical representations. This process is called encoding. It allows the model to understand and make predictions based on the encoded values.
#There are different encoding techniques available, but in this case, we can use one-hot encoding. One-hot encoding transforms the categorical values into binary vectors. It creates new binary columns for each unique category and assigns a value of 1 or 0 to indicate whether a particular sample belongs to that category or not.
#For example, after one-hot encoding, the 'type' column will be transformed into two columns: 'red' and 'white'. If a sample belongs to the red wine category, the 'red' column will have a value of 1 and the 'white' column will have a value of 0. If a sample belongs to the white wine category, the 'red' column will have a value of 0 and the 'white' column will have a value of 1.
#By encoding the 'type' column, we can represent the categorical information in a format that can be effectively used by the neural network model for classification.

## Preprocess the data

In [174]:
# Data Cleaning
wine_data.dropna(inplace=True)  # Drop rows with missing values

In [175]:
# Prepare the data for modeling
X = wine_data.drop('type', axis=1)
y = wine_data['type']

In [176]:
# Split the dataset into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [177]:
# Perform feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Model Development

## Model 1

In [178]:
model1 = Sequential()
model1.add(Dense(32, activation='relu', input_shape=(12,)))
model1.add(Dense(16, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_val_scaled, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x288ed307b20>

## Model 2

In [179]:
model2 = Sequential()
model2.add(Dense(64, activation='relu', input_shape=(12,)))
model2.add(Dense(32, activation='relu'))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model2.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x288e6c2ac50>

## ModeL 3

In [180]:
model3 = Sequential()
model3.add(Dense(128, activation='relu', input_shape=(12,)))
model3.add(Dense(64, activation='relu'))
model3.add(Dense(32, activation='relu'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model3.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x288c6022bc0>

## Model 4

In [181]:
model4 = Sequential()
model4.add(Dense(16, activation='relu', input_shape=(12,)))
model4.add(Dense(8, activation='relu'))
model4.add(Dense(4, activation='relu'))
model4.add(Dense(1, activation='sigmoid'))
model4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model4.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x288db983550>

## Model 5

In [182]:
model5 = Sequential()
model5.add(Dense(64, activation='relu', input_shape=(12,)))
model5.add(Dense(32, activation='relu'))
model5.add(Dense(16, activation='relu'))
model5.add(Dense(8, activation='relu'))
model5.add(Dense(4, activation='relu'))
model5.add(Dense(1, activation='sigmoid'))
model5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model5.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x288d95b2260>

**Each model has a different architecture with varying numbers of layers and neurons.**

## Showing the network architecture of the 5 models in figures

In [183]:
model1.summary()
model2.summary()
model3.summary()
model4.summary()
model5.summary()

Model: "sequential_456"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1396 (Dense)          (None, 32)                416       
                                                                 
 dense_1397 (Dense)          (None, 16)                528       
                                                                 
 dense_1398 (Dense)          (None, 1)                 17        
                                                                 
Total params: 961
Trainable params: 961
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_457"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1399 (Dense)          (None, 64)                832       
                                                                 
 dense_1400 (Dense)          (

# Model Tuning

## Determine The Best Model Architecture 

In [184]:
# Define a list to store the accuracy of each model
accuracy_list = []

In [185]:
# Evaluate model1
accuracy1 = model1.evaluate(X_test_scaled, y_test)[1]
accuracy_list.append(accuracy1)

# Evaluate model2
accuracy2 = model2.evaluate(X_test, y_test)[1]
accuracy_list.append(accuracy2)

# Evaluate model3
accuracy3 = model3.evaluate(X_test, y_test)[1]
accuracy_list.append(accuracy3)

# Evaluate model4
accuracy4 = model4.evaluate(X_test, y_test)[1]
accuracy_list.append(accuracy4)

# Evaluate model5
accuracy5 = model5.evaluate(X_test, y_test)[1]
accuracy_list.append(accuracy5)



In [186]:
# Create a dictionary to map model names to accuracy scores
model_accuracy = {
    'Model 1': accuracy1,
    'Model 2': accuracy2,
    'Model 3': accuracy3,
    'Model 4': accuracy4,
    'Model 5': accuracy5
}

In [187]:
# Print the accuracy of each model
for model, accuracy in model_accuracy.items():
    print(f'{model}: Accuracy = {accuracy:.4f}')


Model 1: Accuracy = 0.9931
Model 2: Accuracy = 0.9569
Model 3: Accuracy = 0.9700
Model 4: Accuracy = 0.9269
Model 5: Accuracy = 0.9408


In [188]:
# Find the model with architecture that has the highest accuracy
best_model_architecture = max(model_accuracy, key=model_accuracy.get)

In [189]:
# Print the best model
print(f'Best Model Architecture: {best_model_architecture}')

Best Model Architecture: Model 1


## Hyperparameter Tuning

**Tuning (1)optimizer (2) batch size (3) epochs**

In [190]:
# Define the function to create the base model
def create_model(optimizer='adam'):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_shape=(12,)))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [191]:
# Create the KerasClassifier wrapper
model = KerasClassifier(build_fn=create_model, verbose=0)

  model = KerasClassifier(build_fn=create_model, verbose=0)


In [192]:
# Define the hyperparameters to tune and their possible values
param_dist = {
    'batch_size': [16, 32, 64],
    'epochs': [10, 20, 30],
    'optimizer': ['adam', 'rmsprop']
}

In [193]:
# Perform randomized search cross-validation
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, scoring='accuracy', cv=3)
random_search_result = random_search.fit(X_train_scaled, y_train)



In [194]:
# Print the best hyperparameters and the corresponding accuracy
print("Best Hyperparameters: ", random_search_result.best_params_)
print("Best Accuracy: ", random_search_result.best_score_)

Best Hyperparameters:  {'optimizer': 'rmsprop', 'epochs': 30, 'batch_size': 32}
Best Accuracy:  0.9963921490997304


In [195]:
# Get the results of the randomized search
results = random_search_result.cv_results_
params = results['params']
mean_test_scores = results['mean_test_score']

In [196]:
# Refit the best model with the best hyperparameters using the entire training dataset
best_model = random_search_result.best_estimator_
best_model.model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, verbose=0)

<keras.callbacks.History at 0x288d937a410>

In [197]:
# Create a DataFrame to store the results
results_df = pd.DataFrame(params)
results_df['mean_test_score'] = mean_test_scores

In [198]:
# Sort the results by mean test score in descending order
results_df = results_df.sort_values('mean_test_score', ascending=False)

# Display the results table
results_df

Unnamed: 0,optimizer,epochs,batch_size,mean_test_score
3,rmsprop,30,32,0.996392
1,rmsprop,20,16,0.996152
0,adam,30,16,0.995911
8,adam,30,64,0.995911
4,rmsprop,30,16,0.99567
9,rmsprop,20,32,0.99567
5,adam,20,32,0.99543
6,adam,20,64,0.995189
7,rmsprop,10,32,0.995189
2,adam,10,16,0.994948


# Model Evaluation

## Evaluation On Testing Data

**Testing Loss & Testing Accuracy**

In [199]:
# Evaluate the best model on the testing data
test_loss, test_accuracy = best_model.model.evaluate(X_test_scaled, y_test)
print("Testing Loss:", test_loss)
print("Testing Accuracy:", test_accuracy)
#The testing loss represents the average loss or error of the model's predictions on the testing data
#A lower testing loss indicates better performance, as it means the model's predictions are closer to the true values.
#The testing accuracy is the proportion of correctly predicted samples in the testing data
#A higher testing accuracy indicates better performance, as it means the model is making more correct predictions.

Testing Loss: 0.034333519637584686
Testing Accuracy: 0.9953846335411072


**Validation Loss & Validation Accuracy**

In [200]:
val_loss, val_accuracy = best_model.model.evaluate(X_val_scaled, y_val)
print("Validation Loss:", val_loss)
print("Validation Accuracy:", val_accuracy)

Validation Loss: 0.007752105128020048
Validation Accuracy: 0.9971153736114502


**Prediction**

In [201]:
# Predict on the validation data
val_predictions = best_model.model.predict(X_val_scaled)
val_predictions = (val_predictions > 0.5).astype(int)


# Print classification report for validation data
print("Validation Classification Report:")
print(classification_report(y_val, val_predictions))

Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       270
           1       1.00      1.00      1.00       770

    accuracy                           1.00      1040
   macro avg       1.00      1.00      1.00      1040
weighted avg       1.00      1.00      1.00      1040



In [202]:
# Predict on the testing data
test_predictions = best_model.model.predict(X_test_scaled)
test_predictions = (test_predictions > 0.5).astype(int)

# Print classification report for testing data
print("Testing Classification Report:")
print(classification_report(y_test, test_predictions))

Testing Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       341
           1       0.99      1.00      1.00       959

    accuracy                           1.00      1300
   macro avg       1.00      0.99      0.99      1300
weighted avg       1.00      1.00      1.00      1300



**Observation:**

**Discussion:**