In [35]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import GridSearchCV
import cupy as cp
from sklearn.metrics import classification_report, confusion_matrix


In [37]:
# Get the path of the file
# Current directiory for Jupyter
script_dir = os.getcwd() 

# Check the path whether correct 
print(script_dir)

#Buld corresponding path
file_path = os.path.join(script_dir, "Data")
Original_File_path = os.path.join(file_path, "Original_Data.csv")
Data_Without_Outlier_Path = os.path.join(file_path, "Data_Without_Outlier.csv")

# Test whether path correct
print("Original File Path:", Original_File_path)
print("Data Without Outlier Path:", Data_Without_Outlier_Path)


# Read the csv files
Original_Data = pd.read_csv(Original_File_path,encoding='utf-8')
Data_Without_Outlier = pd.read_csv(Data_Without_Outlier_Path,encoding='utf-8')

# Check the csv file
print(Original_Data.head())
print(Data_Without_Outlier.head())

C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417
Original File Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Original_Data.csv
Data Without Outlier Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Data_Without_Outlier.csv
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0             

#### Process the the PCA Without Outlier data

In [40]:
# Set the target and features vector for Data_Without_Outlier
outlier_x = Data_Without_Outlier.drop(['quality'], axis=1)
outlier_y = Data_Without_Outlier['quality']

# Split data into training and test sets
Outlier_x_train, Outlier_x_test, Outlier_y_train, Outlier_y_test = train_test_split(
    outlier_x, outlier_y, test_size=0.2, random_state=42
)


# Check training and testing data shapes
print(f"Training data shape: {Outlier_x_train.shape}")
print(f"Testing data shape: {Outlier_x_test.shape}")

# Normalization data
scaler = StandardScaler()
Outlier_x_train_scaled = scaler.fit_transform(Outlier_x_train)
Outlier_x_test_scaled = scaler.transform(Outlier_x_test)

# Call the pca method from packages
# Keep all components
pca = PCA()
pca.fit(Outlier_x_train_scaled)

# Use the explained variance 
ev_result = pca.explained_variance_ratio_
#print("featire {i}  ---->    ratio: {}")
print("Exlpained variance raitor")

# Get the sum of explained variance -> cumulative variance 
cv_result = pca.explained_variance_ratio_.cumsum()
# decide how much feature contains
threshold = 0.9
n_features = (cv_result >= threshold).argmax() + 1

print("HOW MANY COMPONENTS KEEPS: ", n_features)
# Update new pca with limited components
pca = PCA(n_components=n_features)
pca.fit(Outlier_x_train_scaled)

print("Find which components have been kept:\n", pca.components_)
top_features = np.argsort(np.abs(pca.components_), axis=1)[:, ::-1]
print("Find which components has been kept:(Index)\n ", top_features)

Outlier_x_train_scaled_t = pca.transform(Outlier_x_train_scaled)
Outlier_x_test_scaled_t = pca.transform(Outlier_x_test_scaled)

# Define the parameters
nodes = [16, 32, 64]
layers = [1, 2, 3]
hidden_layer_sizes = [
    tuple([neuron] * layer) for neuron in nodes for layer in layers
]
parameter = {
    'activation': ['relu', 'tanh', 'logistic'],
    'hidden_layer_sizes': hidden_layer_sizes,
    'learning_rate_init': np.linspace(0.001, 0.1, 5),
    'max_iter': [50, 100, 150],  # This is epoch!
    'learning_rate': ['constant', 'adaptive'],
    'batch_size': [15, 30, 60]
}

# Use GridSearchCV to find the best parameters
mlp = MLPClassifier(random_state=42,early_stopping=True, validation_fraction=0.1)
grid_search = GridSearchCV(estimator=mlp, param_grid=parameter, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(Outlier_x_train_scaled_t, Outlier_y_train)

# Get the best model and parameters
best_mlp = grid_search.best_estimator_
best_params = grid_search.best_params_
print("The best parameters we got: ", best_params)

# The best crossvalidation score depend on the training set
print("The best cross-validation score: ", grid_search.best_score_)




# Predict the result based on updated model 
final_prediction_PCA = best_mlp.predict(Outlier_x_test_scaled_t)

# Evaluation the performance of updated model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction_PCA)
f1_pca = f1_score(Outlier_y_test, final_prediction_PCA, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction_PCA)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_pca)
print("The confusion_matrix of the updated model is:\n", cf_matrix)

# Classification report
print("Classification report:\n", classification_report(Outlier_y_test, final_prediction_PCA))



Training data shape: (1279, 11)
Testing data shape: (320, 11)
Exlpained variance raitor
HOW MANY COMPONENTS KEEPS:  7
Find which components have been kept:
 [[ 0.50349721 -0.21946622  0.45736688  0.18873163  0.22226281 -0.06192714
   0.01015142  0.41207779 -0.41950186  0.21187714 -0.09747018]
 [ 0.05316909 -0.39642069  0.2014572  -0.1420794  -0.30136773 -0.32592056
  -0.44747557 -0.29671099  0.0218534   0.2406422   0.48533222]
 [-0.09817501 -0.33379644  0.18198616  0.0553366  -0.1966896   0.61884599
   0.51866334 -0.16619321  0.02964326  0.28461977  0.20635978]
 [-0.04327114  0.20349882 -0.0522774   0.78619684  0.09014402 -0.03680007
  -0.0654324   0.16713219  0.32061877  0.12638839  0.41882246]
 [-0.18276764  0.02665526 -0.11708226 -0.26108443  0.55357957 -0.0096233
  -0.0917904   0.02379509  0.21890098  0.71659172 -0.08400105]
 [-0.03371095  0.14888509  0.11775918 -0.01577243  0.59449215  0.03862248
   0.08566158 -0.46572821 -0.37528231 -0.27001498  0.41192551]
 [ 0.30657756  0.64760

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Process the the Original data

In [42]:
# Set the target and features vector for Data_Without_Outlier
original_x = Original_Data.drop(['quality'], axis=1)
original_y = Original_Data['quality']

# Split data into training and test sets
original_x_train, original_x_test, original_y_train, original_y_test = train_test_split(
    original_x, original_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {original_x_train.shape}")
print(f"Testing data shape: {original_x_test.shape}")

# Normalization data
scaler = StandardScaler()
original_x_train_scaled = scaler.fit_transform(original_x_train)
original_x_test_scaled = scaler.transform(original_x_test)

# Initial the parameter range
parameter = {
    'activation': ['relu', 'tanh', 'logistic'],
    'hidden_layer_sizes': hidden_layer_sizes,
    'learning_rate_init': np.linspace(0.001, 0.1, 5),
    'max_iter': [50, 100, 150],  # This is epoch!
    'learning_rate': ['constant', 'adaptive'],
    'batch_size': [15, 30, 60]
}

# Use GridSearchCV to find the best parameters
mlp_origin = MLPClassifier(random_state=42, early_stopping=True, validation_fraction=0.1)
grid_search_origin = GridSearchCV(estimator=mlp_origin, param_grid=parameter, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_origin.fit(original_x_train_scaled, original_y_train)

# Get the best model and parameters
best_mlp_origin = grid_search_origin.best_estimator_
best_params_origin = grid_search_origin.best_params_
print("The best parameters we got: ", best_params_origin)

# The best crossvalidation score depend on the training set
print("The best cross-validation score: ", grid_search_origin.best_score_)


# Predict the result based on updated model 
final_prediction_original = best_mlp_origin.predict(original_x_test_scaled)

# Evaluation the performance of updated model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction_original)
f1_orig = f1_score(Outlier_y_test, final_prediction_original, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction_original)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_orig)
print("The confusion_matrix of the updated model is:\n", cf_matrix)

# Classification report
print("Classification report:\n", classification_report(original_y_test, final_prediction_original))


Training data shape: (1279, 11)
Testing data shape: (320, 11)
The best parameters we got:  {'activation': 'tanh', 'batch_size': 30, 'hidden_layer_sizes': (64, 64, 64), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 50}
The best cross-validation score:  0.6208425245098039

 --- Evaluation under best parameters ---
The accuracy of the updated model is:  0.575
The f1 score of the updated model is:  0.5466098259801261
The confusion_matrix of the updated model is:
 [[ 0  0  1  0  0  0]
 [ 0  0 10  0  0  0]
 [ 0  0 99 30  1  0]
 [ 0  0 48 76  8  0]
 [ 0  0  1 32  9  0]
 [ 0  0  0  1  4  0]]
Classification report:
               precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00        10
         5.0       0.62      0.76      0.69       130
         6.0       0.55      0.58      0.56       132
         7.0       0.41      0.21      0.28        42
         8.0       0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
# Set the target and features vector for Data_Without_Outlier
outlier_x = Data_Without_Outlier.drop(['quality'], axis=1)
outlier_y = Data_Without_Outlier['quality']

# Split data into training and test sets
Outlier_x_train, Outlier_x_test, Outlier_y_train, Outlier_y_test = train_test_split(
    outlier_x, outlier_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {Outlier_x_train.shape}")
print(f"Testing data shape: {Outlier_x_test.shape}")

# Normalization data
scaler = StandardScaler()
Outlier_x_train_scaled = scaler.fit_transform(Outlier_x_train)
Outlier_x_test_scaled = scaler.transform(Outlier_x_test)

# Initial the parameter range
parameter = {
    'activation': ['relu', 'tanh', 'logistic'],
    'hidden_layer_sizes': hidden_layer_sizes,
    'learning_rate_init': np.linspace(0.001, 0.1, 5),
    'max_iter': [50, 100, 150],   # This is epoch!
    'learning_rate': ['constant', 'adaptive'],
    'batch_size': [15, 30, 60]
}

# Use GridSearchCV to find the best parameters
mlp = MLPClassifier(random_state=42, early_stopping=True, validation_fraction=0.1)
grid_search = GridSearchCV(estimator=mlp, param_grid=parameter, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(Outlier_x_train_scaled, Outlier_y_train)

# Get the best model and parameters
best_mlp = grid_search.best_estimator_
best_params = grid_search.best_params_
print("The best parameters we got: ", best_params)

# The best crossvalidation score depend on the training set
print("The best cross-validation score: ", grid_search.best_score_)


# Predict the result based on updated model 
final_prediction_outlier = best_mlp.predict(Outlier_x_test_scaled)

# Evaluation the performance of updated model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction_outlier)
f1_outlier = f1_score(Outlier_y_test, final_prediction_outlier, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction_outlier)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_outlier)
print("The confusion_matrix of the updated model is:/n ", cf_matrix)

# Classification report
print("Classification report:\n", classification_report(Outlier_y_test, final_prediction_outlier))


Training data shape: (1279, 11)
Testing data shape: (320, 11)
The best parameters we got:  {'activation': 'tanh', 'batch_size': 15, 'hidden_layer_sizes': (16, 16), 'learning_rate': 'constant', 'learning_rate_init': 0.025750000000000002, 'max_iter': 50}
The best cross-validation score:  0.6294393382352942

 --- Evaluation under best parameters ---
The accuracy of the updated model is:  0.559375
The f1 score of the updated model is:  0.5491751103961525
The confusion_matrix of the updated model is:/n  [[ 0  0  1  0  0  0]
 [ 0  1  9  0  0  0]
 [ 0  3 92 35  0  0]
 [ 0  0 40 64 28  0]
 [ 0  0  0 20 22  0]
 [ 0  0  0  0  5  0]]
Classification report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.25      0.10      0.14        10
           5       0.65      0.71      0.68       130
           6       0.54      0.48      0.51       132
           7       0.40      0.52      0.45        42
           8       0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# Initial ANN
# Set the target and features vector for Data_Without_Outlier
outlier_x = Data_Without_Outlier.drop(['quality'], axis=1)
outlier_y = Data_Without_Outlier['quality']

# Split data into training and test sets
Outlier_x_train, Outlier_x_test, Outlier_y_train, Outlier_y_test = train_test_split(
    outlier_x, outlier_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {Outlier_x_train.shape}")
print(f"Testing data shape: {Outlier_x_test.shape}")

# Normalization data
scaler = StandardScaler()
Outlier_x_train_scaled = scaler.fit_transform(Outlier_x_train)
Outlier_x_test_scaled = scaler.transform(Outlier_x_test)

# Train model
mlp_origin.fit(Outlier_x_train_scaled, Outlier_y_train)
initial_prediction = mlp_origin.predict(Outlier_x_test_scaled)

# Evaluation the performance of updated model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, initial_prediction)
f1_outlier = f1_score(Outlier_y_test, initial_prediction, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, initial_prediction)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_outlier)
print("The confusion_matrix of the updated model is:/n ", cf_matrix)

# Classification report
print("Classification report:\n", classification_report(Outlier_y_test, initial_prediction))



Training data shape: (1279, 11)
Testing data shape: (320, 11)

 --- Evaluation under best parameters ---
The accuracy of the updated model is:  0.546875
The f1 score of the updated model is:  0.5184942455242967
The confusion_matrix of the updated model is:/n  [[  0   0   1   0   0   0]
 [  0   0   7   3   0   0]
 [  0   0 103  27   0   0]
 [  0   0  57  61  14   0]
 [  0   0   1  30  11   0]
 [  0   0   0   2   3   0]]
Classification report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00        10
           5       0.61      0.79      0.69       130
           6       0.50      0.46      0.48       132
           7       0.39      0.26      0.31        42
           8       0.00      0.00      0.00         5

    accuracy                           0.55       320
   macro avg       0.25      0.25      0.25       320
weighted avg       0.50      0.55      0.52       320



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
# Set the target and features vector for Data_Without_Outlier
outlier_x = Data_Without_Outlier.drop(['quality'], axis=1)
outlier_y = Data_Without_Outlier['quality']

# Split data into training and test sets
Outlier_x_train, Outlier_x_test, Outlier_y_train, Outlier_y_test = train_test_split(
    outlier_x, outlier_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {Outlier_x_train.shape}")
print(f"Testing data shape: {Outlier_x_test.shape}")

# Normalization data
scaler = StandardScaler()
Outlier_x_train_scaled = scaler.fit_transform(Outlier_x_train)
Outlier_x_test_scaled = scaler.transform(Outlier_x_test)

# Initial the parameter range
parameter = {
    'activation': ['relu', 'tanh', 'logistic'],
    'hidden_layer_sizes': hidden_layer_sizes,
    'learning_rate_init': np.linspace(0.001, 0.1, 5),
    'max_iter': [50, 100, 150],   # This is epoch!
    'learning_rate': ['constant', 'adaptive'],
    'batch_size': [15, 30, 60]
}

# Use GridSearchCV to find the best parameters
mlp = MLPClassifier(random_state=42, early_stopping=True, validation_fraction=0.1)
grid_search = GridSearchCV(estimator=mlp, param_grid=parameter, cv=9, scoring='accuracy', n_jobs=-1)
grid_search.fit(Outlier_x_train_scaled, Outlier_y_train)

# Get the best model and parameters
best_mlp = grid_search.best_estimator_
best_params = grid_search.best_params_
print("The best parameters we got: ", best_params)

# The best crossvalidation score depend on the training set
print("The best cross-validation score: ", grid_search.best_score_)


# Predict the result based on updated model 
final_prediction_outlier = best_mlp.predict(Outlier_x_test_scaled)

# Evaluation the performance of updated model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction_outlier)
f1_outlier = f1_score(Outlier_y_test, final_prediction_outlier, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction_outlier)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_outlier)
print("The confusion_matrix of the updated model is:/n ", cf_matrix)

# Classification report
print("Classification report:\n", classification_report(Outlier_y_test, final_prediction_outlier))


Training data shape: (1279, 11)
Testing data shape: (320, 11)
The best parameters we got:  {'activation': 'relu', 'batch_size': 15, 'hidden_layer_sizes': (64, 64, 64), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 50}
The best cross-validation score:  0.6309465182704619

 --- Evaluation under best parameters ---
The accuracy of the updated model is:  0.603125
The f1 score of the updated model is:  0.5815933501384316
The confusion_matrix of the updated model is:/n  [[ 0  0  1  0  0  0]
 [ 0  1  7  2  0  0]
 [ 0  1 89 39  1  0]
 [ 0  0 30 94  8  0]
 [ 0  1  0 32  9  0]
 [ 0  0  0  1  4  0]]
Classification report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.33      0.10      0.15        10
           5       0.70      0.68      0.69       130
           6       0.56      0.71      0.63       132
           7       0.41      0.21      0.28        42
           8       0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Initial ANN
# Set the target and features vector for Data_Without_Outlier
original_x = Original_Data.drop(['quality'], axis=1)
original_y = Original_Data['quality']

# Split data into training and test sets
original_x_train, original_x_test, original_y_train, original_y_test = train_test_split(
    original_x, original_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {original_x_train.shape}")
print(f"Testing data shape: {original_x_test.shape}")

# Normalization data
scaler = StandardScaler()
original_x_train_scaled = scaler.fit_transform(original_x_train)
original_x_test_scaled = scaler.transform(original_x_test)

# Train model
mlp_origin.fit(original_x_train_scaled, original_y_train)
initial_prediction = mlp_origin.predict(original_x_test)

# Evaluation the performance of updated model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(original_y_test, initial_prediction)
f1_outlier = f1_score(original_y_test, initial_prediction, average='weighted')
cf_matrix = confusion_matrix(original_y_test, initial_prediction)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_outlier)
print("The confusion_matrix of the updated model is:/n ", cf_matrix)

# Classification report
print("Classification report:\n", classification_report(original_y_test, initial_prediction))



Training data shape: (1279, 11)
Testing data shape: (320, 11)

 --- Evaluation under best parameters ---
The accuracy of the updated model is:  0.409375
The f1 score of the updated model is:  0.28686190338854645
The confusion_matrix of the updated model is:/n  [[  0   0   0   1   0   0]
 [  0   0   5   2   3   0]
 [  0   0 119   4   7   0]
 [  0   0 120   3   9   0]
 [  0   0  32   1   9   0]
 [  0   0   3   1   1   0]]
Classification report:
               precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00        10
         5.0       0.43      0.92      0.58       130
         6.0       0.25      0.02      0.04       132
         7.0       0.31      0.21      0.25        42
         8.0       0.00      0.00      0.00         5

    accuracy                           0.41       320
   macro avg       0.16      0.19      0.15       320
weighted avg       0.32      0.41      0.29       320



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
