In [3]:
# Remember to run `conda activate test_env_gpu` before starting the notebook
#Library Part
import csv
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import GridSearchCV
import cupy as cp
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from tqdm import tqdm

####  Access the file

In [4]:
# Locate the data folder relative to the notebook's working directory.
script_dir = os.getcwd()

# Echo the working directory so a wrong launch location is spotted early.
print(script_dir)

# Both CSV files live in the "Data" sub-folder of the working directory.
file_path = os.path.join(script_dir, "Data")
Original_File_path, Data_Without_Outlier_Path = (
    os.path.join(file_path, csv_name)
    for csv_name in ("Original_Data.csv", "Data_Without_Outlier.csv")
)

# Show the resolved paths before attempting to read them.
print("Original File Path:", Original_File_path)
print("Data Without Outlier Path:", Data_Without_Outlier_Path)

# Load both datasets (UTF-8 encoded CSV).
Original_Data = pd.read_csv(Original_File_path, encoding='utf-8')
Data_Without_Outlier = pd.read_csv(Data_Without_Outlier_Path, encoding='utf-8')

# Preview the first rows of each frame, one after the other.
print(Original_Data.head(), Data_Without_Outlier.head(), sep="\n")

C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417
Original File Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Original_Data.csv
Data Without Outlier Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Data_Without_Outlier.csv
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0             

#### Process the Data_Without_Outlier data

In [3]:
# Build the feature matrix and the target vector for Data_Without_Outlier
outlier_x = Data_Without_Outlier.drop(['quality'], axis=1)
outlier_y = Data_Without_Outlier['quality']

# Hold out 20% of the rows as the test set (fixed seed so the split is reproducible)
Outlier_x_train, Outlier_x_test, Outlier_y_train, Outlier_y_test = train_test_split(
    outlier_x, outlier_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {Outlier_x_train.shape}")
print(f"Testing data shape: {Outlier_x_test.shape}")

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
Outlier_x_train_scaled = scaler.fit_transform(Outlier_x_train)
Outlier_x_test_scaled = scaler.transform(Outlier_x_test)

# Hyper-parameter search space for the SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'gamma': [1, 0.1, 'scale', 'auto'],
    'class_weight': ['balanced'],                     # because the quality distribution is non-uniform
    'decision_function_shape': ['ovr'],               # because this is a multi-class problem
    'break_ties': [True]
}

# 5-fold cross-validated grid search, selecting on accuracy
gs = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=3)
gs.fit(Outlier_x_train_scaled, Outlier_y_train)

# Best hyper-parameters found on the training set
print("The best parameter we got: ", gs.best_params_)

# Mean cross-validated score of the best candidate (this is a score, not a parameter)
print("The best cross-validation score we got: ", gs.best_score_)

# Train a fresh model on the full training split with the selected hyper-parameters
updated_model = SVC(**gs.best_params_)
updated_model.fit(Outlier_x_train_scaled, Outlier_y_train)

# Predict on the held-out test split
final_prediction = updated_model.predict(Outlier_x_test_scaled)

# Evaluate the tuned model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction)
# Store the result under its own name: assigning it to `f1_score` would shadow
# the imported sklearn function and break every later call to f1_score(...).
f1_outlier = f1_score(Outlier_y_test, final_prediction, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_outlier)
print("The confusion_matrix of the updated model is:\n ", cf_matrix)

# Per-class precision / recall / F1
print(classification_report(Outlier_y_test, final_prediction))

Training data shape: (1279, 11)
Testing data shape: (320, 11)
Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.434 total time=   0.0s
[CV 2/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.418 total time=   0.0s
[CV 3/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.410 total time=   0.0s
[CV 4/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.504 total time=   0.0s
[CV 5/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.490 total time=   0.0s
[CV 1/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=rbf;, score=0.129 total time=   0.0s
[CV 2/5] 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Process the Original data

In [6]:
# Build the feature matrix and the target vector for the original data
original_x = Original_Data.drop(['quality'], axis=1)
original_y = Original_Data['quality']

# Hold out 20% of the rows as the test set (fixed seed so the split is reproducible)
original_x_train, original_x_test, original_y_train, original_y_test = train_test_split(
    original_x, original_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {original_x_train.shape}")
print(f"Testing data shape: {original_x_test.shape}")

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
original_x_train_scaled = scaler.fit_transform(original_x_train)
original_x_test_scaled = scaler.transform(original_x_test)

# Hyper-parameter search space for the SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'gamma': [1, 0.1, 'scale', 'auto'],
    'class_weight': ['balanced'],                     # because the quality distribution is non-uniform
    'decision_function_shape': ['ovr'],               # because this is a multi-class problem
    'break_ties': [True]
}

# 5-fold cross-validated grid search, selecting on accuracy
gs = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=3)
gs.fit(original_x_train_scaled, original_y_train)

# Best hyper-parameters found on the training set
print("The best parameter we got: ", gs.best_params_)

# Mean cross-validated score of the best candidate (this is a score, not a parameter)
print("The best cross-validation score we got: ", gs.best_score_)

# Train a fresh model on the full training split with the selected hyper-parameters
updated_model = SVC(**gs.best_params_)
updated_model.fit(original_x_train_scaled, original_y_train)

# Predict on the held-out test split
final_prediction = updated_model.predict(original_x_test_scaled)

# Evaluate the tuned model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(original_y_test, final_prediction)
# Result kept under its own name so the imported f1_score function is not shadowed
f1_orig = f1_score(original_y_test, final_prediction, average='weighted')
cf_matrix = confusion_matrix(original_y_test, final_prediction)

# Print the evaluation results
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_orig)
print("The confusion_matrix of the updated model is:\n", cf_matrix)

# Per-class precision / recall / F1 (printed once; the cell used to print it twice)
print("Classification report:\n", classification_report(original_y_test, final_prediction))


Training data shape: (1279, 11)
Testing data shape: (320, 11)
Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.367 total time=   0.0s
[CV 2/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.379 total time=   0.0s
[CV 3/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.332 total time=   0.0s
[CV 4/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.449 total time=   0.0s
[CV 5/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=linear;, score=0.467 total time=   0.0s
[CV 1/5] END C=0.1, break_ties=True, class_weight=balanced, decision_function_shape=ovr, gamma=1, kernel=rbf;, score=0.188 total time=   0.0s
[CV 2/5] 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Build the feature matrix and the target vector for Data_Without_Outlier
outlier_x = Data_Without_Outlier.drop(['quality'], axis=1)
outlier_y = Data_Without_Outlier['quality']

# Hold out 20% of the rows as the test set (fixed seed so the split is reproducible)
Outlier_x_train, Outlier_x_test, Outlier_y_train, Outlier_y_test = train_test_split(
    outlier_x, outlier_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {Outlier_x_train.shape}")
print(f"Testing data shape: {Outlier_x_test.shape}")

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
Outlier_x_train_scaled = scaler.fit_transform(Outlier_x_train)
Outlier_x_test_scaled = scaler.transform(Outlier_x_test)

# --- PCA: choose how many components to keep ---

# First fit a PCA that keeps every component so the full variance spectrum is visible
pca = PCA()
pca.fit(Outlier_x_train_scaled)

# Fraction of the total variance explained by each component
ev_result = pca.explained_variance_ratio_
print("Explained variance ratio:", ev_result)

# Cumulative explained variance across components
cv_result = pca.explained_variance_ratio_.cumsum()

# Keep the smallest number of components whose cumulative variance reaches the threshold
threshold = 0.9
n_features = (cv_result >= threshold).argmax() + 1
print("HOW MANY COMPONENTS KEEPS: ", n_features)

# Refit PCA restricted to the selected number of components
pca = PCA(n_components=n_features)
pca.fit(Outlier_x_train_scaled)

# Loadings of each kept component, and for each component the original
# feature indices ordered from largest to smallest absolute loading
print("Find which components have been kept:\n", pca.components_)
top_features = np.argsort(np.abs(pca.components_), axis=1)[:, ::-1]
print("Find which components has been kept:(Index)\n ", top_features)

# Project both splits into the reduced PCA space
Outlier_x_train_scaled_t = pca.transform(Outlier_x_train_scaled)
Outlier_x_test_scaled_t = pca.transform(Outlier_x_test_scaled)

# Hyper-parameter search space for the SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'gamma': [1, 0.1, 'scale', 'auto'],
    'class_weight': ['balanced'],                     # because the quality distribution is non-uniform
    'decision_function_shape': ['ovr'],               # because this is a multi-class problem
    'break_ties': [True]
}

# 5-fold cross-validated grid search, selecting on weighted F1
gs = GridSearchCV(SVC(), param_grid, cv=5, scoring='f1_weighted', verbose=3)

# Tune on the PCA-reduced features: the final model below is trained and
# evaluated in that space, so the hyper-parameters must be selected there too
# (the search was previously run on the un-reduced 11-feature matrix).
gs.fit(Outlier_x_train_scaled_t, Outlier_y_train)

# Best hyper-parameters found on the training set
print("The best parameter we got: ", gs.best_params_)

# Mean cross-validated score of the best candidate (this is a score, not a parameter)
print("The best cross-validation score we got: ", gs.best_score_)

# Train a fresh model on the PCA-reduced training split with the selected hyper-parameters
updated_model = SVC(**gs.best_params_)
updated_model.fit(Outlier_x_train_scaled_t, Outlier_y_train)

# Predict on the PCA-reduced test split
final_prediction = updated_model.predict(Outlier_x_test_scaled_t)

# Evaluate the tuned model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction)
# Result kept under its own name so the imported f1_score function is not shadowed
f1_pca = f1_score(Outlier_y_test, final_prediction, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1_pca)
print("The confusion_matrix of the updated model is:\n", cf_matrix)

# Per-class precision / recall / F1
print("Classification report:\n", classification_report(Outlier_y_test, final_prediction))




Training data shape: (1279, 11)
Testing data shape: (320, 11)
Exlpained variance raitor
HOW MANY COMPONENTS KEEPS:  7
Find which components have been kept:
 [[ 0.50349721 -0.21946622  0.45736688  0.18873163  0.22226281 -0.06192714
   0.01015142  0.41207779 -0.41950186  0.21187714 -0.09747018]
 [ 0.05316909 -0.39642069  0.2014572  -0.1420794  -0.30136773 -0.32592056
  -0.44747557 -0.29671099  0.0218534   0.2406422   0.48533222]
 [-0.09817501 -0.33379644  0.18198616  0.0553366  -0.1966896   0.61884599
   0.51866334 -0.16619321  0.02964326  0.28461977  0.20635978]
 [-0.04327114  0.20349882 -0.0522774   0.78619684  0.09014402 -0.03680007
  -0.0654324   0.16713219  0.32061877  0.12638839  0.41882246]
 [-0.18276764  0.02665526 -0.11708226 -0.26108443  0.55357957 -0.0096233
  -0.0917904   0.02379509  0.21890098  0.71659172 -0.08400105]
 [-0.03371095  0.14888509  0.11775918 -0.01577243  0.59449215  0.03862248
   0.08566158 -0.46572821 -0.37528231 -0.27001498  0.41192551]
 [ 0.30657756  0.64760

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
