In [1]:
# Remember to run: conda activate test_env_gpu
#Library Part
import csv
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import GridSearchCV
import cupy as cp
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from tqdm import tqdm

####  Access the file

In [2]:
# Resolve the data folder relative to the current working directory and
# load both CSV variants of the dataset into DataFrames.
script_dir = os.getcwd()
print(script_dir)  # sanity check: confirm which directory paths are resolved from

# Both CSV files are expected inside the "Data" sub-folder
file_path = os.path.join(script_dir, "Data")
Original_File_path, Data_Without_Outlier_Path = (
    os.path.join(file_path, csv_name)
    for csv_name in ("Original_Data.csv", "Data_Without_Outlier.csv")
)

# Echo the resolved paths so a wrong location is obvious before reading
print("Original File Path:", Original_File_path)
print("Data Without Outlier Path:", Data_Without_Outlier_Path)

# Load the two datasets (same order as before: original first, then outlier-free)
Original_Data = pd.read_csv(Original_File_path, encoding='utf-8')
Data_Without_Outlier = pd.read_csv(Data_Without_Outlier_Path, encoding='utf-8')

# Preview the first rows of each frame
print(Original_Data.head(), Data_Without_Outlier.head(), sep="\n")

C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417
Original File Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Original_Data.csv
Data Without Outlier Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Data_Without_Outlier.csv
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0             

#### Process the Data_Without_Outlier data

In [None]:
# Train and evaluate an SVM classifier on the outlier-free dataset:
# split -> standardize -> grid-search hyper-parameters -> score on the held-out test set.

# Separate the feature columns from the target column ('quality')
outlier_x = Data_Without_Outlier.drop(['quality'], axis=1)
outlier_y = Data_Without_Outlier['quality']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
Outlier_x_train, Outlier_x_test, Outlier_y_train, Outlier_y_test = train_test_split(
    outlier_x, outlier_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {Outlier_x_train.shape}")
print(f"Testing data shape: {Outlier_x_test.shape}")

# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
Outlier_x_train_scaled = scaler.fit_transform(Outlier_x_train)
Outlier_x_test_scaled = scaler.transform(Outlier_x_test)

# Hyper-parameter search space.
# NOTE(review): per the scikit-learn docs, 'gamma' is only used by the
# rbf/poly/sigmoid kernels, so every linear candidate is fitted once per gamma
# value with the same outcome. Consider a list of per-kernel grids to skip those fits.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'gamma': [1, 0.1, 'scale', 'auto']
}

# 5-fold cross-validated grid search on the training split, selecting by accuracy
gs = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=3)
gs.fit(Outlier_x_train_scaled, Outlier_y_train)

# Best hyper-parameters found on the training set
print("The best parameter we got: ", gs.best_params_)

# Mean cross-validated accuracy of that best configuration
print("The best cross-validation score we got: ", gs.best_score_)

# With the default refit=True, GridSearchCV has already re-trained the best
# configuration on the full training split, so reuse it instead of fitting again.
updated_model = gs.best_estimator_

# Predict on the held-out test split
final_prediction = updated_model.predict(Outlier_x_test_scaled)

# Evaluate the tuned model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction)
# Keep the result under its own name: assigning it to `f1_score` would replace
# the imported metric function with a float, and any later f1_score(...) call
# in this kernel would fail.
weighted_f1 = f1_score(Outlier_y_test, final_prediction, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", weighted_f1)
print("The confusion_matrix of the updated model is:")
print(cf_matrix)

# Per-class precision / recall / F1 for the same test-set predictions
print(classification_report(Outlier_y_test, final_prediction))

Training data shape: (1279, 11)
Testing data shape: (320, 11)
Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.574 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.586 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.586 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.629 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.678 total time=   0.0s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.430 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.434 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.430 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.430 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.431 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.492 tota

#### Process the Original data

In [None]:
# Train and evaluate an SVM classifier on the original dataset:
# split -> standardize -> grid-search hyper-parameters -> score on the held-out test set.

# Separate the feature columns from the target column ('quality') for Original_Data
original_x = Original_Data.drop(['quality'], axis=1)
original_y = Original_Data['quality']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
original_x_train, original_x_test, original_y_train, original_y_test = train_test_split(
    original_x, original_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {original_x_train.shape}")
print(f"Testing data shape: {original_x_test.shape}")

# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
original_x_train_scaled = scaler.fit_transform(original_x_train)
original_x_test_scaled = scaler.transform(original_x_test)

# Hyper-parameter search space.
# NOTE(review): per the scikit-learn docs, 'gamma' is only used by the
# rbf/poly/sigmoid kernels, so every linear candidate is fitted once per gamma
# value with the same outcome. Consider a list of per-kernel grids to skip those fits.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'gamma': [1, 0.1, 'scale', 'auto']
}

# 5-fold cross-validated grid search on the training split, selecting by accuracy
gs = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=3)
gs.fit(original_x_train_scaled, original_y_train)

# Best hyper-parameters found on the training set
print("The best parameter we got: ", gs.best_params_)

# Mean cross-validated accuracy of that best configuration
print("The best cross-validation score we got: ", gs.best_score_)

# With the default refit=True, GridSearchCV has already re-trained the best
# configuration on the full training split, so reuse it instead of fitting again.
updated_model = gs.best_estimator_

# Predict on the held-out test split
final_prediction = updated_model.predict(original_x_test_scaled)

# Evaluate the tuned model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(original_y_test, final_prediction)
# Keep the result under its own name: assigning it to `f1_score` would replace
# the imported metric function with a float, and any later f1_score(...) call
# in this kernel would fail.
weighted_f1 = f1_score(original_y_test, final_prediction, average='weighted')
cf_matrix = confusion_matrix(original_y_test, final_prediction)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", weighted_f1)
print("The confusion_matrix of the updated model is:")
print(cf_matrix)

# Per-class precision / recall / F1 for the same test-set predictions
print(classification_report(original_y_test, final_prediction))