In [10]:
# Remember type conda activate test_env_gpu
#Library Part
import csv
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import GridSearchCV
import cupy as cp
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from tqdm import tqdm

In [11]:
# Get the path of the file
# Current directiory for Jupyter
script_dir = os.getcwd() 

# Check the path whether correct 
print(script_dir)

#Buld corresponding path
file_path = os.path.join(script_dir, "Data")
Original_File_path = os.path.join(file_path, "Original_Data.csv")
Data_Without_Outlier_Path = os.path.join(file_path, "Data_Without_Outlier.csv")

# Test whether path correct
print("Original File Path:", Original_File_path)
print("Data Without Outlier Path:", Data_Without_Outlier_Path)


# Read the csv files
Original_Data = pd.read_csv(Original_File_path,encoding='utf-8')
Data_Without_Outlier = pd.read_csv(Data_Without_Outlier_Path,encoding='utf-8')

# Check the csv file
print(Original_Data.head())
print(Data_Without_Outlier.head())

C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417
Original File Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Original_Data.csv
Data Without Outlier Path: C:\Users\judyw\Desktop\ESE417_Final_Project\WashU\ESE_417\Data\Data_Without_Outlier.csv
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0             

In [None]:
# Do the pca process for data
# Set the target and features vector for Data_Without_Outlier
original_x = Original_Data.drop(['quality'], axis=1)
original_y = Original_Data['quality']

# Split data into training and test sets
original_x_train, original_x_test, original_y_train, original_y_test = train_test_split(
    original_x, original_y, test_size=0.2, random_state=42
)

# Check training and testing data shapes
print(f"Training data shape: {original_x_train.shape}")
print(f"Testing data shape: {original_x_test.shape}")

# Normalization data
scaler = StandardScaler()
original_x_train_scaled = scaler.fit_transform(original_x_train)
original_x_test_scaled = scaler.transform(original_x_test)
# Do the pca process for data

# Call the pca method from packages
# Keep all components
pca = PCA()
pca_process = pca.fit(original_x_train_scaled)

# Use the explained variance 
ev_result = pca.explained_variance_ratio_
#print("featire {i}  ---->    ratio: {}")
print("Exlpained variance raitor")

# Get the sum of explained variance -> cumulative variance 
cv_result = pca.explained_variance_ratio_.cumsum()
# decide how much feature contains
threshold = 0.9
n_features = (cv_result >= threshold).argmax() + 1
print("HOW MANY COMPONENTS KEEPS: ", n_features)
# Update new pca with limited components
pca = PCA(n_components=n_features)
X_reduced = pca.fit_transform(original_x_train_scaled)
print("Find which components has been kept:\n ", pca.components_)
top_features = np.argsort(np.abs(pca.components_), axis=1)[:, ::-1]
print("Find which components has been kept:(Index)\n ", top_features)

# transform the data depend on the  pca
pca.fit(original_x_train_scaled)
original_x_train_scaled_t = pca.transform(original_x_train_scaled)
original_x_test_scaled_t = pca.transform(original_x_test_scaled)


# Initial the parameter range
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'gamma': [1, 0.1, 'scale', 'auto']
}
# Find the best hyper parameter
gs = GridSearchCV(SVC(), param_grid, cv=5, scoring='f1_weighted', verbose=3)

# Trainning model 
gs.fit(original_x_train_scaled_t, original_y_train)

# Get the best parameter depend on trainning set
print("The best parameter we got: ", gs.best_params_)

# The best crossvalidation score depend on the training set
print("The best parameter we got: ", gs.best_score_)

# We apply the best parameter for train model and get the data
updated_model = SVC(**gs.best_params_)
updated_model.fit(original_x_train_scaled_t, original_y_train)

# Predict the result based on updated model 
final_prediction = updated_model.predict(original_x_test_scaled_t)

# Evaluation the performance of updated model
print("\n --- Evaluation under best parameters ---")
accuracy = accuracy_score(Outlier_y_test, final_prediction)
f1 = f1_score(Outlier_y_test, final_prediction, average='weighted')
cf_matrix = confusion_matrix(Outlier_y_test, final_prediction)
print("The accuracy of the updated model is: ", accuracy)
print("The f1 score of the updated model is: ", f1)
print("The confusion_matrix of the updated model is:\n", cf_matrix)

# Classification report
print("Classification report:\n", classification_report(Outlier_y_test, final_prediction))




Training data shape: (1279, 11)
Testing data shape: (320, 11)
Exlpained variance raitor
HOW MANY COMPONENTS KEEPS:  7
Find which components has been kept:
  [[ 0.48681861 -0.23069485  0.4673691   0.15233568  0.21629492 -0.03599716
   0.02460332  0.39321441 -0.43845363  0.24519354 -0.11173119]
 [-0.10928945  0.2933553  -0.15739982  0.23918196  0.14946568  0.50595079
   0.56224062  0.23712774 -0.0012195  -0.03871665 -0.40968894]
 [ 0.15163968  0.44260833 -0.22098423 -0.04924869  0.02177039 -0.44175911
  -0.33372421  0.34471498 -0.05907731 -0.33838548 -0.43130457]
 [-0.21199618  0.12818339 -0.09764578 -0.40605566  0.66383339 -0.09985778
  -0.07071603 -0.17256668 -0.04917505  0.49538895 -0.17594314]
 [-0.11695598  0.20440682 -0.05566935  0.72241156  0.28644568 -0.12773764
  -0.22131848  0.15789213  0.2837971   0.23804604  0.33177563]
 [-0.15168274 -0.4766169  -0.06769707 -0.04030962 -0.23462024 -0.01959369
  -0.14006669  0.36856072  0.49686813  0.33109514 -0.41980082]
 [ 0.33464166  0.4684