<a href="https://colab.research.google.com/github/Hero0963/dscamp_prediction-of-quality-of-wine/blob/main/dscamp_lv2_prediction_of_quality_of_wine_finale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 專題實作 #02：紅酒品質分類預測

接下來有幾個任務請你完成，將答案以文字或註解方式補充於程式碼中：

1. 請嘗試使用 scikit-learn 中「不同的基本分類模型」，並比較各模型的結果。


2. 可以利用「sklearn.model_selection 下的 GridSearchCV(...)」進行參數的調整。


3. 最後也請嘗試看看除了 scikit-learn 之外的進階模型套件，例如 pytorch、tensorflow 或 xgboost。


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the red-wine quality CSV straight from its GitHub-hosted raw URL.
WINE_CSV_URL = 'https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv'
df = pd.read_csv(WINE_CSV_URL)

In [None]:
# Separate the target from the predictors: `labels` is the `quality` column,
# `features` is every remaining column of `df`.
labels = df['quality']
features = df.drop(columns='quality')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# First modelling inputs: the raw (unscaled) features and the original
# quality scores. These are aliases of the existing objects, not copies.
X, y = features, labels

In [None]:
# Rank the raw features by random-forest importance.
# The forest is seeded: without `random_state` the importances (and therefore
# the ranking used to decide which features to exclude) change on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

feature_importance = model.feature_importances_

# One row per feature, most important first.
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
    .sort_values('Importance', ascending=False)
)

print(importance_df)

                 Feature  Importance
10               alcohol    0.145243
9              sulphates    0.111257
6   total sulfur dioxide    0.107677
1       volatile acidity    0.103896
7                density    0.091926
4              chlorides    0.081266
8                     pH    0.075992
2            citric acid    0.074448
0          fixed acidity    0.072347
3         residual sugar    0.070891
5    free sulfur dioxide    0.065056


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# Standardization: fit a StandardScaler on a copy of the raw features and keep
# the transformed values (a NumPy array, as the displayed output shows).
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
scaler = StandardScaler()
features_normalized_standardized = scaler.fit_transform(features.copy())

# Min-max normalization; `scaler` is rebound to the MinMaxScaler here.
scaler = MinMaxScaler()
features_normalized__minmax = scaler.fit_transform(features.copy())

# Show both scaled arrays.
display(features_normalized_standardized, features_normalized__minmax)

array([[-0.52835961,  0.96187667, -1.39147228, ...,  1.28864292,
        -0.57920652, -0.96024611],
       [-0.29854743,  1.96744245, -1.39147228, ..., -0.7199333 ,
         0.1289504 , -0.58477711],
       [-0.29854743,  1.29706527, -1.18607043, ..., -0.33117661,
        -0.04808883, -0.58477711],
       ...,
       [-1.1603431 , -0.09955388, -0.72391627, ...,  0.70550789,
         0.54204194,  0.54162988],
       [-1.39015528,  0.65462046, -0.77526673, ...,  1.6773996 ,
         0.30598963, -0.20930812],
       [-1.33270223, -1.21684919,  1.02199944, ...,  0.51112954,
         0.01092425,  0.54162988]])

array([[0.24778761, 0.39726027, 0.        , ..., 0.60629921, 0.13772455,
        0.15384615],
       [0.28318584, 0.52054795, 0.        , ..., 0.36220472, 0.20958084,
        0.21538462],
       [0.28318584, 0.43835616, 0.04      , ..., 0.40944882, 0.19161677,
        0.21538462],
       ...,
       [0.15044248, 0.26712329, 0.13      , ..., 0.53543307, 0.25149701,
        0.4       ],
       [0.11504425, 0.35958904, 0.12      , ..., 0.65354331, 0.22754491,
        0.27692308],
       [0.12389381, 0.13013699, 0.47      , ..., 0.51181102, 0.19760479,
        0.4       ]])

In [None]:
# Wrap the standardized array back into a DataFrame so that columns can be
# selected by their original feature names.
features_normalized_standardized = pd.DataFrame(
    data=features_normalized_standardized,
    columns=features.columns,
)
print(type(features_normalized_standardized))

<class 'pandas.core.frame.DataFrame'>


In [None]:
from sklearn.cluster import KMeans

# Cluster on a feature subset: leave out the two features that sit at the
# bottom of the recorded random-forest importance table.
excluded_features = ['residual sugar', 'free sulfur dioxide']
selected_features = [f for f in features.columns if f not in excluded_features]

# One cluster per distinct quality score present in the data.
n_clusters = df['quality'].nunique()

# `n_init` is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so leaving it unset makes the clustering depend on the
# installed version (and emits a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(features_normalized_standardized[selected_features].values)

# Cluster assignment of every row.
cluster_labels = kmeans.labels_

# Name of the new column.
new_feature_name = 'cluster_label'

# Attach the cluster assignment to the standardized feature frame.
features_normalized_standardized[new_feature_name] = cluster_labels




In [None]:
# Columns left out of the classifier inputs. 'cluster_label' is listed as well,
# although the filter below runs over `features.columns`, which does not
# contain it.
excluded_features = ['residual sugar', 'free sulfur dioxide', 'cluster_label']
selected_features = [
    column for column in features.columns
    if column not in excluded_features
]

# Standardized predictors paired with the original quality scores.
X, y = features_normalized_standardized[selected_features], labels

In [None]:
# Inspect the raw quality scores.
display(labels)

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

In [None]:
# Check which container type holds the target.
display(type(labels))

pandas.core.series.Series

In [None]:
# List the distinct quality scores present in the data.
print(labels.unique())

[5 6 7 4 8 3]


In [None]:
def convert_label(value, threshold=6.5):
    """Map a numeric quality score to a binary class name.

    Parameters
    ----------
    value : int or float
        Quality score of a single wine.
    threshold : int or float, default 6.5
        Highest score that is still labelled "bad". The default keeps the
        original cut-off, so integer scores up to 6 are "bad" and 7 or more
        are "good".

    Returns
    -------
    str
        "bad" when ``value <= threshold``, otherwise "good".
    """
    return "bad" if value <= threshold else "good"

# Binarize every quality score and print the resulting "bad"/"good" Series.
converted_labels = labels.apply(convert_label)
print(converted_labels)

0       bad
1       bad
2       bad
3       bad
4       bad
       ... 
1594    bad
1595    bad
1596    bad
1597    bad
1598    bad
Name: quality, Length: 1599, dtype: object


In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string classes as integers. `converted_labels` is rebound from the
# "bad"/"good" Series to the encoded NumPy array; rows that were "bad" come out
# as 0 in the displayed result.
label_quality = LabelEncoder()
converted_labels = label_quality.fit_transform(converted_labels)
display(converted_labels)

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Final modelling inputs: the standardized feature subset and the binary
# (integer-encoded) target.
X, y = features_normalized_standardized[selected_features], converted_labels


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the split is seeded and stratified on
# the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43, stratify=y
)

# Baseline classifiers with default hyperparameters. The tree-based models are
# seeded (same seed as the split) so the accuracy comparison is reproducible
# from run to run; previously they were left unseeded.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=43),
    RandomForestClassifier(random_state=43),
    SVC(),
    KNeighborsClassifier(),
    GaussianNB(),
]

In [None]:
# Train each baseline on the training split and report its accuracy on the
# held-out test split.
for model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    display(f"{type(model).__name__} accuracy：{accuracy}")


'LogisticRegression accuracy：0.8625'

'DecisionTreeClassifier accuracy：0.84375'

'RandomForestClassifier accuracy：0.884375'

'SVC accuracy：0.88125'

'KNeighborsClassifier accuracy：0.8875'

'GaussianNB accuracy：0.84375'

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for each baseline model.

# The solver is set explicitly: the default 'lbfgs' solver rejects
# penalty='l1', which made half of the LogisticRegression fits fail and score
# NaN. 'liblinear' supports both 'l1' and 'l2'.
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

param_grid_dt = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

param_grid_svc = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 0.01]
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}

param_grid_nb = {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
}

# Class name of every estimator in `models`, in the same order.
model_names = [model.__class__.__name__ for model in models]

# Search space keyed by estimator class name.
param_grids = {
    'LogisticRegression': param_grid_lr,
    'DecisionTreeClassifier': param_grid_dt,
    'RandomForestClassifier': param_grid_rf,
    'SVC': param_grid_svc,
    'KNeighborsClassifier': param_grid_knn,
    'GaussianNB': param_grid_nb
}

# Run a 5-fold grid search per model on the full X / y and report the best
# parameter combination together with its mean cross-validated score.
for model, model_name in zip(models, model_names):
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    grid_search.fit(X, y)

    print(f"{model_name}:")
    print("最佳參數組合：", grid_search.best_params_)
    print("最佳分數：", grid_search.best_score_)
    print()

15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



LogisticRegression:
最佳參數組合： {'C': 0.1, 'penalty': 'l2'}
最佳分數： 0.8674275078369906

DecisionTreeClassifier:
最佳參數組合： {'max_depth': 3, 'min_samples_split': 2}
最佳分數： 0.8592672413793103

RandomForestClassifier:
最佳參數組合： {'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 300}
最佳分數： 0.878064263322884

SVC:
最佳參數組合： {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
最佳分數： 0.8692985893416928

KNeighborsClassifier:
最佳參數組合： {'n_neighbors': 5, 'weights': 'distance'}
最佳分數： 0.8611677115987462

GaussianNB:
最佳參數組合： {'var_smoothing': 1e-09}
最佳分數： 0.8111794670846395



In [None]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import xgboost as xgb


# Register XGBoost next to the scikit-learn baselines. The guard keeps the cell
# idempotent: re-running it no longer appends a second XGBClassifier.
if not any(isinstance(m, xgb.XGBClassifier) for m in models):
    models.append(xgb.XGBClassifier())


In [None]:
# Search space for the XGBoost classifier: tree depth, number of boosting
# rounds and learning rate.
param_grid_xgb = dict(
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
    learning_rate=[0.1, 0.01, 0.001],
)

# Make the grid available under the estimator's class name as well.
param_grids['XGBClassifier'] = param_grid_xgb

In [None]:
# Tune only the XGBoost entry of `models`; every other estimator is passed
# over without being refitted.
for model in models:
    model_name = type(model).__name__

    if model_name == 'XGBClassifier':
        param_grid = param_grid_xgb

        # 5-fold grid search on the full X / y.
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
        grid_search.fit(X, y)

        # Keep the winning configuration and the estimator refitted with it.
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_

        print(f"Best parameters for {model_name}:")
        print(best_params)
        print()

Best parameters for XGBClassifier:
{'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 300}

