In [3]:
import pandas as pd
from sklearn import preprocessing

# Đọc dữ liệu wine.csv
duLieu = pd.read_csv('Wine.csv')

print("5 dòng đầu:")
print(duLieu.head())
print("\nTên cột:")
print(duLieu.columns)


5 dòng đầu:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0           11.6             0.580         0.66            2.20      0.074   
1           10.4             0.610         0.49            2.10      0.200   
2            7.4             1.185         0.00            4.25      0.097   
3           10.4             0.440         0.42            1.50      0.145   
4            8.3             1.020         0.02            3.40      0.084   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                   10                    47  1.00080  3.25       0.57   
1                    5                    16  0.99940  3.16       0.63   
2                    5                    14  0.99660  3.63       0.54   
3                   34                    48  0.99832  3.38       0.86   
4                    6                    11  0.99892  3.48       0.49   

   alcohol   class  
0      9.0  Loai 2  
1      8.4  Loai 2  
2     10.7 

Điều chỉnh tỉ lệ (Rescale Data – MinMaxScaler)

In [4]:
from sklearn.preprocessing import MinMaxScaler

X = duLieu.select_dtypes(include=['float64', 'int64'])  # chỉ lấy cột số

scaler = MinMaxScaler(feature_range=(0,1))
X_rescaled = scaler.fit_transform(X)

df_rescaled = pd.DataFrame(X_rescaled, columns=X.columns)
print("\n[4.1] Rescale Data (0–1):")
print(df_rescaled.head())



[4.1] Rescale Data (0–1):
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       0.619469          0.315068         0.66        0.089041   0.103506   
1       0.513274          0.335616         0.49        0.082192   0.313856   
2       0.247788          0.729452         0.00        0.229452   0.141903   
3       0.513274          0.219178         0.42        0.041096   0.222037   
4       0.327434          0.616438         0.02        0.171233   0.120200   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             0.126761              0.144876  0.787812  0.401575   0.143713   
1             0.056338              0.035336  0.685022  0.330709   0.179641   
2             0.056338              0.028269  0.479442  0.700787   0.125749   
3             0.464789              0.148410  0.605727  0.503937   0.317365   
4             0.070423              0.017668  0.649780  0.582677   0.095808   

    alcohol  
0  0.092308  
1

Chuẩn hóa dữ liệu (Standardize Data – StandardScaler)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler_std = StandardScaler()
X_standardized = scaler_std.fit_transform(X)

df_standardized = pd.DataFrame(X_standardized, columns=X.columns)
print("\n[4.2] Standardized Data (mean≈0, std≈1):")
print(df_standardized.head())



[4.2] Standardized Data (mean≈0, std≈1):
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       1.884668          0.291499     1.997658       -0.240375  -0.286214   
1       1.195232          0.459094     1.124700       -0.311323   2.391755   
2      -0.528360          3.671318    -1.391472        1.214055   0.202621   
3       1.195232         -0.490607     0.765247       -0.737010   1.222800   
4      -0.011282          2.749549    -1.288771        0.610998  -0.073677   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0            -0.561868              0.016165  2.148316 -0.395969  -0.520193   
1            -1.040009             -0.926496  1.406296 -0.979104  -0.166115   
2            -1.040009             -0.987313 -0.077742  2.066156  -0.697233   
3             1.733209              0.046573  0.833882  0.446337   1.191186   
4            -0.944381             -1.078538  1.151890  1.094265  -0.992298   

    alcohol  


→ Sau khi chuẩn hóa, mỗi cột có mean≈0 và std≈1.

Bình thường hóa dữ liệu (Normalize Data – theo hàng)

In [7]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer(norm='l2')
X_normalized = normalizer.fit_transform(X)

df_normalized = pd.DataFrame(X_normalized, columns=X.columns)
print("\n[4.3] Normalized Data (L2=1 cho mỗi hàng):")
print(df_normalized.head())



[4.3] Normalized Data (L2=1 cho mỗi hàng):
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       0.230072          0.011504     0.013090        0.043634   0.001468   
1       0.476601          0.027955     0.022455        0.096237   0.009165   
2       0.359296          0.057536     0.000000        0.206353   0.004710   
3       0.171393          0.007251     0.006922        0.024720   0.002390   
4       0.429846          0.052824     0.001036        0.176082   0.004350   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             0.198338              0.932190  0.019850  0.064460   0.011305   
1             0.229135              0.733233  0.045800  0.144813   0.028871   
2             0.242768              0.679750  0.048388  0.176249   0.026219   
3             0.560323              0.791045  0.016452  0.055703   0.014173   
4             0.310732              0.569676  0.051733  0.180225   0.025376   

    alcohol 

→ Mỗi hàng (mẫu) được đưa về độ dài vector = 1.

4.4 Số hóa dữ liệu (Digitalization – Label/OneHot Encoding)

In [6]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

# Nếu có cột phân loại, ví dụ 'Class'
if 'Class' in duLieu.columns:
    y = duLieu['Class']

    # Label Encoding
    le = LabelEncoder()
    y_label = le.fit_transform(y)
    print("\n[4.4] LabelEncoder:")
    print(y_label[:10])

    # One-Hot Encoding
    ohe = OneHotEncoder(sparse_output=False)
    y_onehot = ohe.fit_transform(np.array(y_label).reshape(-1,1))
    print("\n[4.4] OneHotEncoder (first 5 rows):")
    print(y_onehot[:5])
else:
    print("\nKhông có cột phân loại để số hóa trong file wine.csv.")



Không có cột phân loại để số hóa trong file wine.csv.


In [9]:
# ===============================
# CHUẨN HÓA DỮ LIỆU WINE.CSV
# ===============================
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, LabelEncoder, OneHotEncoder
import numpy as np

# Đọc dữ liệu
duLieu = pd.read_csv('Wine.csv')
print("5 dòng đầu:")
print(duLieu.head())

# -------------------------------
# Tách phần đặc trưng và nhãn
# -------------------------------
X = duLieu.drop('class', axis=1)
y = duLieu['class']

# ===============================
# 4.1. Điều chỉnh tỉ lệ (Rescale Data)
# ===============================
scaler_minmax = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler_minmax.fit_transform(X)
df_rescaled = pd.DataFrame(X_rescaled, columns=X.columns)
print("\n[4.1] Sau khi Rescale (MinMax):")
print(df_rescaled.describe().T[['min', 'max']])

# ===============================
# 4.2. Chuẩn hóa dữ liệu (Standardize Data)
# ===============================
scaler_std = StandardScaler()
X_standardized = scaler_std.fit_transform(X)
df_standardized = pd.DataFrame(X_standardized, columns=X.columns)
print("\n[4.2] Sau khi Standardize (Z-score):")
print(df_standardized.describe().T[['mean', 'std']])

# ===============================
# 4.3. Bình thường hóa dữ liệu (Normalize Data)
# ===============================
normalizer = Normalizer(norm='l2')
X_normalized = normalizer.fit_transform(X)
df_normalized = pd.DataFrame(X_normalized, columns=X.columns)
print("\n[4.3] Sau khi Normalize (L2=1 cho mỗi hàng):")
print(df_normalized.iloc[:5, :5])  # in 5 cột đầu 5 dòng đầu

# ===============================
# 4.4. Số hóa dữ liệu (Digitalization)
# ===============================
# a) Label Encoding
le = LabelEncoder()
y_label = le.fit_transform(y)
print("\n[4.4.a] Label Encoding:")
print("Class gốc → số:", dict(zip(le.classes_, le.transform(le.classes_))))
print("10 giá trị đầu:", y_label[:10])

# b) One-Hot Encoding
ohe = OneHotEncoder(sparse_output=False)
y_onehot = ohe.fit_transform(y_label.reshape(-1, 1))
print("\n[4.4.b] One-Hot Encoding:")
print("Kích thước ma trận:", y_onehot.shape)
print("5 hàng đầu:\n", y_onehot[:5])

# ===============================
# Tóm tắt
# ===============================
print("\nTÓM TẮT:")
print("- 4.1: Rescale → dữ liệu nằm trong [0,1]")
print("- 4.2: Standardize → mean≈0, std≈1")
print("- 4.3: Normalize → mỗi hàng có độ dài = 1")
print("- 4.4: Digitalization → chuyển Class sang số (Label/OneHot)")


5 dòng đầu:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0           11.6             0.580         0.66            2.20      0.074   
1           10.4             0.610         0.49            2.10      0.200   
2            7.4             1.185         0.00            4.25      0.097   
3           10.4             0.440         0.42            1.50      0.145   
4            8.3             1.020         0.02            3.40      0.084   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                   10                    47  1.00080  3.25       0.57   
1                    5                    16  0.99940  3.16       0.63   
2                    5                    14  0.99660  3.63       0.54   
3                   34                    48  0.99832  3.38       0.86   
4                    6                    11  0.99892  3.48       0.49   

   alcohol   class  
0      9.0  Loai 2  
1      8.4  Loai 2  
2     10.7 