<a href="https://colab.research.google.com/github/NguyenSang2003/Machine-Learning/blob/main/Lab_8_21130512_NguyenVanSang.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The main aim of this lab is to deal with the **pipeline** technique and **MultilayerPerceptron** algorithm

*   **Deadline: 23:59, 06/5/2024**



# Import libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from prettytable import PrettyTable
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.datasets import fetch_openml
from sklearn.neural_network import MLPClassifier

#Task 1. With **iris** dataset
*  Apply **pipeline** including preprocessing steps (i.e., **StandardScaler**, **SimpleImputer**, **feature selection**, **KBinsDiscretizer**, …) and classification algorithms (i.e., **Random forest, kNN, Naïve Bayes**).


In [2]:
# Load dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Chia dữ liệu thành tập train và tập test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Xây dựng Pipeline
pipeline_rf = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Điền giá trị thiếu bằng mean
    ('scaler', StandardScaler()),  # Chuẩn hóa dữ liệu
    ('selector', SelectKBest(score_func=f_classif, k=2)),  # Chọn ra 2 đặc trưng tốt nhất
    ('classifier', RandomForestClassifier())  # Mô hình Random Forest
])

pipeline_knn = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Điền giá trị thiếu bằng median
    ('scaler', StandardScaler()),  # Chuẩn hóa dữ liệu
    ('discretizer', KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')),  # Phân loại đặc trưng
    ('classifier', KNeighborsClassifier())  # Mô hình kNN
])

pipeline_nb = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Điền giá trị thiếu bằng mode
    ('classifier', GaussianNB())  # Mô hình Naïve Bayes
])

# Huấn luyện và đánh giá mô hình Random Forest
pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy Random Forest:", accuracy_rf)

# Huấn luyện và đánh giá mô hình kNN
pipeline_knn.fit(X_train, y_train)
y_pred_knn = pipeline_knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy kNN:", accuracy_knn)

# Huấn luyện và đánh giá mô hình Naïve Bayes
pipeline_nb.fit(X_train, y_train)
y_pred_nb = pipeline_nb.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Accuracy Naïve Bayes:", accuracy_nb)

Accuracy Random Forest: 1.0
Accuracy kNN: 1.0
Accuracy Naïve Bayes: 1.0


#Task 2. With **fashion** dataset
*   2.1. Apply **MultilayerPerceptron** classification with 1 hidden layer
having 10 nodes

In [9]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/Data_Labs_ML/Lab5'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Data_Labs_ML/Lab5


In [5]:
# Đọc dữ liệu từ cả hai tệp CSV
train_data = pd.read_csv("fashion_train.csv")
test_data = pd.read_csv("fashion_test.csv")

# Kết hợp dữ liệu từ hai tập dữ liệu
fashion_df = pd.concat([train_data, test_data], axis=0)

# Chia dữ liệu thành features (X) và labels (y)
X = fashion_df.drop('y', axis=1)
y = fashion_df['y']

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Áp dụng Multilayer Perceptron classification với 1 hidden layer có 10 nodes
clf = MLPClassifier(hidden_layer_sizes=(10,), random_state=42)
clf.fit(X_train, y_train)

# Dự đoán và đánh giá mô hình
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.1975


*   2.2. Apply **MultilayerPerceptron** algorithm with the following settings (the first hidden layer has 250 neuron, the second one has 100 neurons).

In [8]:
# Áp dụng Multilayer Perceptron với cấu hình như yêu cầu
clf = MLPClassifier(hidden_layer_sizes=(250, 100), random_state=42)
clf.fit(X_train, y_train)

# Dự đoán và đánh giá mô hình
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.745


*   2.3. Find the best hyperparameters using **GridSearchCV**

In [7]:
# Tạo đối tượng MLPClassifier
mlp = MLPClassifier(random_state=42)

# Thiết lập các siêu tham số cần tìm kiếm
param_grid = {
    'hidden_layer_sizes': [(250, 100), (300, 200), (200, 100)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive'],
}

# Sử dụng GridSearchCV để tìm kiếm các siêu tham số tốt nhất
grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# In ra các siêu tham số tốt nhất
print("Best hyperparameters:", grid_search.best_params_)

# Dự đoán và đánh giá mô hình với các siêu tham số tốt nhất
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with best hyperparameters:", accuracy)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best hyperparameters: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (300, 200), 'learning_rate': 'constant', 'solver': 'adam'}
Accuracy with best hyperparameters: 0.795


*   2.4. Compare the **MultilayerPerceptron** using the best hyperparameters in 2.3 and other classification algorithms (i.e., Random forest, kNN, Naïve Bayes)  in termns of accuracy, precision, recall, and F1

In [12]:
# Multilayer Perceptron với các siêu tham số tốt nhất từ Task 2.3
mlp_clf = MLPClassifier(hidden_layer_sizes=(250, 100), activation='relu', solver='adam', learning_rate_init=0.01, max_iter=200, random_state=42)
mlp_clf.fit(X_train, y_train)
mlp_y_pred = mlp_clf.predict(X_test)

# Random Forest
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
rf_y_pred = rf_clf.predict(X_test)

# kNN
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_y_pred = knn_clf.predict(X_test)

# Naive Bayes
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
nb_y_pred = nb_clf.predict(X_test)

# Tính toán các độ đo đánh giá cho từng mô hình
mlp_accuracy = accuracy_score(y_test, mlp_y_pred)
mlp_precision = precision_score(y_test, mlp_y_pred, average='weighted')
mlp_recall = recall_score(y_test, mlp_y_pred, average='weighted')
mlp_f1 = f1_score(y_test, mlp_y_pred, average='weighted')

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_precision = precision_score(y_test, rf_y_pred, average='weighted')
rf_recall = recall_score(y_test, rf_y_pred, average='weighted')
rf_f1 = f1_score(y_test, rf_y_pred, average='weighted')

knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_precision = precision_score(y_test, knn_y_pred, average='weighted')
knn_recall = recall_score(y_test, knn_y_pred, average='weighted')
knn_f1 = f1_score(y_test, knn_y_pred, average='weighted')

nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_precision = precision_score(y_test, nb_y_pred, average='weighted')
nb_recall = recall_score(y_test, nb_y_pred, average='weighted')
nb_f1 = f1_score(y_test, nb_y_pred, average='weighted')

# In ra kết quả
print("Multilayer Perceptron:")
print("Accuracy:", mlp_accuracy)
print("Precision:", mlp_precision)
print("Recall:", mlp_recall)
print("F1 Score:", mlp_f1)
print()

print("Random Forest:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1)
print()

print("kNN:")
print("Accuracy:", knn_accuracy)
print("Precision:", knn_precision)
print("Recall:", knn_recall)
print("F1 Score:", knn_f1)
print()

print("Naive Bayes:")
print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1 Score:", nb_f1)

Multilayer Perceptron:
Accuracy: 0.7575
Precision: 0.7829040963126661
Recall: 0.7575
F1 Score: 0.7633053515168315

Random Forest:
Accuracy: 0.8025
Precision: 0.8031841659644237
Recall: 0.8025
F1 Score: 0.8017195713620683

kNN:
Accuracy: 0.7925
Precision: 0.8134277605480387
Recall: 0.7925
F1 Score: 0.7962572760619492

Naive Bayes:
Accuracy: 0.5425
Precision: 0.5914027896227799
Recall: 0.5425
F1 Score: 0.5157433968845091


#Task 3. With **breast cancer** dataset

In [14]:
# Import scikit-learn dataset library
from sklearn import datasets

# Load dataset
cancer = datasets.load_breast_cancer()

X = cancer.data
y = cancer.target

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

*   3.1. Apply **GridSearchCV** to **MultilayperPerceptron** to find the best hyperparameters (the setting of hyperparameters chosen by students)

In [15]:
# Định nghĩa mô hình Multilayer Perceptron
mlp_clf = MLPClassifier(max_iter=100)

# Thiết lập các siêu tham số để tìm kiếm
param_grid = {
    'hidden_layer_sizes': [(100,), (150,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate_init': [0.001, 0.01],
}

# Tạo GridSearchCV object
grid_search = GridSearchCV(estimator=mlp_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Tiến hành tìm kiếm siêu tham số tốt nhất
grid_search.fit(X_train, y_train)

# In ra các siêu tham số tốt nhất
print("Best hyperparameters:", grid_search.best_params_)

# Dự đoán và đánh giá mô hình với các siêu tham số tốt nhất
best_clf = grid_search.best_estimator_
accuracy = best_clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best hyperparameters: {'activation': 'relu', 'hidden_layer_sizes': (150,), 'learning_rate_init': 0.001, 'solver': 'adam'}
Accuracy: 0.9649122807017544


*   3.2. Compare the **MultilayerPerceptron** using the best hyperparameters in 3.1) and other classification algorithms (i.e., Random forest, kNN, Naïve Bayes)  in termns of accuracy, precision, recall, and F1

In [16]:
# Multilayer Perceptron với các siêu tham số tốt nhất từ Task 3.1
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', learning_rate_init=0.001, max_iter=100, random_state=42)
mlp_clf.fit(X_train, y_train)
mlp_y_pred = mlp_clf.predict(X_test)

# Random Forest
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
rf_y_pred = rf_clf.predict(X_test)

# kNN
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_y_pred = knn_clf.predict(X_test)

# Naive Bayes
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
nb_y_pred = nb_clf.predict(X_test)

# Tính toán các độ đo đánh giá cho từng mô hình
mlp_accuracy = accuracy_score(y_test, mlp_y_pred)
mlp_precision = precision_score(y_test, mlp_y_pred)
mlp_recall = recall_score(y_test, mlp_y_pred)
mlp_f1 = f1_score(y_test, mlp_y_pred)

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)

knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_precision = precision_score(y_test, knn_y_pred)
knn_recall = recall_score(y_test, knn_y_pred)
knn_f1 = f1_score(y_test, knn_y_pred)

nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_precision = precision_score(y_test, nb_y_pred)
nb_recall = recall_score(y_test, nb_y_pred)
nb_f1 = f1_score(y_test, nb_y_pred)

# In ra kết quả
print("Multilayer Perceptron:")
print("Accuracy:", mlp_accuracy)
print("Precision:", mlp_precision)
print("Recall:", mlp_recall)
print("F1 Score:", mlp_f1)
print()

print("Random Forest:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1)
print()

print("kNN:")
print("Accuracy:", knn_accuracy)
print("Precision:", knn_precision)
print("Recall:", knn_recall)
print("F1 Score:", knn_f1)
print()

print("Naive Bayes:")
print("Accuracy:", nb_accuracy)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1 Score:", nb_f1)



Multilayer Perceptron:
Accuracy: 0.9649122807017544
Precision: 0.958904109589041
Recall: 0.9859154929577465
F1 Score: 0.9722222222222222

Random Forest:
Accuracy: 0.9649122807017544
Precision: 0.958904109589041
Recall: 0.9859154929577465
F1 Score: 0.9722222222222222

kNN:
Accuracy: 0.956140350877193
Precision: 0.9342105263157895
Recall: 1.0
F1 Score: 0.9659863945578232

Naive Bayes:
Accuracy: 0.9736842105263158
Precision: 0.9594594594594594
Recall: 1.0
F1 Score: 0.9793103448275862


#Task 4. With **mobile price classification** dataset


*   4.1. Build your own Neural Network using **MultilayerPerceptron**  



In [17]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/Data_Labs_ML/Lab6'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Data_Labs_ML/Lab6


In [18]:
# Đọc dữ liệu từ cả hai tệp CSV
data = pd.read_csv("mobile.csv")

# Chia dữ liệu thành features (X) và labels (y)
X = data.drop('price_range', axis=1)
y = data['price_range']

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

data.tail()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0
1999,510,1,2.0,1,5,1,45,0.9,168,6,...,483,754,3919,19,4,2,1,1,1,3


In [19]:
# Xây dựng mạng Neural Network của riêng bạn với Multilayer Perceptron (MLP)
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,50), activation='relu', solver='adam', max_iter=100, random_state=42)
mlp_clf.fit(X_train, y_train)

# Đánh giá mô hình trên tập kiểm tra
accuracy = mlp_clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.58


*   4.2. Apply **GridSearchCV** to **MultilayperPerceptron** to find the best hyperparameters (the setting of hyperparameters chosen by students)

In [21]:
# Xây dựng mô hình Multilayer Perceptron
mlp_clf = MLPClassifier(max_iter=100)

# Thiết lập các siêu tham số để tìm kiếm
param_grid = {
    'hidden_layer_sizes': [(100,), (150,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate_init': [0.001, 0.01],
}

# Tạo GridSearchCV object
grid_search = GridSearchCV(estimator=mlp_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Tiến hành tìm kiếm siêu tham số tốt nhất
grid_search.fit(X, y)

# In ra các siêu tham số tốt nhất
print("Best hyperparameters:", grid_search.best_params_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best hyperparameters: {'activation': 'tanh', 'hidden_layer_sizes': (150,), 'learning_rate_init': 0.001, 'solver': 'adam'}




#Finally,
Save a copy in your Github. Remember renaming the notebook.