#Import libraries

In [53]:
from sklearn import datasets
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as m
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from prettytable import PrettyTable
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Mount Drive

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/Colab Notebooks/ML_2425/Lab 5_Classification1'

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/ML_2425/Lab 5_Classification1


#Dataset mobile (for task #2)

In [4]:
mobile_train = pd.read_csv('mobile_train.csv')
mobile_test = pd.read_csv('mobile_test.csv')
mobile_test.drop(columns=['id'], inplace=True)

mobile_X = mobile_train.drop('price_range', axis=1)
mobile_y = mobile_train['price_range']
features = mobile_X.columns

# Áp dụng lựa chọn đặc trưng cho tập mobile_train.

In [5]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(mobile_X, mobile_y)
print('Feature Importance: ', rf.feature_importances_)
#các thuộc tính có mức độ quan trọng từ giá trị trung bình của tất cả các mức độ quan trọng của các thuộc tính trở lên sẽ được giữ lại
selector = SelectFromModel(rf, threshold="mean", prefit=True)
X_new = selector.transform(mobile_X)
print('Selected Features: ', features[selector.get_support()])

Feature Importance:  [0.07460184 0.0063116  0.02606854 0.00644839 0.02427436 0.00623685
 0.03469866 0.02329284 0.03895964 0.02203571 0.02745886 0.05563956
 0.05744761 0.49416339 0.02654661 0.02789211 0.03034681 0.00470159
 0.00662019 0.00625484]
Selected Features:  Index(['battery_power', 'px_height', 'px_width', 'ram'], dtype='object')




# Sử dụng lại selector đã thực hiện với tập mobile_train để lấy các đặc trưng tương ứng cho tập mobile_test

In [None]:
#code
mobile_test_new = selector.transform(mobile_test.values)#chỉ lấy values, không lấy tên features

In [None]:
mobile_test_new.shape
#lấy đúng số lượng features đã được giữ lại ở tập mobile_train. Vì nếu không lấy đúng số features này thì mô hình đã train với
#các mẫu dữ liệu ở tập mobile_train sẽ không dự đoán y (output) cho các mẫu dữ liệu trong tập test

(1000, 4)

# TASK 1

***1.1. Train an SVM model to the above dataset using the linear kernel (70% for the training set and 30% for the test set).***

In [38]:
cancer = datasets.load_breast_cancer()
x, y = cancer.data, cancer.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [39]:
svm_model_cancer = svm.SVC(kernel='linear', C=1, random_state=42)
svm_model_cancer.fit(x_train, y_train)
y_pred = svm_model_cancer.predict(x_test)
print('Accuracy: ', m.accuracy_score(y_test, y_pred))
print('Classsification Report: \n', m.classification_report(y_test, y_pred))

Accuracy:  0.9649122807017544
Classsification Report: 
               precision    recall  f1-score   support

           0       0.97      0.94      0.95        63
           1       0.96      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



***1.2. Compare the performance of the SVM model with those of models trained by Logistic Regression, Decision Tree, and kNN algorithms based on metrics: accuracy, precision, recall, and f1 score.***

In [20]:
models = {
    'SVM': svm_model_cancer,
    'Logistic Regression': LogisticRegression(max_iter=10000, random_state=42),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=42),
    'kNN': KNeighborsClassifier()
}

results = {}

for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    results[name] = {
        'Accuracy': m.accuracy_score(y_test, y_pred),
        'Precision': m.precision_score(y_test, y_pred),
        'Recall': m.recall_score(y_test, y_pred),
        'F1 Score': m.f1_score(y_test, y_pred)
    }
results_dt = pd.DataFrame(results)
print(results_dt)

                SVM  Logistic Regression  Decision Tree       kNN
Accuracy   0.964912             0.976608       0.941520  0.959064
Precision  0.963636             0.981481       0.971154  0.946903
Recall     0.981481             0.981481       0.935185  0.990741
F1 Score   0.972477             0.981481       0.952830  0.968326


***1.3. Apply feature selection to select the best features for the output. Then, train the models using Logistic Regression, Decision Tree, and kNN algorithms. Compare the performance of these models based on metrics: accuracy, precision, recall, and f1 score.***

In [36]:
selctor_cancer = SelectKBest(score_func=f_classif, k=10)
x_new = selctor_cancer.fit_transform(x, y)

x_train_fs, x_test_fs, y_train_fs, y_test_fs = train_test_split(x_new, y, test_size=0.3, random_state=42)

scaler_fs = StandardScaler()
x_train_fs_scaled = scaler_fs.fit_transform(x_train_fs)
x_test_fs_scaled = scaler_fs.transform(x_test_fs)

models_fs = {
    'Logistic Regression (FS)': LogisticRegression(max_iter=10000, random_state=42),
    'Decision Tree (FS)': tree.DecisionTreeClassifier(random_state=42),
    'kNN (FS)': KNeighborsClassifier()
}

results_fs = {}

for name, model in models_fs.items():
  if 'Logistic' in name or 'kNN' in name:
    model.fit(x_train_fs_scaled, y_train_fs)
    y_pred = model.predict(x_test_fs_scaled)
  else:
    model.fit(x_train_fs, y_train_fs)
    y_pred = model.predict(x_test_fs)
  results_fs[name] = {
      'Accuracy': m.accuracy_score(y_test_fs, y_pred),
      'Precision': m.precision_score(y_test_fs, y_pred),
      'Recall': m.recall_score(y_test_fs, y_pred),
      'F1 Score': m.f1_score(y_test_fs, y_pred)
    }

results_fs_dt = pd.DataFrame(results_fs)
print(results_fs_dt)

           Logistic Regression (FS)  Decision Tree (FS)  kNN (FS)
Accuracy                   0.964912            0.941520  0.947368
Precision                  0.981132            0.953704  0.962617
Recall                     0.962963            0.953704  0.953704
F1 Score                   0.971963            0.953704  0.958140


# TASK 2

***2.1. Evaluate the performance of SVM (using different kernels, including Linear Kernel,Polynomial Kernel, Sigmoid Kernel, and Radial Basis Function Kernel) with mobile price classification based on accuracy, precision, recall, and f1 score.***

In [46]:
x_train_mobile, x_test_mobile, y_train_mobile, y_test_mobile = train_test_split(mobile_X, mobile_y, test_size=0.3, random_state=42)

scaler_mobile = StandardScaler()
x_train_mobile_scaled = scaler_mobile.fit_transform(x_train_mobile)
x_test_mobile_scaled = scaler_mobile.transform(x_test_mobile)

kenerls = ['linear', 'poly', 'sigmoid', 'rbf']
for kernel in kenerls:
  print(f"\nĐánh giá SVM với kernel: {kernel.upper()}")
  model_mobile = svm.SVC(kernel=kernel, C=1, random_state=42)
  model_mobile.fit(x_train_mobile_scaled, y_train_mobile)
  y_pred_mobile = model_mobile.predict(x_test_mobile_scaled)

  accuracy = m.accuracy_score(y_test_mobile, y_pred_mobile)
  precision = m.precision_score(y_test_mobile, y_pred_mobile, average='weighted')
  recall = m.recall_score(y_test_mobile, y_pred_mobile, average='weighted')
  f1 = m.f1_score(y_test_mobile, y_pred_mobile, average='weighted')

  print('Accuracy: ', accuracy)
  print('Precision: ', precision)
  print('Recall: ', recall)
  print('F1 Score: ', f1)


Đánh giá SVM với kernel: LINEAR
Accuracy:  0.9533333333333334
Precision:  0.955198925268421
Recall:  0.9533333333333334
F1 Score:  0.9534558763219786

Đánh giá SVM với kernel: POLY
Accuracy:  0.7833333333333333
Precision:  0.7969143041935092
Recall:  0.7833333333333333
F1 Score:  0.7875706979000574

Đánh giá SVM với kernel: SIGMOID
Accuracy:  0.9116666666666666
Precision:  0.9132463699119626
Recall:  0.9116666666666666
F1 Score:  0.9120165515686817

Đánh giá SVM với kernel: RBF
Accuracy:  0.87
Precision:  0.8726288634654535
Recall:  0.87
F1 Score:  0.8708643354962748


***2.2. Apply feature selection to select the best features for the output. Then, train the models using Logistic Regression, Decision Tree, and kNN algorithms. Compare the performance of these models based on metrics: accuracy, precision, recall, and f1 score.***

In [47]:
selector_mobile = SelectKBest(score_func=f_classif, k=10)
selector_mobile.fit(mobile_X, mobile_y)

scaler_fs_mobile = StandardScaler()
x_train_fs_mobile_scaled = scaler_fs_mobile.fit_transform(x_train_mobile)
x_test_fs_mobile_scaled = scaler_fs_mobile.transform(x_test_mobile)

results_fs_mobile = {}

for name, model in models_fs.items():
  model.fit(x_train_fs_mobile_scaled, y_train_mobile)
  y_pred = model.predict(x_test_fs_mobile_scaled)

  accuracy = m.accuracy_score(y_test_mobile, y_pred)
  precision, recall, f1, _ = m.precision_recall_fscore_support(y_test_mobile, y_pred, average='weighted')
  results_fs_mobile[name] = {
      'Accuracy': accuracy,
      'Precision': precision,
      'Recall': recall,
      'F1 Score': f1
    }

results_fs_df_mobile = pd.DataFrame(results_fs_mobile)
print(results_fs_df_mobile)

           Logistic Regression (FS)  Decision Tree (FS)  kNN (FS)
Accuracy                   0.958333            0.820000  0.513333
Precision                  0.959738            0.822563  0.539994
Recall                     0.958333            0.820000  0.513333
F1 Score                   0.958328            0.820831  0.516935


# TASK 3

3.1. Apply the following preprocessing techniques to features:
− Rescale numerical features ('duration', 'credit_amount', 'installment_commitment',
'residence_since', 'age', 'existing_credits', 'num_dependents') using StandardScaler
− Encode all categorical features ('credit_history', 'purpose', 'personal_status',
'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job',
'own_telephone', 'foreign_worker') using OneHotEncoder
− Encode all ordinal features ('checking_status', 'savings_status', 'employment') using
OrdinalEncoder

In [48]:
df_credit = pd.read_csv('credit.csv')
df_credit.head()

Unnamed: 0.1,Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [54]:
numerical_features = ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_features = ['credit_history', 'purpose', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

ordinal_features = ['checking_status', 'savings_status', 'employment']
ordinal_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('ord', ordinal_transformer, ordinal_features)
    ])

x_preprocessed = preprocessor.fit_transform(df_credit)

print(x_preprocessed)

[[-1.23647786 -0.74513141  0.91847717 ...  1.          4.
   3.        ]
 [ 2.24819436  0.94981679 -0.87018333 ...  0.          2.
   0.        ]
 [-0.73866754 -0.41656241 -0.87018333 ...  3.          2.
   1.        ]
 ...
 [-0.73866754 -0.87450324  0.91847717 ...  3.          2.
   3.        ]
 [ 1.9992892  -0.50552769  0.91847717 ...  1.          2.
   0.        ]
 [ 1.9992892   0.46245715  0.02414692 ...  0.          0.
   4.        ]]


***3.2. Compare the performance of the SVM model with those of models trained by Logistic Regression, Decision Tree, and kNN algorithms based on metrics: accuracy, precision, recall, and f1 score.***

In [58]:
x = df_credit.drop('class', axis=1)
y = df_credit['class']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

models = {
    'SVM': svm.SVC(),
    'Logistic Regression': LogisticRegression(max_iter=10000, random_state=42),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=42),
    'kNN': KNeighborsClassifier()
}

results_credit = {}

for name, model in models.items():
  clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
  clf.fit(x_train, y_train)
  y_pred = clf.predict(x_test)
  results_credit[name] = {
      'Accuracy': m.accuracy_score(y_test, y_pred),
      'Precision': m.precision_score(y_test, y_pred, average='weighted'),
      'Recall': m.recall_score(y_test, y_pred, average='weighted'),
      'F1 Score': m.f1_score(y_test, y_pred, average='weighted')
  }

results_credit_df = pd.DataFrame(results_credit)
print(results_credit_df)

                SVM  Logistic Regression  Decision Tree       kNN
Accuracy   0.753333             0.736667       0.683333  0.726667
Precision  0.746284             0.718087       0.678506  0.712406
Recall     0.753333             0.736667       0.683333  0.726667
F1 Score   0.717645             0.715490       0.680732  0.716035


***3.3. Apply feature selection to select the best features for the output. Then, train the models using Logistic Regression, Decision Tree, and kNN algorithms. Compare the performance of these models based on metrics: accuracy, precision, recall, and f1 score.***

In [60]:
for name, model in models.items():
  clf = Pipeline(steps=[('preprocessor', preprocessor), ('feature_selection', SelectKBest(score_func=f_classif, k=10)), ('classifier', model)])
  clf.fit(x_train, y_train)
  y_pred = clf.predict(x_test)
  accuracy = m.accuracy_score(y_test, y_pred)
  precision, recall, f1, _ = m.precision_recall_fscore_support(y_test, y_pred, average='weighted')

  print(f"\nKết quả mô hình (với Feature Selection): {name}")
  print('Accuracy: ', accuracy)
  print('Precision: ', precision)
  print('Recall: ', recall)
  print('F1 Score: ', f1)


Kết quả mô hình (với Feature Selection): SVM
Accuracy:  0.7233333333333334
Precision:  0.6990051679586563
Recall:  0.7233333333333334
F1 Score:  0.6868830534580562

Kết quả mô hình (với Feature Selection): Logistic Regression
Accuracy:  0.7133333333333334
Precision:  0.68392064538406
Recall:  0.7133333333333334
F1 Score:  0.6743537943537944

Kết quả mô hình (với Feature Selection): Decision Tree
Accuracy:  0.68
Precision:  0.6666612311455361
Recall:  0.68
F1 Score:  0.6718518518518519

Kết quả mô hình (với Feature Selection): kNN
Accuracy:  0.7066666666666667
Precision:  0.6814323875532453
Recall:  0.7066666666666667
F1 Score:  0.6840054213505542
