In [44]:
# import libary
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def apply_pipeline(steps, x_train, y_train, x_test, y_test, memory=None, verbose=False):
    pl = Pipeline(steps=steps, memory=memory, verbose=verbose)
    pl.fit(x_train, y_train)
    y_pred = pl.predict(x_test)
    accuracy = round(accuracy_score(y_test, y_pred), 2)
    precision = round(precision_score(y_test, y_pred, average='macro'), 2)
    recall = round(recall_score(y_test, y_pred, average='macro'), 2)
    f1 = round(f1_score(y_test, y_pred, average='macro'), 2)
    return accuracy, precision, recall, f1

In [3]:
def fine_tune_with_pipeline_classifier(pl, x_train, y_train, x_test, y_test, param_grid, average='macro',
                                       scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
                                       error_score=np.nan, return_train_score=False):
  grid_search = GridSearchCV(estimator=pl, param_grid=param_grid, scoring=scoring,
                             n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose,
                             pre_dispatch=pre_dispatch, error_score=error_score,return_train_score=return_train_score)
  grid_search.fit(x_train, y_train)
  y_pred = grid_search.predict(x_test)
  accuracy = round(accuracy_score(y_test, y_pred), 2)
  precision = round(precision_score(y_test, y_pred, average=average), 2)
  recall = round(recall_score(y_test, y_pred, average=average), 2)
  f1 = round(f1_score(y_test, y_pred, average=average), 2)
  return accuracy, precision, recall, f1

In [4]:
param_grids = {
    'RandomForest': {
        'pca__n_components': [2, 3],
        'classifier__n_estimators': [25, 50, 100, 150],
        'classifier__max_features': ['sqrt', 'log2', None],
        'classifier__max_depth': [3, 6, 9],
        'classifier__max_leaf_nodes': [3, 6, 9]
    },
    'LogisticRegression': {
        'pca__n_components': [2, 3],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['liblinear']
    },
    'KNN': {
        'pca__n_components': [2, 3],
        'classifier__n_neighbors': [3, 5, 7, 10],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan']
    },
    'SVM': {
        'pca__n_components': [2, 3],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf', 'poly'],
        'classifier__gamma': ['scale', 'auto']
    }
}

In [5]:
models = {
    'RandomForest': Pipeline([
        ('scl', StandardScaler(with_mean=False)),
        ('pca', PCA(n_components=3)),
        ('classifier', RandomForestClassifier(random_state=42))
    ]),

    'LogisticRegression': Pipeline([
        ('scl', StandardScaler(with_mean=False)),
        ('pca', PCA(n_components=3)),
        ('classifier', LogisticRegression(max_iter=5000, random_state=42))
    ]),

    'SVM': Pipeline([
        ('scl', StandardScaler(with_mean=False)),
        ('pca', PCA(n_components=3)),
        ('classifier', SVC(random_state=42))
    ]),

    'KNN': Pipeline([
        ('scl', StandardScaler(with_mean=False)),
        ('pca', PCA(n_components=3)),
        ('classifier', KNeighborsClassifier())
    ])
}


# TASK 1

In [6]:
iris = load_iris()

***1.1. Apply Pipeline including StandardScaler and PCA(3 principal components) to the iris dataset with the RandomForest classifier as a final estimator.***

In [7]:
steps = [('scl', StandardScaler()),
        ('pca', PCA(n_components=3)),
        ('clf', RandomForestClassifier(random_state=42))]

x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
accuracy, precision, recall, f1 = apply_pipeline(steps, x_train, y_train, x_test, y_test)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Accuracy: 0.97
Precision: 0.95
Recall: 0.97
F1: 0.96


***1.2. Find the best hyperparameters using GridSearchCV for algorithms (Logistic Regression, Random Forest, kNN, SVM) on the preprocessed dataset in the previous step. Then compare the performance of classifiers using accuracy precision, recall, and F1 measures.***

In [8]:
for model_name in models:
  print(f'Model: {model_name}')
  accuracy, precision, recall, f1 = fine_tune_with_pipeline_classifier(models[model_name], x_train, y_train, x_test, y_test, param_grids[model_name])
  print(f'Accuracy: {accuracy}')
  print(f'Precision: {precision}')
  print(f'Recall: {recall}')
  print(f'F1: {f1}')
  print()


Model: RandomForest
Accuracy: 0.93
Precision: 0.92
Recall: 0.95
F1: 0.92

Model: LogisticRegression
Accuracy: 0.9
Precision: 0.89
Recall: 0.92
F1: 0.89

Model: SVM
Accuracy: 0.97
Precision: 0.95
Recall: 0.97
F1: 0.96

Model: KNN
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0



# TASK 2

In [9]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd '/content/drive/MyDrive/Colab Notebooks/ML_2425/Lab 6_Classifier2'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/ML_2425/Lab 6_Classifier2


In [10]:
dataset_news = pd.read_csv('news.csv')
dataset_news.drop_duplicates(keep='first', inplace=True)
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

def preprosessing(txt):
  tokens = word_tokenize(txt)
  words = [lemma.lemmatize(token) for token in tokens if
token.isalpha() and token not in stop_words]
  return ' '.join(words)

txt = dataset_news['data'][0]
txt2 = preprosessing(txt)

dataset_news['data'] = dataset_news['data'].apply(preprosessing)

encoder = LabelEncoder()
dataset_news['labels'] = encoder.fit_transform(dataset_news['labels'])

x_train_news, x_test_news, y_train_news, y_test_news = train_test_split(dataset_news['data'], dataset_news['labels'], test_size=0.2, random_state=42)

vectorize = TfidfVectorizer()
x_train_cv = vectorize.fit_transform(x_train_news)
x_test_cv = vectorize.transform(x_test_news)

***2.1. Apply Pipeline including StandardScaler and RandomForest(select features based on ‘mean’ threshold) to the above dataset with the RandomForest classifier as an estimator.***

In [12]:
steps_news = [
    ('scl', StandardScaler(with_mean=False)),
    ('feature, selection', SelectFromModel(RandomForestClassifier(n_estimators=100))),
    ('classifier', RandomForestClassifier(random_state=42))
    ]

accuracy, precision, recall, f1 = apply_pipeline(steps_news, x_train_cv, y_train_news, x_test_cv, y_test_news)
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1: 0.98


***2.2. Find the best hyperparameters using GridSearchCV for algorithms (Logistic Regression, Random Forest, kNN, SVM)on the preprocessed dataset in the previous steps. Then compare the results using accuracy, precision, recall, and F1 measures.***

In [13]:
for model_name, pipeline in models.items():
  print(f'Model: {model_name}')
  accuracy, precision, recall, f1 = fine_tune_with_pipeline_classifier(pipeline, x_train_cv, y_train_news, x_test_cv, y_test_news, param_grids[model_name], error_score='raise')
  print(f'Accuracy: {accuracy}')
  print(f'Precision: {precision}')
  print(f'Recall: {recall}')
  print(f'F1: {f1}')
  print()

Model: RandomForest
Accuracy: 0.55
Precision: 0.56
Recall: 0.54
F1: 0.54

Model: LogisticRegression
Accuracy: 0.57
Precision: 0.56
Recall: 0.55
F1: 0.52

Model: SVM
Accuracy: 0.6
Precision: 0.59
Recall: 0.59
F1: 0.58

Model: KNN
Accuracy: 0.6
Precision: 0.61
Recall: 0.6
F1: 0.59



# TASK 3

In [14]:
dataset_bank = pd.read_csv('bank.csv')

***3.1. Apply Pipeline to StandardScaler() function to columns that contain numerical data ('age', 'balance', 'day', 'campaign', 'pdays', 'previous'); apply Encode Categorical Value (OneHotEncoder) to transform categorical data to numerical data ('job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome');***

In [15]:
numrical_features = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [33]:
num_pipeline = Pipeline([
    ('scl', StandardScaler())
])
cat_pipeline = Pipeline([
    ('ohe', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numrical_features),
    ('cat', cat_pipeline, categorical_features)
])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

x = dataset_bank.drop('deposit', axis=1)
y = dataset_bank['deposit']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train_processed = pipeline.fit_transform(x_train)
x_test_processed = pipeline.transform(x_test)

x_preprocessed = preprocessor.fit_transform(x)
print(x_train_processed.shape)

(8929, 50)


***3.2. Apply a selection feature technique on the dataset preprocessed in Task 3.1***

In [39]:
rf_bank = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bank.fit(x_train_processed, y_train)

selector_bank = SelectFromModel(rf_bank)
x_train_selected = selector_bank.fit_transform(x_train_processed, y_train)
x_test_selected = selector_bank.transform(x_test_processed)

print(f'Số lượng đặc trưng đã chọn: {x_train_selected.shape[1]}')

selected_columns = selector_bank.get_support(indices=True)
selected_features = (preprocessor.transformers_[0][1].get_feature_names_out(numrical_features).tolist() +
                     preprocessor.transformers_[1][1].named_steps['ohe'].get_feature_names_out(categorical_features).tolist())

selected_features = [selected_features[i] for i in selected_columns]

print("Các đặc trưng đã chọn:", selected_features)

Số lượng đặc trưng đã chọn: 9
Các đặc trưng đã chọn: ['age', 'balance', 'day', 'campaign', 'pdays', 'previous', 'contact_cellular', 'contact_unknown', 'poutcome_success']


***3.3. Then, compare the performance of classification algorithms (Logistic Regression Random forest, kNN, SVM) using accuracy, precision recall, and F1 measures.***

In [42]:
models_bank = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(random_state=42)
}

for name, model in models.items():
  print(f'Model: {name}')
  model.fit(x_train_selected, y_train)
  y_pred = model.predict(x_test_selected)
  print(f'Accuracy: {round(accuracy_score(y_test, y_pred), 2)}')
  print(f'Precision: {round(precision_score(y_test, y_pred, average="macro"), 2)}')
  print(f'Recall: {round(recall_score(y_test, y_pred, average="macro"), 2)}')
  print(f'F1: {round(f1_score(y_test, y_pred, average="macro"), 2)}')
  print()

Model: RandomForest
Accuracy: 0.64
Precision: 0.64
Recall: 0.64
F1: 0.64

Model: LogisticRegression
Accuracy: 0.63
Precision: 0.64
Recall: 0.62
F1: 0.61

Model: SVM
Accuracy: 0.66
Precision: 0.66
Recall: 0.65
F1: 0.65

Model: KNN
Accuracy: 0.63
Precision: 0.63
Recall: 0.63
F1: 0.63



***3.4. Find the best hyperparameters using GridSearchCV for algorithms (Logistic
Regression, Random Forest, kNN, SVM) on the preprocessed dataset in the previous steps.
Then compare the results using accuracy, precision, recall, and F1 measures.***

In [43]:
for model_name, pipeline in models.items():
  print(f'Model: {model_name}')
  accuracy, precision, recall, f1 = fine_tune_with_pipeline_classifier(pipeline, x_train_selected, y_train, x_test_selected, y_test, param_grids[model_name])
  print(f'Accuracy: {accuracy}')
  print(f'Precision: {precision}')
  print(f'Recall: {recall}')
  print(f'F1: {f1}')
  print()

Model: RandomForest
Accuracy: 0.65
Precision: 0.66
Recall: 0.66
F1: 0.65

Model: LogisticRegression
Accuracy: 0.63
Precision: 0.64
Recall: 0.62
F1: 0.61

Model: SVM
Accuracy: 0.66
Precision: 0.66
Recall: 0.65
F1: 0.65

Model: KNN
Accuracy: 0.64
Precision: 0.64
Recall: 0.63
F1: 0.62



***3.5. Plot ROC curve and report the AUC values for classification results on the test set in the
previous task.***

In [None]:
plt.figure(figsize=(10, 8))
auc_scores = {}