In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# Load the dataset.
# The attributes read below (`.data`, `.feature_names`, `.target`) belong to the
# Bunch object returned by `load_breast_cancer()`; a DataFrame produced by
# `pd.read_csv` does not expose them, so the loader imported at the top of the
# file is used here.
data = load_breast_cancer()

# Keep only the first 200 samples, with named feature columns.
X = pd.DataFrame(data.data, columns=data.feature_names).head(200)
y = pd.Series(data.target).head(200)

# Outlier treatment: log(1 + x) compresses large values so extreme
# measurements weigh less. Applied element-wise, so no information leaks
# between rows. Assumes every feature value is > -1 (log1p is undefined below).
X = np.log1p(X)

# Pipeline whose hyperparameters are tuned with GridSearchCV.
# Steps run in order: standardise the features, then fit the random forest.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed so repeated runs give the same forest; matches the seed
    # used for the train/test split.
    ('rf', RandomForestClassifier(random_state=42)),
])

# Search space for the 'rf' step (keys use the `<step>__<param>` convention).
# Each value appears once: a duplicated entry only makes the grid search fit
# the same candidate twice.
params = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [3, 5, 10, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
}

# Hold out 20% of the rows for the final evaluation (seeded for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Exhaustive search over `params`, scored with 5-fold cross-validation
# on the training rows only.
grid_search = GridSearchCV(estimator=pipeline, param_grid=params, cv=5)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
print(grid_search.best_params_)

# Score the selected model on the held-out rows.
accuracy = grid_search.score(X_test, y_test)
print(f"Test set accuracy: {accuracy:.3f}")


{'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 50}
Test set accuracy: 0.925
