In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease


In [None]:
# Load the breast-cancer dataset from the project's data folder.
df = pd.read_csv('data/data/breast_cancer.csv')

# Preview only the first rows instead of rendering the entire frame.
df.head()

In [None]:
# Separate the label column from the feature columns.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Display the shape of every split as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Visualisasi bagaimana PCA dapat membantu kita

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Project the features onto two principal components so the classes can be
# inspected visually.
# whiten=True rescales the projected components to unit variance; it does not
# scale the input features before the decomposition.
pca = PCA(n_components=2, whiten=True)
X_train_pca = pca.fit_transform(X_train)  # fit the projection on the training split only
X_test_pca = pca.transform(X_test)  # reuse the fitted projection for the test split

# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments and raises a TypeError.
ax = sns.scatterplot(x=X_train_pca[:, 0], y=X_train_pca[:, 1], hue=y_train)
ax.set(
    title="Training data projected onto the first two principal components",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
);


# Training

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp
from jcopml.tuning.space import Real, Integer

In [None]:


# Preprocessing: route every feature column through the numeric pipeline.
# NOTE(review): num_pipe() is called with its defaults; what those defaults do
# (imputation, scaling, ...) is not visible in this notebook. Confirm, because
# PCA and SVC are both sensitive to feature scale.
preprocessor = ColumnTransformer(
    transformers=[("numeric", num_pipe(), X_train.columns)]
)

# Full model: preprocessing -> PCA -> SVC, with the SVC solver capped at
# 500 iterations.
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("pca", PCA()),
        ("algo", SVC(max_iter=500)),
    ]
)

# Search space for the randomized search; keys follow the "<step>__<param>"
# convention of the pipeline above.
# NOTE(review): with prior="log-uniform" the Real bounds are presumably
# exponents (i.e. 10^-3 .. 10^3); verify against the jcopml documentation.
parameter = {
    "pca__n_components": Integer(1, 30),
    "pca__whiten": [True, False],
    "algo__gamma": Real(low=-3, high=3, prior="log-uniform"),
    "algo__C": Real(low=-3, high=3, prior="log-uniform"),
}

# 50 sampled candidates, 3-fold cross-validation, all CPU cores, fixed seed.
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

# Report the winning configuration, then the training score, the best
# cross-validation score and the test score (in that order).
print(model.best_params_)
print(
    model.score(X_train, y_train),
    model.best_score_,
    model.score(X_test, y_test),
)
