In [1]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Load dataset
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Bagi data menjadi data dilabeli dan tidak dilabeli
X_labeled, X_unlabeled, y_labeled, _ = train_test_split(X, y, test_size=0.9, random_state=42)


In [4]:
# Inisialisasi model SVM
classifier = SVC(kernel='linear')

# Latih model awal dengan data dilabeli
classifier.fit(X_labeled, y_labeled)

# Prediksi kelas untuk data tidak dilabeli
predicted_labels = classifier.predict(X_unlabeled)

# Tambahkan data yang diprediksi dengan tingkat kepercayaan tertentu ke dalam data dilabeli
threshold = 0.5
X_labeled = np.vstack([X_labeled, X_unlabeled])
y_labeled = np.hstack([y_labeled, predicted_labels])

# Latih ulang model dengan data baru
classifier.fit(X_labeled, y_labeled)

# Evaluasi model
X_test, y_test = diabetes.data, diabetes.target
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.006787330316742082


In [5]:
y_test

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [6]:
y_pred

array([113., 134., 113., 134., 134., 134., 134., 113., 113., 134., 134.,
       113., 134., 113., 134., 200., 113., 113., 134., 134., 134., 134.,
       134., 113., 134., 134., 134., 134., 134., 113., 134., 134., 113.,
       134., 134., 134., 113., 134., 113., 134., 113., 134., 134., 134.,
       113., 113., 134., 134., 134., 113., 113., 113., 134., 113., 134.,
       134., 134., 134., 134., 113., 134., 113., 134., 134., 134., 113.,
       113., 113., 134., 134., 134., 113., 113., 113., 113., 113., 113.,
       134., 134., 134., 113., 113., 134., 134., 134., 134., 134., 134.,
       134., 134., 134., 113., 113., 134., 134., 134., 113., 113., 134.,
       134., 113., 113., 134., 113., 134., 134., 134., 113., 113., 113.,
       134., 134., 134., 113., 113., 200., 113., 113., 113., 134., 134.,
       113., 113., 200., 134., 113., 134., 134., 134., 113., 113., 134.,
       113., 134., 134., 113., 134., 113., 113., 113., 113., 200., 113.,
       134., 113., 113., 113., 113., 134., 113., 11

In [8]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import clone
import numpy as np

# Load dataset
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Bagi data menjadi data dilabeli dan tidak dilabeli
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X, y, test_size=0.9, random_state=42)

# Inisialisasi model Random Forest
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fungsi untuk memperbarui dataset dilabeli dengan memilih sampel yang paling tidak pasti (Uncertainty Sampling)
def query_instances(model, X_pool, n_instances=10):
    uncertainty = -model.predict_proba(X_pool).max(axis=1)
    idx = np.argsort(uncertainty)[:n_instances]
    return X_pool[idx], idx

# Fungsi untuk menambahkan sampel baru ke dataset dilabeli
def update_labeled_dataset(X_pool, y_pool, X_new, y_new, idx):
    X_pool = np.delete(X_pool, idx, axis=0)
    y_pool = np.delete(y_pool, idx)
    X_labeled = np.vstack([X_new, X_pool])
    y_labeled = np.hstack([y_new, y_pool])
    return X_labeled, y_labeled

# Latih model awal dengan data dilabeli
classifier.fit(X_labeled, y_labeled)

# Evaluasi model awal
X_test, y_test = diabetes.data, diabetes.target
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Initial model accuracy:", accuracy)

# Iterasi active learning
n_iterations = 5
for i in range(n_iterations):
    # Prediksi kelas untuk data tidak dilabeli dan pilih sampel yang paling tidak pasti
    X_pool, idx = query_instances(classifier, X_unlabeled)
    y_pool = classifier.predict(X_pool)

    # Tambahkan sampel yang dipilih ke dalam dataset dilabeli
    X_labeled, y_labeled = update_labeled_dataset(X_unlabeled, y_unlabeled, X_pool, y_pool, idx)

    # Latih ulang model dengan data baru
    classifier.fit(X_labeled, y_labeled)

    # Evaluasi model
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Iteration {i+1} accuracy:", accuracy)


Initial model accuracy: 0.09954751131221719
Iteration 1 accuracy: 0.8823529411764706
Iteration 2 accuracy: 0.9004524886877828
Iteration 3 accuracy: 0.9004524886877828
Iteration 4 accuracy: 0.8981900452488688
Iteration 5 accuracy: 0.9004524886877828


In [10]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into labeled and unlabeled sets
X_labeled, X_unlabeled, y_labeled, _ = train_test_split(X, y, test_size=0.8, random_state=42)

# Scale features
scaler = StandardScaler()
X_labeled = scaler.fit_transform(X_labeled)
X_unlabeled = scaler.transform(X_unlabeled)

# Initialize base classifier (k-NN)
base_classifier = KNeighborsClassifier(n_neighbors=3)

# Initialize self-training classifier
self_training_clf = SelfTrainingClassifier(base_classifier)

# Fit the self-training classifier on labeled and unlabeled data
self_training_clf.fit(X_labeled, y_labeled)

# Predict on the unlabeled data
pseudo_labels = self_training_clf.predict(X_unlabeled)

# Incorporate pseudo-labels into labeled set
X_pseudo_labeled = X_unlabeled
y_pseudo_labeled = pseudo_labels

# Retrain the classifier on the combined labeled data
X_combined = np.vstack((X_labeled, X_pseudo_labeled))
y_combined = np.concatenate((y_labeled, y_pseudo_labeled))
self_training_clf.fit(X_combined, y_combined)

# Evaluate the classifier
y_pred = self_training_clf.predict(X)
accuracy = accuracy_score(y, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.3333333333333333




### sas

In [38]:
from sklearn import metrics,datasets,semi_supervised,svm
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into labeled and unlabeled sets
X_labeled, X_unlabeled, y_labeled,y_hide = train_test_split(X, y, test_size=0.5, random_state=42)


In [46]:
y_hh=np.concatenate([y_labeled,y_hide])
y_hh

array([1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0,
       0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2, 1,
       1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1,
       1, 2, 2, 0, 1, 2, 0, 1, 2, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0,
       0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0,
       2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0,
       0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2, 1])

In [40]:
y_hh=np.concatenate([y_labeled,y_hide])
y_hh
X_new=np.concatenate([X_labeled,X_unlabeled])
y_new=np.concatenate([y_labeled,y_unlabeled])
y_new

array([ 1,  2,  1,  0,  1,  2,  0,  0,  1,  1,  0,  2,  0,  0,  1,  1,  2,
        1,  2,  2,  1,  0,  0,  2,  2,  0,  0,  0,  1,  2,  0,  2,  2,  0,
        1,  1,  2,  1,  2,  0,  2,  1,  2,  1,  1,  1,  0,  1,  1,  0,  1,
        2,  2,  0,  1,  2,  2,  0,  2,  0,  1,  2,  2,  1,  2,  1,  1,  2,
        2,  0,  1,  2,  0,  1,  2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [45]:
svc=SVC(probability=True)
lbl=semi_supervised.SelfTrainingClassifier(svc)
lbl.fit(X_new,y_new)
lbl.predict(X_new)


lbl=semi_supervised.LabelPropagation()
lbl.fit(X_new,y_new)
lbl.predict(X_new)

lbl=semi_supervised.LabelSpreading()
lbl.fit(X_new,y_new)
lbl.predict(X_new)

array([1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 2, 2, 2, 1, 0,
       0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 1, 1, 2, 0, 2, 1, 2, 1,
       1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1,
       1, 2, 2, 0, 1, 1, 0, 1, 2, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0,
       0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0,
       2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0,
       0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1])

In [47]:
metrics.accuracy_score(lbl.predict(X_new),y_hh)

0.9666666666666667

In [49]:
lbl=semi_supervised.LabelPropagation()
lbl.fit(X_new,y_new)
lbl.predict(X_new)

lbl=semi_supervised.LabelSpreading()
lbl.fit(X_new,y_new)
lbl.predict(X_new)

array([1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0,
       0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2, 1,
       1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1,
       1, 2, 2, 0, 1, 2, 0, 1, 2, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0,
       0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0,
       2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0,
       0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2, 1])

In [50]:
metrics.accuracy_score(lbl.predict(X_new),y_hh)

0.9933333333333333

In [51]:
lbl=semi_supervised.LabelSpreading()
lbl.fit(X_new,y_new)
lbl.predict(X_new)

array([1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0,
       0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2, 1,
       1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1,
       1, 2, 2, 0, 1, 2, 0, 1, 2, 1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0,
       0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0,
       2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0,
       0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1])

In [52]:
metrics.accuracy_score(lbl.predict(X_new),y_hh)

0.9866666666666667

In [55]:
from sklearn import metrics,datasets,semi_supervised,svm
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the data into labeled and unlabeled sets
X_labeled, X_unlabeled, y_labeled,y_hide = train_test_split(X, y, test_size=0.5, random_state=42)



y_hh=np.concatenate([y_labeled,y_hide])


X_new=np.concatenate([X_labeled,X_unlabeled])
y_new=np.concatenate([y_labeled,y_unlabeled])




# Define pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SelfTrainingClassifier(SVC(probability=True)))
])

# Define parameter grid for SVC
param_grid = {'clf__base_estimator__C': [0.1, 1, 10], 'clf__base_estimator__gamma': [0.1, 0.01, 0.001]}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_labeled, y_labeled)




Pipeline(steps=[('scaler', StandardScaler()),
                ('clf',
                 SelfTrainingClassifier(base_estimator=SVC(C=10, gamma=0.1,
                                                           probability=True)))])

In [56]:
# Predict on the unlabeled data
pseudo_labels = pipeline.predict(X_unlabeled)

# Incorporate pseudo-labels into labeled set
X_pseudo_labeled = X_unlabeled
y_pseudo_labeled = pseudo_labels

# Retrain the classifier on the combined labeled data
X_combined = np.vstack((X_labeled, X_pseudo_labeled))
y_combined = np.concatenate((y_labeled, y_pseudo_labeled))
pipeline.fit(X_combined, y_combined)




Pipeline(steps=[('scaler', StandardScaler()),
                ('clf',
                 SelfTrainingClassifier(base_estimator=SVC(C=10, gamma=0.1,
                                                           probability=True)))])

In [54]:
# Evaluate the classifier
y_pred = pipeline.predict(X)
accuracy = accuracy_score(y, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9733333333333334
