In [23]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [24]:
# Generate a synthetic dataset with 1000 samples and 20 features
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

In [25]:
# Split the dataset into labeled and unlabeled data
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(X, y, test_size=0.9, random_state=42)


In [26]:
# Initialize K-Means clustering with 2 clusters
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X_unlabeled)



In [27]:
# Get the cluster assignments for the unlabeled data
cluster_labels = kmeans.predict(X_unlabeled)

In [28]:
# Determine the majority class for each cluster
cluster_majority_classes = []
for cluster in range(10):
    cluster_indices = np.where(cluster_labels == cluster)[0]
    cluster_classes = y_unlabeled[cluster_indices]
    majority_class = np.argmax(np.bincount(cluster_classes))
    cluster_majority_classes.append(majority_class)

In [29]:
# Identify the indices of samples in clusters with the majority class
majority_indices_unlabeled = []
for cluster, majority_class in enumerate(cluster_majority_classes):
    cluster_indices = np.where(cluster_labels == cluster)[0]
    majority_indices_cluster = np.where(y_unlabeled[cluster_indices] == majority_class)[0]
    majority_indices_unlabeled.extend(cluster_indices[majority_indices_cluster])


In [30]:
# Use the indices to extract the corresponding samples and their labels from the unlabeled data
X_majority = X_unlabeled[majority_indices_unlabeled]
y_majority = y_unlabeled[majority_indices_unlabeled]


In [31]:
# Train a supervised SVM classifier on the labeled data
classifier = SVC(kernel='linear', random_state=42)
classifier.fit(X_labeled, y_labeled)

In [32]:
# Evaluate the classifier on the labeled data
y_pred_labeled = classifier.predict(X_labeled)
accuracy_labeled = accuracy_score(y_labeled, y_pred_labeled)
print(f"Accuracy on labeled data: {accuracy_labeled}")

Accuracy on labeled data: 0.94


In [33]:
# Train a supervised SVM classifier on the majority cluster samples
classifier.fit(X_majority, y_majority)

In [34]:
# Predict labels for all unlabeled data using the trained classifier
y_pred_unlabeled = classifier.predict(X_unlabeled)

In [35]:
# Add confident predictions from the unlabeled data to the labeled data
X_labeled = np.vstack((X_labeled, X_unlabeled))
y_labeled = np.hstack((y_labeled, y_pred_unlabeled))

In [36]:
# Retrain the classifier on the updated labeled data
classifier.fit(X_labeled, y_labeled)

In [37]:
# Evaluate the classifier on the updated labeled data
y_pred_final = classifier.predict(X_labeled)
accuracy_final = accuracy_score(y_labeled, y_pred_final)
print(f"Accuracy after self-training: {accuracy_final}")

Accuracy after self-training: 0.977
