## This experiment simulates semi-supervised learning (self-training): starting from a limited set of labeled data, a classifier iteratively labels the remaining samples that it can predict with high confidence.

In [96]:
import numpy as np
import pandas as pd 
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [97]:
# Generate synthetic data
# We create a dataset with 1000 samples, 10 features, and 4 classes for multi-class classification.
# n_informative=4 means 4 of the 10 features carry independent signal; with the library defaults,
# 2 more are redundant (linear combinations of the informative ones) and the rest are noise.
# random_state is fixed so that the dataset, and therefore every downstream result, is
# reproducible under Restart Kernel -> Run All.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=4,
    n_informative=4,
    random_state=42,
)

In [98]:
# Hold out a test set
# 80% of the samples go to the training pool used by the semi-supervised procedure;
# the remaining 20% are kept aside to score the final model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [99]:
# Carve the training pool into a labeled part and an "unlabeled" part
# This mimics a setting in which only part of the data comes with labels:
# rows before position 400 keep their labels, rows from position 400 onward are treated as unlabeled.
X_label = X_train[:400]
y_label = y_train[:400]

# The true labels of the unlabeled rows are kept in y_unlabel for evaluation only;
# they are not passed to the model during training.
X_unlabel = X_train[400:]
y_unlabel = y_train[400:]

In [100]:
# Baseline multi-class classifier.
# The misspelled `num_classes` keyword was removed: it is not an XGBoost parameter (the native
# parameter is `num_class`), and the scikit-learn wrapper infers the class count from the labels
# passed to fit(). "multi:softprob" is the objective that yields per-class probabilities.
xgb = XGBClassifier(objective="multi:softprob")

In [101]:
# Wrap every array in a pandas DataFrame
# DataFrames make it straightforward to concatenate newly labeled rows and to drop rows
# from the unlabeled pool in the label propagation step.
X_label = pd.DataFrame(X_label)
y_label = pd.DataFrame(y_label)
X_unlabel = pd.DataFrame(X_unlabel)
y_unlabel = pd.DataFrame(y_unlabel)

## The iterative loop below implements self-training: the model is trained on the labeled data and gradually assigns labels to the unlabeled samples it predicts with high confidence.

In [91]:
# Self-training loop: fit on the labeled pool, pseudo-label the unlabeled samples the model
# is confident about, move them into the labeled pool, and repeat.

# Minimum top-class probability a prediction needs before it is accepted as a pseudo-label.
CONFIDENCE_THRESHOLD = 0.90

# Stop as soon as the unlabeled pool is exhausted: there is nothing left to label, and
# predicting on a zero-row frame is at best wasted work.
while len(X_unlabel) > 0:
    # `num_classes` was not a valid XGBoost parameter; the scikit-learn wrapper infers the
    # number of classes from the labels. "multi:softprob" yields per-class probabilities.
    model = XGBClassifier(objective="multi:softprob", random_state=42)
    model.fit(X_label, y_label.values.ravel())

    # X_unlabel has no labels, so we ask for per-class probabilities and use them to
    # identify which samples the model is most confident about.
    y_pred_probs = model.predict_proba(X_unlabel)

    # Keep only the rows whose highest class probability exceeds the threshold.
    # Uncertain rows stay in the unlabeled pool so they do not add label noise.
    confident_indexes = np.where(y_pred_probs.max(axis=1) > CONFIDENCE_THRESHOLD)[0]

    if not confident_indexes.size:
        break  # Stop if no confident predictions

    # argmax gives a column position in the probability matrix; map it back to the actual
    # class label through model.classes_ rather than assuming labels are 0..n-1.
    pseudo_labels = model.classes_[y_pred_probs[confident_indexes].argmax(axis=1)]

    # Append high-confidence samples to the labeled dataset. ignore_index keeps the row
    # indexes of X_label and y_label unique and aligned with each other.
    X_label = pd.concat([X_label, X_unlabel.iloc[confident_indexes]], ignore_index=True)
    y_label = pd.concat([y_label, pd.DataFrame(pseudo_labels)], ignore_index=True)

    # Remove the consumed rows from the unlabeled pool. Positions are translated to index
    # labels explicitly, and the index is reset so positions and labels coincide next round.
    X_unlabel = X_unlabel.drop(index=X_unlabel.index[confident_indexes]).reset_index(drop=True)

## Logic of the loop above
1. Train the model on the currently labeled dataset (`X_label`, `y_label`).
2. Predict class probabilities for all remaining unlabeled samples (`X_unlabel`).
3. Find the samples for which the model is highly confident (top-class probability > 90%).
4. Add these samples, with their predicted labels, to the labeled dataset (`X_label`, `y_label`).
5. Remove these samples from the unlabeled dataset (`X_unlabel`).
6. Repeat the process until no high-confidence samples remain.

In [92]:
# Train the final model on the full labeled pool (original labels plus accepted pseudo-labels).
# The misspelled `num_classes` keyword was removed: it is not an XGBoost parameter, and the
# scikit-learn wrapper infers the class count from the labels. "multi:softprob" is the
# probability-producing multi-class objective; predict() still returns the most likely class.
final_model = XGBClassifier(objective="multi:softprob", random_state=42)
final_model.fit(X_label, y_label.values.ravel())

In [93]:
# Score the final model on the held-out test set and report its accuracy.
y_pred_final = final_model.predict(X_test)
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")

Final Accuracy: 0.7300


In [89]:
# Size of the labeled pool after self-training (initial labels plus accepted pseudo-labels);
# the row counts of the feature frame and the label frame should match.
X_label.shape, y_label.shape

((730, 10), (730, 1))