<a href="https://colab.research.google.com/github/Jhansipothabattula/Machine_Learning/blob/main/Day99.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Self-Training

**Self-Training**

Self-Training is a semi-supervised learning approach that leverages a small labeled dataset alongside a larger unlabeled dataset. The model is initially trained on labeled data, and then it makes predictions on the unlabeled data. The confident predictions (those with high certainty) are then added to the labeled dataset, and the process is repeated to improve the mode

In [5]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Generate a synthetic dataset
X, y = make_classification(n_samples=200, n_features=5, random_state=42)
X_labeled, X_unlabeled, y_labeled, _ = train_test_split(X, y, test_size=0.7, random_state=42)

# Initialize and train the model with labeled data
model = RandomForestClassifier(random_state = 42)
model.fit(X_labeled, y_labeled)

# Perform self-training on unlabeled data
for i in range(5):
  if X_unlabeled.shape[0] == 0:
      print(f"Iteration {i+1}: No unlabeled data left to process. Exiting self-training loop.")
      break

  probs = model.predict_proba(X_unlabeled)
  high_confidence_idx = np.where(np.max(probs, axis=1) > 0.9)[0]

  # Only proceed if there are high-confidence predictions
  if len(high_confidence_idx) > 0:
    # Add High-confidence predictions to labeled data
    X_labeled = np.vstack((X_labeled, X_unlabeled[high_confidence_idx]))
    y_labeled = np.hstack([y_labeled, model.predict(X_unlabeled[high_confidence_idx])])

    # Remove confident samples from the unlabeled dataset
    X_unlabeled = np.delete(X_unlabeled, high_confidence_idx, axis=0)

    # Re-train the model on the expanded labeled dataset
    model.fit(X_labeled, y_labeled)
    print(f"Iteration {i+1}: Added {len(high_confidence_idx)} high-confidence samples. Labeled data size: {len(y_labeled)}")
  else:
    print(f"Iteration {i+1}: No high-confidence samples found. Continuing...")

# Final Evaluation on a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: \n", accuracy)


Iteration 1: Added 87 high-confidence samples. Labeled data size: 147
Iteration 2: Added 16 high-confidence samples. Labeled data size: 163
Iteration 3: Added 8 high-confidence samples. Labeled data size: 171
Iteration 4: Added 2 high-confidence samples. Labeled data size: 173
Iteration 5: No high-confidence samples found. Continuing...
Accuracy: 
 0.875
