In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# ---- Data loading -----------------------------------------------------------
df = pd.read_csv('/content/Mall_Customers.csv')

# ---- Preprocessing ----------------------------------------------------------
# Replace the Gender column with the integer codes produced by LabelEncoder.
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

# Columns used as model inputs. The dataset has no target column, so one is
# synthesised below to simulate a semi-supervised setting.
feature_columns = [
    'Gender',
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
]
features = df[feature_columns]

# ---- Pseudo-labels ----------------------------------------------------------
# A 2-cluster KMeans assignment stands in for the "true" class of each row.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(features)
df['Cluster'] = kmeans.labels_

# Plain NumPy arrays for the scikit-learn estimators used further down.
X = features.to_numpy()
y = df['Cluster'].to_numpy()

# Split into a small labeled seed set (10%) and a pool treated as unlabeled
# (90%). y_unlabeled holds the pool's KMeans labels; the loop below never
# trains on it, it is only kept row-aligned with X_unlabeled.
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    X, y, test_size=0.9, random_state=42, stratify=y
)

# Co-Training setup: two different model families fitted on the same features.
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier(max_depth=3)

# Initial training on small labeled data
clf1.fit(X_labeled, y_labeled)
clf2.fit(X_labeled, y_labeled)

# Co-Training iterations: each round, every pool sample on which both
# classifiers predict the same class is moved into the labeled set with that
# predicted class, and both classifiers are refitted.
max_iter = 20
for i in range(max_iter):
    # Stop once the pool is exhausted. Without this guard the next predict()
    # call receives a zero-row array, which scikit-learn rejects with a
    # ValueError.
    if len(X_unlabeled) == 0:
        break

    # Predict on unlabeled data
    pred1 = clf1.predict(X_unlabeled)
    pred2 = clf2.predict(X_unlabeled)

    # Find instances where both classifiers agree
    agree_idx = np.where(pred1 == pred2)[0]

    if len(agree_idx) == 0:
        break  # Stop if no agreement: nothing new can be added

    # Add agreed data to labeled set (labels are the shared prediction)
    X_new = X_unlabeled[agree_idx]
    y_new = pred1[agree_idx]

    X_labeled = np.vstack((X_labeled, X_new))
    y_labeled = np.concatenate((y_labeled, y_new))

    # Remove added data from unlabeled set. y_unlabeled is filtered with the
    # same mask so it stays row-aligned with X_unlabeled.
    mask = np.ones(len(X_unlabeled), dtype=bool)
    mask[agree_idx] = False
    X_unlabeled = X_unlabeled[mask]
    y_unlabeled = y_unlabeled[mask]

    # Retrain classifiers on the enlarged labeled set
    clf1.fit(X_labeled, y_labeled)
    clf2.fit(X_labeled, y_labeled)

# Final evaluation: hold out 30% of the accumulated labeled set, fit a fresh
# logistic regression on the remaining 70% and score it on the held-out part.
# NOTE(review): y_labeled at this point includes the labels assigned by the
# classifiers during co-training, so the score is measured against those
# labels as well as the original seed labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.3,
    random_state=42,
)
clf_final = LogisticRegression()
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred)
print("Final Accuracy:", final_accuracy)

Final Accuracy: 1.0
[1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
