# **SEMI-SUPERVISED LEARNING ON OVERHEAD MNIST DATASET**

In [None]:
!pip install ipython-autotime --quiet
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 6.95 s (started: 2024-04-21 16:12:31 +00:00)


# Import Necessary Modules

In [None]:
# import necessary models
from tqdm import tqdm
import pandas as pd
import numpy as np
from tensorflow import keras
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.linear_model import LogisticRegression

time: 993 µs (started: 2024-04-21 16:12:38 +00:00)


# Load Overhead MNIST data

In [None]:
# Read the train and test CSV files into dataframes

train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

time: 909 ms (started: 2024-04-21 16:12:38 +00:00)


# Data Preprocessing

In [None]:
# Split the training frame: the first column becomes the target vector,
# every remaining column becomes a feature.
train_labels, train_pixels = train.iloc[:, 0], train.iloc[:, 1:]
X_train = train_pixels.to_numpy()
y_train = train_labels.to_numpy()

time: 2.18 ms (started: 2024-04-21 16:12:39 +00:00)


In [None]:
# Split the test frame: the first column becomes the target vector,
# every remaining column becomes a feature.
test_labels, test_pixels = test.iloc[:, 0], test.iloc[:, 1:]
X_test = test_pixels.to_numpy()
y_test = test_labels.to_numpy()

time: 1.88 ms (started: 2024-04-21 16:12:39 +00:00)


In [None]:
# Convert the features to float32 and divide by 255 (maps to [0, 1] assuming
# 8-bit pixel values). Both arrays are rebuilt from the source dataframes so
# that re-running this cell can never divide by 255 a second time.
X_train = train.iloc[:, 1:].to_numpy().astype('float32') / 255.0
X_test = test.iloc[:, 1:].to_numpy().astype('float32') / 255.0

time: 53.8 ms (started: 2024-04-21 16:12:39 +00:00)


# A function to print a particular image

In [None]:
def get_original_array(arr):
  """Convert intensities scaled by 1/255 back to 8-bit pixel values.

  Args:
    arr: array-like of floats, expected to lie in [0, 1].

  Returns:
    np.ndarray of dtype uint8 with the same shape as `arr`.

  Values are rounded to the nearest integer instead of truncated, because
  floating-point error in the divide-then-multiply round trip can leave a
  value marginally below the integer it started from. The result is clipped
  to [0, 255] so out-of-range input cannot wrap around in uint8.
  """
  rescaled = np.asarray(arr, dtype=np.float64) * 255
  return np.clip(np.rint(rescaled), 0, 255).astype(np.uint8)

def print_image(i, data='train'):
  """Return image `i` of the selected split as a 28x28 uint8 array.

  Args:
    i: row index of the image within the split.
    data: 'train' selects X_train; any other value selects X_test.
  """
  source = X_train if data == 'train' else X_test
  return get_original_array(source[i]).reshape(28,28)

# Example: the first training image as a pixel grid
print_image(0)

time: 15.5 ms (started: 2024-04-21 16:12:44 +00:00)


# Standard Logistic Regression

In [None]:
# Baseline: fit logistic regression on every labelled training example and
# report its accuracy on the test split.

logistic_regressor_clf = LogisticRegression(random_state=69, max_iter=10000)
logistic_regressor_clf.fit(X_train, y_train)
logistic_regressor_clf.score(X_test, y_test)

0.4676056338028169

time: 1min 22s (started: 2024-04-21 16:12:54 +00:00)


The standard logistic regression model trained on the full labelled training set reaches an accuracy of 0.4676, which is fairly low. It serves as the baseline for the experiments below.

# **Now, we will perform semi-supervised learning on the Overhead MNIST dataset and compare the accuracy scores.**

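1.   First, we will take 40 random samples and train a logistic regression classifier on them.
2.   Next, we will use k-means clustering to form 40 clusters and, for each cluster, pick the training point nearest to its centroid as the representative. We will then train the logistic regression classifier on those 40 representative points.
3. Then we will propagate each representative's label to every point in its cluster and train the classifier on the whole training set with these propagated labels.
4. Finally, we will repeat the propagation, but only to the points closest to each representative (about 20% of the dataset), and train on that subset.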



# Perform Logistic Regression On Randomly Selected 40 Samples

In [None]:
# Randomly select 40 samples from the training data.
# A seeded generator keeps the draw, and therefore the score, reproducible
# across kernel restarts.
rng = np.random.default_rng(69)
idx = rng.choice(X_train.shape[0], 40, replace=False)

# Get the selected samples and their corresponding labels
X_sample = X_train[idx]
y_sample = y_train[idx]

# Train on the 40 labelled samples only, then score on the full test split
logistic_regressor_clf = LogisticRegression(random_state=69, max_iter=10000).fit(X_sample, y_sample)
logistic_regressor_clf.score(X_test, y_test)

0.20093896713615023

time: 975 ms (started: 2024-04-21 16:29:25 +00:00)


Note that a pretty low accuracy score of 0.2009 is obtained.

# Perform Logistic Regression On 40 Cluster Representative Points

In [None]:
# Use KMeans to select 40 representative samples and then train the classifier

kmeans = MiniBatchKMeans(n_clusters=40, random_state=42)
kmeans.fit(X_train)

cluster_centers = kmeans.cluster_centers_

# For every centroid, find the training point closest to it (Euclidean
# distance). Distances to all training points are computed in one vectorised
# call per centroid rather than one Python-level call per point.
nearest_points_to_cluster = []
distance_from_nearest_points_to_cluster = []
for cluster_center in cluster_centers:
  distances = np.linalg.norm(X_train - cluster_center, axis=1)
  # argmin returns the first minimum, so ties go to the lowest index
  nearest = int(np.argmin(distances))
  nearest_points_to_cluster.append(nearest)
  distance_from_nearest_points_to_cluster.append(float(distances[nearest]))

# The representatives and their true labels form the labelled training set
X_sample_ = X_train[nearest_points_to_cluster]
y_sample_ = y_train[nearest_points_to_cluster]

# Train and test the logistic regression classifier
logistic_regressor_clf = LogisticRegression(random_state=69, max_iter=10000).fit(X_sample_, y_sample_)
logistic_regressor_clf.score(X_test, y_test)



0.2516431924882629

time: 18.4 s (started: 2024-04-21 16:34:36 +00:00)


# Perform Logistic Regression On 40 Cluster Representative Points After Propagating the Labels

In [None]:
# Use KMeans to select 40 representative samples, propagate the labels and then train the classifier

kmeans = MiniBatchKMeans(n_clusters=40, random_state=42)
kmeans.fit(X_train)

cluster_centers = kmeans.cluster_centers_
datapoints_labels = kmeans.labels_  # cluster id assigned to each training point

# Nearest training point to each centroid, stored at that cluster's index
nearest_points_to_cluster = []
distance_from_nearest_points_to_cluster = []
for cluster_center in cluster_centers:
  distances = np.linalg.norm(X_train - cluster_center, axis=1)
  # argmin returns the first minimum, so ties go to the lowest index
  nearest = int(np.argmin(distances))
  nearest_points_to_cluster.append(nearest)
  distance_from_nearest_points_to_cluster.append(float(distances[nearest]))

# Mapping between cluster ids and representative points. It is keyed by the
# cluster whose centroid was searched, so every cluster is guaranteed exactly
# one representative: the point nearest to its own centroid.
nearest_point_of_a_label = np.asarray(nearest_points_to_cluster)

# Every training point inherits the true label of its cluster's representative
y_propagated = y_train[nearest_point_of_a_label[datapoints_labels]]

# Train and test the logistic regression classifier
logistic_regressor_clf = LogisticRegression(random_state=69, max_iter=10000).fit(X_train, y_propagated)
logistic_regressor_clf.score(X_test, y_test)



0.27417840375586855

time: 2min 54s (started: 2024-04-21 16:30:19 +00:00)


# Perform Logistic Regression On 40 Cluster Representative Points After Propagating the Labels Partially

In [None]:
# Use KMeans to select 40 representative samples, propagate the labels partially and then train the classifier

kmeans = MiniBatchKMeans(n_clusters=40, random_state=42)
kmeans.fit(X_train)

cluster_centers = kmeans.cluster_centers_
datapoints_labels = kmeans.labels_

# Number of nearest training points that receive each representative's label
points_per_cluster = 40

# Nearest training point to each centroid, stored at that cluster's index
nearest_points_to_cluster = []
distance_from_nearest_points_to_cluster = []
for cluster_center in tqdm(cluster_centers):
  distances = np.linalg.norm(X_train - cluster_center, axis=1)
  # argmin returns the first minimum, so ties go to the lowest index
  nearest = int(np.argmin(distances))
  nearest_points_to_cluster.append(nearest)
  distance_from_nearest_points_to_cluster.append(float(distances[nearest]))

# Row i holds the indices of the training points closest to representative i.
# The representative itself is included, at distance 0. Neighbours are taken
# from the whole training set, not only from the representative's cluster.
closest_points_from_centroid = np.empty((len(nearest_points_to_cluster), points_per_cluster), dtype=int)

for i, representative in enumerate(tqdm(nearest_points_to_cluster)):
  distances = np.linalg.norm(X_train - X_train[representative], axis=1)
  closest_points_from_centroid[i] = np.argsort(distances)[:points_per_cluster]

# Flatten row by row so that each block of `points_per_cluster` samples lines
# up with the repeated true label of its representative.
X_sample = X_train[closest_points_from_centroid.ravel()]
y_propagated = np.repeat(y_train[nearest_points_to_cluster], points_per_cluster)

# Train and test the logistic regression classifier
logistic_regressor_clf = LogisticRegression(random_state=37, max_iter=10000).fit(X_sample, y_propagated)
logistic_regressor_clf.score(X_test, y_test)

100%|██████████| 40/40 [00:06<00:00,  5.74it/s]
100%|██████████| 40/40 [00:10<00:00,  3.97it/s]


0.26291079812206575

time: 46.7 s (started: 2024-04-21 16:51:02 +00:00)


## Report:

- NOTE THAT THE NUMBER OF CLUSTERS TO BE MADE WAS CHOSEN BASED ON THE SIZE OF THE DATASET.
- Standard logistic model has a moderate accuracy for the dataset. (`0.4676056338028169`)
- Next, we randomly choose 40 datapoints from the training set and train a standard logistic regression on them, which gives an accuracy of `0.20093896713615023`. The performance is poor.
- Then, instead of choosing randomly, we cluster the dataset into 40 clusters and use the point nearest to each centroid as that cluster's representative to train the standard logistic model. The accuracy is `0.2516431924882629`. This is an improvement over random selection, because each training point is now a typical example of its cluster rather than an arbitrary sample.
- Next, we propagate the labels to every cluster point to get the accuracy as `0.27417840375586855`. Performance improves as there are more training points.
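- Finally, instead of propagating to all the points, we propagate the labels only to the points closest to each representative, roughly 20% of the data (`0.26291079812206575`). The accuracy stays close to that of full propagation, which suggests that labelling only a small, well-chosen part of the dataset (semi-supervised learning) recovers most of the benefit of propagating labels to every point.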