## Using the Co-Training Classifier for 2-View Classification

In [1]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from multiview.cotraining.ctclassifier import CTClassifier
from multiview.datasets.base import load_UCImultifeature

### Load the UCI Multiple Features Dataset as an Example
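To simulate a semi-supervised learning scenario, we randomly remove most of the training labels (each label is dropped with probability 0.98), leaving only a handful of labeled examples.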

In [2]:
data, labels = load_UCImultifeature(select_labeled=[0,1])

# Use only the first 2 views as an example
View0, View1 = data[0], data[1]

# Split both views and the labels in ONE call so every array is partitioned
# with the same shuffled indices and the rows stay aligned across views.
# (Same test_size / random_state as before, so the split itself is unchanged.)
(View0_train, View0_test,
 View1_train, View1_test,
 labels_train, labels_test) = train_test_split(
    View0, View1, labels, test_size=0.33, random_state=42)

# Simulate the semi-supervised setting: each training label is independently
# dropped with probability 0.98 (with this seed, 4 labels survive).
# NaN marks an unlabeled example, which relies on labels_train being a float
# array (the labels printed below are floats).
np.random.seed(6)
remove_idx = np.random.rand(len(labels_train)) < 0.98
labels_train[remove_idx] = np.nan

# Index tuple (as returned by np.where) of the examples that kept their label
not_removed = np.where(~remove_idx)
print(labels_train[not_removed])

[ 0.  0.  1.  1.]


### Co-Training on 2 Views vs. Single View Semi-Supervised Learning
Here, we use the default co-training classifier, which uses Gaussian naive Bayes classifiers for both views. We compare its performance to single-view classifiers of the same type, each trained on only the labeled examples of its own view.

In [3]:
############## Single view semi-supervised learning ##############
#-----------------------------------------------------------------
# Baseline: one Gaussian naive Bayes classifier per view, each fitted on the
# labeled training examples of that view only.
gnb0 = GaussianNB()
gnb1 = GaussianNB()

# Unlabeled training examples carry a NaN label. Selecting the labeled rows
# with a boolean mask always yields a 2-D (n_labeled, n_features) array, so
# no squeeze() is needed (squeeze() would collapse the array to 1-D if only
# one labeled example or one feature were present).
labeled_mask = ~np.isnan(labels_train)
labels_known = labels_train[labeled_mask]

# Train on only the examples with labels
gnb0.fit(View0_train[labeled_mask], labels_known)
y_pred0 = gnb0.predict(View0_test)
gnb1.fit(View1_train[labeled_mask], labels_known)
y_pred1 = gnb1.predict(View1_test)

print("Single View Accuracy on First View: {0:.3f}\n".format(accuracy_score(labels_test, y_pred0)))
print("Single View Accuracy on Second View: {0:.3f}\n".format(accuracy_score(labels_test, y_pred1)))

######### Multi-view co-training semi-supervised learning #########
#------------------------------------------------------------------
# Train a CTClassifier on all the labeled and unlabeled training data
# (labels_train is passed whole, NaN entries included).
ctc = CTClassifier()
ctc.fit([View0_train, View1_train], labels_train)
y_pred_ct = ctc.predict([View0_test, View1_test])

print("Co-Training Accuracy on 2 Views: {0:.3f}".format(accuracy_score(labels_test, y_pred_ct)))

Single View Accuracy on First View: 0.803

Single View Accuracy on Second View: 0.864

Co-Training Accuracy on 2 Views: 0.970


### Select Different Base Classifiers for the Views and Change the CTClassifier fit() parameters
Now, we use a random forest classifier with different attributes for each view. 
Furthermore, we manually select the number of positive (p) and negative (n) examples chosen each round in the co-training process, and we define the unlabeled pool size to draw them from and the number of iterations of training to perform.

In [4]:
############## Single view semi-supervised learning ##############
#-----------------------------------------------------------------
# Baseline: one random forest per view (different settings for each view),
# each fitted on the labeled training examples of that view only.
# NOTE: no random_state is passed to the forests, so the results depend on
# the global NumPy RNG state and on the order in which the fits below run;
# keep that order if the printed accuracies are to be reproduced.
rfc0 = RandomForestClassifier(n_estimators=8, bootstrap=True)
rfc1 = RandomForestClassifier(n_estimators=5, bootstrap=False)

# Unlabeled training examples carry a NaN label. Selecting the labeled rows
# with a boolean mask always yields a 2-D (n_labeled, n_features) array, so
# no squeeze() is needed (squeeze() would collapse the array to 1-D if only
# one labeled example or one feature were present).
labeled_mask = ~np.isnan(labels_train)
labels_known = labels_train[labeled_mask]

# Train on only the examples with labels
rfc0.fit(View0_train[labeled_mask], labels_known)
y_pred0 = rfc0.predict(View0_test)
rfc1.fit(View1_train[labeled_mask], labels_known)
y_pred1 = rfc1.predict(View1_test)

print("Single View Accuracy on First View: {0:.3f}\n".format(accuracy_score(labels_test, y_pred0)))
print("Single View Accuracy on Second View: {0:.3f}\n".format(accuracy_score(labels_test, y_pred1)))

######### Multi-view co-training semi-supervised learning #########
#------------------------------------------------------------------
# Fresh, unfitted forests with the same settings as the baselines, so that
# co-training does not start from the estimators fitted above.
rfc0 = RandomForestClassifier(n_estimators=8, bootstrap=True)
rfc1 = RandomForestClassifier(n_estimators=5, bootstrap=False)
ctc = CTClassifier(rfc0, rfc1)

# p / n: positive / negative examples chosen each round; unlabeled_pool_size:
# size of the pool they are drawn from; num_iter: number of training rounds.
ctc.fit([View0_train, View1_train], labels_train, p=2, n=2, unlabeled_pool_size=20, num_iter=100)
y_pred_ct = ctc.predict([View0_test, View1_test])

print("Co-Training Accuracy: {0:.3f}".format(accuracy_score(labels_test, y_pred_ct)))

Single View Accuracy on First View: 0.674

Single View Accuracy on Second View: 0.939

Co-Training Accuracy: 0.977


### Get the prediction probabilities for all the examples

In [5]:
# Per-class probabilities predicted by the co-trained classifier for every
# test example (one row per example).
test_views = [View0_test, View1_test]
y_pred_proba = ctc.predict_proba(test_views)

# Report the overall shape, then preview only the first ten rows.
shape_text = str(y_pred_proba.shape)
print("Full shape = " + shape_text)
print("\nFirst 10 example probabilities:\n")
first_rows = y_pred_proba[:10, :]
print(first_rows)

Full shape = (132, 2)

First 10 example probabilities:

[[ 0.1     0.9   ]
 [ 0.0625  0.9375]
 [ 0.5     0.5   ]
 [ 0.0625  0.9375]
 [ 1.      0.    ]
 [ 0.7125  0.2875]
 [ 0.1625  0.8375]
 [ 0.875   0.125 ]
 [ 0.      1.    ]
 [ 0.8125  0.1875]]
