In [2]:
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import SpectralClustering
import numpy as np
import pickle
import random 

if 1:
    from  google.colab  import  drive
    drive.mount('/content/gdrive')
    my_directory = 'gdrive/My Drive/Colab Notebooks/mvdigits/'
else:
    my_directory = ''

# Define six views from digits dataset
view_one = np.loadtxt(my_directory + 'mfeat-fac') # (2000, 216)
view_two = np.loadtxt(my_directory + 'mfeat-fou') # (2000, 76)
view_three = np.loadtxt(my_directory + 'mfeat-kar') # (2000, 64)
view_four = np.loadtxt(my_directory + 'mfeat-mor') # (2000, 6)
view_five = np.loadtxt(my_directory + 'mfeat-pix') # (2000, 240)
view_six = np.loadtxt(my_directory + 'mfeat-zer') # (2000, 47)

# Dimension of six original views
dim_one = view_one.shape[1]
dim_two = view_two.shape[1]
dim_three = view_three.shape[1]
dim_four = view_four.shape[1]
dim_five = view_five.shape[1]
dim_six = view_six.shape[1]

# Horizontally concatenate views to select features from
#data_one = np.hstack((view_one[:,:50], view_two[:,:50])) # We are getting 50 from each view, 100 total
data_one = np.hstack((view_two, view_three))

# Dimension of concatenated views
data_one_dim = data_one.shape[1] 

Mounted at /content/gdrive


In [None]:
# Define our regressor
regr = RandomForestRegressor(max_depth = 5, random_state = 0)

# Feature selection method
all_groups = [] # Defined as python list to make use of append method for a multi-dimensional list
total_features = 300
group_size = 10
array_length = 0
n = 2 # Number of features to be selected per iteration

while len(all_groups) < int(total_features / group_size): # While there are still more features to select
  start_index = random.randint(0, (data_one_dim - 1)) # start_index for each group will be a random index in data_one
  output_vars = np.array([start_index]) 
  while len(output_vars) < group_size: # Nested : while group has not reached the specified group size
    index_dict = [i for i in np.arange(data_one_dim) if i not in output_vars] # Acts as a means to help fix indexing issues
    input_vars = np.setdiff1d(np.arange(data_one_dim), output_vars) # input_vars will not contain values that are already in output_vars

    X = data_one[:,input_vars] # Training samples
    y = data_one[:,output_vars] # Target values
    if len(output_vars) == 1: # Fixes potential dimensionality issue
      y = y.flatten()

    regr.fit(X, y) # Fit data to regressor
    result = permutation_importance(regr, X, y, n_repeats = 6, random_state = 0) # Store result of regressor's importance scores
    sorted = result.importances_mean.argsort()[-1:-(n + 1):-1] # Sort the importances from highest to lowest scores and take only n variables
    for i in range(len(sorted)):
      if len(output_vars) < group_size:
        original_index = index_dict[sorted[i]] # Use dict to retrieve the actual index of the selected feature
        output_vars = np.append(output_vars, original_index) # Push selected feature to array output_vars
    
  all_groups.append(output_vars) # Push output_vars to array all_groups
  array_length = len(output_vars) # Set array_length to the length of array output_vars
  print(str(output_vars))

all_groups = np.asarray(all_groups) # Change all_groups from python list to nparray

# Serialize all_groups
with open("gdrive/My Drive/Colab Notebooks/all_groups_file.txt", "wb") as all_groups_file:
  pickled_groups = pickle.dump(all_groups, all_groups_file)

[18 11 12  4 72 68  6 13 73  7]
[136 119  96 123  76   1  77   6  84  85]
[116  76  77 129  96   6  84   1 101  72]
[115 109 113 119 120 118 127  84 101 111]
[112   4  85  76  96   6  84   1  72 101]
[62 59 65 72 68 70  7  2  4 73]
[125  84 119  91  77 103  75   6  78   4]
[24 52 20 34 44 49 10  1 68 14]
[114  78  83  89  73  81   6  75 101  76]
[111 101  78 123  73  81  87  75   0  76]
[38 28 42 43 32 33 21 22 37 17]
[ 91 103 105 104  79 119  78  73  85  80]
[128 120  77 127   6  78  84  73  81 117]
[39 35 29 34 30 44 25 40 48 19]
[101 111   6  78  73  81  77 117  75   1]
[68 73 65  4 75 17 72 71  6  1]
[135 112 106 124  76  87  78  96 101   4]
[124 139  87 117 101  73  76  96   6   4]
[  6  13   8  75  73   4 100  86  98  80]
[50 53 47 62 65 59 72 68 70  7]
[71 73  4 68 75 72 70  5  1  6]
[48 29 45 44 34 75 25 73  6  0]
[56 53 19  0  7 69  9  2 72  4]
[133  76 127  85  96   6   4   1  84  71]
[107  79  99 103  76  74  80  96  85   4]
[138 119 124 108  84 115 127  77  75  98]
[26 36 3

In [3]:
# Deserialize all_groups
with open("gdrive/My Drive/Colab Notebooks/all_groups_file.txt", "rb") as all_groups_file:
  unpickled_groups = pickle.load(all_groups_file)

for row in unpickled_groups:
  print(row)

[18 11 12  4 72 68  6 13 73  7]
[136 119  96 123  76   1  77   6  84  85]
[116  76  77 129  96   6  84   1 101  72]
[115 109 113 119 120 118 127  84 101 111]
[112   4  85  76  96   6  84   1  72 101]
[62 59 65 72 68 70  7  2  4 73]
[125  84 119  91  77 103  75   6  78   4]
[24 52 20 34 44 49 10  1 68 14]
[114  78  83  89  73  81   6  75 101  76]
[111 101  78 123  73  81  87  75   0  76]
[38 28 42 43 32 33 21 22 37 17]
[ 91 103 105 104  79 119  78  73  85  80]
[128 120  77 127   6  78  84  73  81 117]
[39 35 29 34 30 44 25 40 48 19]
[101 111   6  78  73  81  77 117  75   1]
[68 73 65  4 75 17 72 71  6  1]
[135 112 106 124  76  87  78  96 101   4]
[124 139  87 117 101  73  76  96   6   4]
[  6  13   8  75  73   4 100  86  98  80]
[50 53 47 62 65 59 72 68 70  7]
[71 73  4 68 75 72 70  5  1  6]
[48 29 45 44 34 75 25 73  6  0]
[56 53 19  0  7 69  9  2 72  4]
[133  76 127  85  96   6   4   1  84  71]
[107  79  99 103  76  74  80  96  85   4]
[138 119 124 108  84 115 127  77  75  98]
[26 36 3

In [4]:
columns, rows = int(total_features / group_size), int(total_features / group_size) # Columns and rows of matrix_one
i, j = 0, 0
matrix_one = [[0 for x in range(columns)] for y in range(rows)]

for i in range(rows):
  for j in range(columns):
    matrix_one[i][j] = len(np.intersect1d(unpickled_groups[i], unpickled_groups[j])) / group_size # Make a percentage as a ratio of intersections / group size

# Serialize matrix_one
with open("gdrive/My Drive/Colab Notebooks/matrix_one_file.txt", "wb") as matrix_one_file:
  pickled_matrix_one = pickle.dump(matrix_one, matrix_one_file)

NameError: ignored

In [5]:
# Deserialize matrix_one
with open("gdrive/My Drive/Colab Notebooks/matrix_one_file.txt", "rb") as matrix_one_file:
  unpickled_matrix_one = pickle.load(matrix_one_file)

for row in unpickled_matrix_one:
  print(row)

[1.0, 0.1, 0.2, 0.0, 0.3, 0.5, 0.2, 0.1, 0.2, 0.1, 0.0, 0.1, 0.2, 0.0, 0.2, 0.5, 0.1, 0.3, 0.4, 0.3, 0.5, 0.2, 0.3, 0.2, 0.1, 0.0, 0.3, 0.3, 0.2, 0.0, 0.3, 0.2, 0.0, 0.3, 0.1, 0.0, 0.2, 0.6, 0.2, 0.3, 0.4, 0.3, 0.2, 0.4, 0.3, 0.4, 0.2, 0.3, 0.2, 0.2, 0.0, 0.3, 0.1, 0.3, 0.2, 0.2, 0.0, 0.1, 0.5, 0.2]
[0.1, 1.0, 0.6, 0.2, 0.6, 0.0, 0.4, 0.1, 0.2, 0.2, 0.0, 0.2, 0.3, 0.0, 0.3, 0.2, 0.2, 0.3, 0.1, 0.0, 0.2, 0.1, 0.0, 0.6, 0.3, 0.3, 0.0, 0.6, 0.4, 0.0, 0.6, 0.1, 0.0, 0.6, 0.5, 0.0, 0.2, 0.1, 0.6, 0.0, 0.1, 0.0, 0.3, 0.1, 0.1, 0.4, 0.6, 0.3, 0.1, 0.4, 0.3, 0.5, 0.2, 0.5, 0.6, 0.1, 0.0, 0.7, 0.1, 0.1]
[0.2, 0.6, 1.0, 0.2, 0.7, 0.1, 0.3, 0.1, 0.3, 0.2, 0.0, 0.0, 0.3, 0.0, 0.4, 0.3, 0.3, 0.4, 0.1, 0.1, 0.3, 0.1, 0.1, 0.5, 0.2, 0.2, 0.0, 0.5, 0.3, 0.0, 0.5, 0.1, 0.0, 0.7, 0.5, 0.0, 0.3, 0.2, 0.5, 0.0, 0.2, 0.1, 0.3, 0.2, 0.2, 0.5, 0.8, 0.4, 0.2, 0.4, 0.2, 0.5, 0.2, 0.5, 0.8, 0.1, 0.0, 0.6, 0.2, 0.1]
[0.0, 0.2, 0.2, 1.0, 0.2, 0.0, 0.2, 0.0, 0.1, 0.2, 0.0, 0.1, 0.3, 0.0, 0.2, 0.0, 0.1, 0.1, 0.0, 0

In [None]:
columns, rows = data_one_dim, data_one_dim # Columns and rows of matrix_two
num_of_groups = int(total_features / group_size)
i, j = 0, 0
group_i, group_j = 0, 0
count = [[0 for x in range(columns)] for y in range(rows)] 
matrix_two = [[0 for x in range(columns)] for y in range(rows)]
 
for i in range(rows):
  for j in range(columns):
    for group_i in range(num_of_groups):
      for group_j in range(num_of_groups):
        if i in unpickled_groups[group_i] and j in unpickled_groups[group_j]:
          matrix_two[i][j] += unpickled_matrix_one[group_i][group_j]
          count[i][j] += 1

for i in range(rows):
  for j in range(columns):
    if count[i][j] != 0: # Avoids potential divide by 0 error
      matrix_two[i][j] /= count[i][j]     

# Serialize matrix_two
with open("gdrive/My Drive/Colab Notebooks/matrix_two_file.txt", "wb") as matrix_two_file:
  pickled_matrix_two = pickle.dump(matrix_two, matrix_two_file)

In [6]:
# Deserialize matrix_two
with open("gdrive/My Drive/Colab Notebooks/matrix_two_file.txt", "rb") as matrix_two_file:
  unpickled_matrix_two = pickle.load(matrix_two_file)

for row in unpickled_matrix_two:
  print(row)

[0.40625000000000006, 0.27175925925925937, 0.3049999999999998, 0.4, 0.22327586206896577, 0.27249999999999985, 0.28291666666666676, 0.27638888888888896, 0.1625, 0.25, 0.07500000000000002, 0.21875000000000006, 0.2833333333333333, 0.23125, 0.0875, 0, 0.1875, 0.11875000000000001, 0.2375, 0.19375, 0.06250000000000001, 0.0, 0.043750000000000004, 0.1875, 0.06250000000000001, 0.12249999999999998, 0.05833333333333334, 0.06562500000000002, 0.0, 0.12249999999999998, 0.07916666666666668, 0, 0.06250000000000001, 0.0, 0.11666666666666664, 0.1, 0.07291666666666667, 0.05250000000000001, 0.0, 0.1, 0.07, 0.08750000000000002, 0.05250000000000001, 0.0, 0.11666666666666664, 0.21875, 0, 0.15000000000000002, 0.1708333333333333, 0.0875, 0.15000000000000002, 0, 0.06250000000000001, 0.2041666666666666, 0, 0, 0.31250000000000006, 0, 0, 0.17812499999999995, 0, 0.175, 0.17812499999999995, 0, 0, 0.19999999999999996, 0, 0.30000000000000004, 0.18000000000000022, 0.2625, 0.2839285714285714, 0.23055555555555557, 0.3005

In [None]:
columns, rows = data_one_dim, data_one_dim # Columns and rows of distance_matrix
i, j = 0, 0
distance_matrix = [[0 for x in range(columns)] for y in range(rows)]

for i in range(rows):
  for j in range(columns):
    distance_matrix[i][j] = 1 - unpickled_matrix_two[i][j]   

# Serialize matrix_two
with open("gdrive/My Drive/Colab Notebooks/distance_matrix.txt", "wb") as distance_matrix_file:
  pickled_distance_matrix = pickle.dump(distance_matrix, distance_matrix_file)

In [7]:
# Deserialize distance_matrix
with open("gdrive/My Drive/Colab Notebooks/distance_matrix.txt", "rb") as distance_matrix_file:
  unpickled_distance_matrix = pickle.load(distance_matrix_file)

for row in unpickled_distance_matrix:
  print(row)

[0.59375, 0.7282407407407406, 0.6950000000000002, 0.6, 0.7767241379310342, 0.7275000000000001, 0.7170833333333333, 0.723611111111111, 0.8375, 0.75, 0.9249999999999999, 0.78125, 0.7166666666666667, 0.76875, 0.9125, 1, 0.8125, 0.88125, 0.7625, 0.80625, 0.9375, 1.0, 0.95625, 0.8125, 0.9375, 0.8775000000000001, 0.9416666666666667, 0.934375, 1.0, 0.8775000000000001, 0.9208333333333333, 1, 0.9375, 1.0, 0.8833333333333333, 0.9, 0.9270833333333334, 0.9475, 1.0, 0.9, 0.9299999999999999, 0.9125, 0.9475, 1.0, 0.8833333333333333, 0.78125, 1, 0.85, 0.8291666666666667, 0.9125, 0.85, 1, 0.9375, 0.7958333333333334, 1, 1, 0.6875, 1, 1, 0.821875, 1, 0.825, 0.821875, 1, 1, 0.8, 1, 0.7, 0.8199999999999998, 0.7375, 0.7160714285714286, 0.7694444444444444, 0.6994047619047616, 0.7334999999999999, 0.8875, 0.71875, 0.6983695652173914, 0.7410714285714286, 0.74375, 0.90625, 0.8833333333333333, 0.7, 0.625, 0.6937500000000001, 0.7462500000000001, 0.7509615384615385, 0.7541666666666667, 0.6725, 1, 0.725, 0.725, 0.82

In [11]:
from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering().fit(unpickled_distance_matrix)
labels = clustering.labels_

cluster_one = []
cluster_two = []
for i in range(len(labels)):
  if labels[i] == 0:
    cluster_one.append(i)
  elif labels[i] == 1:
    cluster_two.append(i)

print(labels)
   
print(len(cluster_one))
print(len(cluster_two))

print(cluster_one)
print(cluster_two)

[1 1 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0
 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 0 1]
81
59
[2, 3, 5, 7, 9, 10, 12, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 88, 93, 94, 95, 97, 98, 108, 109, 110, 113, 115, 118, 120, 121, 122, 130, 132, 134, 138]
[0, 1, 4, 6, 8, 11, 13, 18, 67, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 89, 90, 91, 92, 96, 99, 100, 101, 102, 103, 104, 105, 106, 107, 111, 112, 114, 116, 117, 119, 123, 124, 125, 126, 127, 128, 129, 131, 133, 135, 136, 137, 139]
