In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import glob
import shutil
import copy
import csv

import time
import faiss
import numpy as np
import torch
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
class GPSDataset(Dataset):
    """Image dataset whose samples are listed in a metadata CSV file.

    The first column of every CSV row is joined onto ``root_dir`` to locate
    an image. Each image is opened with PIL, converted to RGB and passed
    through up to two transforms.

    Args:
        metadata: Path (or buffer) handed to ``pandas.read_csv``.
        root_dir: Directory that the per-row image paths are joined onto.
        transform1: Optional callable producing the first view of the image.
            When omitted, the untransformed RGB image is used as the first view.
        transform2: Optional callable producing a second view of the image.

    Items are ``(img1, idx)``, or ``(img1, img2, idx)`` when ``transform2``
    is set.
    """

    def __init__(self, metadata, root_dir, transform1=None, transform2=None):
        self.metadata = pd.read_csv(metadata).values
        self.root_dir = root_dir
        self.transform1 = transform1
        self.transform2 = transform2

    def __len__(self):
        """Return the number of rows read from the metadata CSV."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load the image of row ``idx`` and return its view(s) plus ``idx``."""
        img_name = os.path.join(self.root_dir, self.metadata[idx][0])
        image = Image.open(img_name).convert('RGB')

        # Previously `img1` was only bound when transform1 was set, so a dataset
        # built without transform1 crashed with UnboundLocalError on first access.
        img1 = self.transform1(image) if self.transform1 else image

        if self.transform2:
            img2 = self.transform2(image)
            return img1, img2, idx

        return img1, idx

class AUGLoss(nn.Module):
    """Loss equal to the summed Euclidean distance between two tensors.

    The squared difference is reduced over dimension 1, square-rooted, and
    the resulting distances are summed into a single scalar.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x1, x2):
        """Return the sum of L2 distances (taken along dim 1) between x1 and x2."""
        diff = x1 - x2
        squared_dist = (diff * diff).sum(1)
        return torch.sqrt(squared_dist).sum()

# The code below is taken from the GitHub repository of "Deep Clustering for Unsupervised Learning of Visual Features".
def preprocess_features(npdata, pca=15):
    """PCA-whiten a feature matrix with Faiss, then L2-normalize every row.

    Args:
        npdata: 2-D array with one feature row per sample.
        pca: Number of output dimensions of the PCA projection.

    Returns:
        The projected array with each row divided by its L2 norm.
    """
    _, n_features = npdata.shape
    features = npdata.astype('float32')

    # PCA with whitening (eigen_power=-0.5), fitted on the data it is applied to.
    whitener = faiss.PCAMatrix(n_features, pca, eigen_power=-0.5)
    whitener.train(features)
    assert whitener.is_trained
    projected = whitener.apply_py(features)

    # Scale every row to unit length.
    norms = np.linalg.norm(projected, axis=1, keepdims=True)
    return projected / norms

def cluster_assign(images_lists, dataset):
    """Turn per-cluster image index lists into a pseudo-labelled dataset.

    Args:
        images_lists: Sequence whose ``c``-th entry holds the image indexes
            assigned to cluster ``c``.
        dataset: Source dataset forwarded to ``ReassignedDataset``.

    Returns:
        A ``ReassignedDataset`` built from the flattened indexes, their
        cluster ids as pseudo-labels, ``dataset`` and a training transform.
    """
    # NOTE(review): `ReassignedDataset` is not defined in the visible part of
    # this file - confirm it is provided elsewhere before calling this.
    assert images_lists is not None

    image_indexes = [img for members in images_lists for img in members]
    pseudolabels = [cluster
                    for cluster, members in enumerate(images_lists)
                    for _ in members]

    augment = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    return ReassignedDataset(image_indexes, pseudolabels, dataset, augment)


def run_kmeans(x, nmb_clusters):
    """Cluster the rows of ``x`` with Faiss k-means on GPU device 0.

    Args:
        x: 2-D array with one row per sample.
        nmb_clusters: Number of centroids to fit.

    Returns:
        A tuple ``(assignments, final_loss)``: the index of the nearest
        centroid for every row (as Python ints) and the objective value of
        the last k-means iteration.
    """
    _, dim = x.shape

    clustering = faiss.Clustering(dim, nmb_clusters)
    # Draw a new seed on every call so the randomly picked initial centroids
    # do not correspond to the same feature ids from one run to the next.
    clustering.seed = np.random.randint(1234)
    clustering.niter = 20
    clustering.max_points_per_centroid = 10000000

    # Flat (exact) L2 index on GPU 0, full float32 precision.
    gpu_resources = faiss.StandardGpuResources()
    index_config = faiss.GpuIndexFlatConfig()
    index_config.useFloat16 = False
    index_config.device = 0
    gpu_index = faiss.GpuIndexFlatL2(gpu_resources, dim, index_config)

    # Fit the centroids, then look up the single nearest centroid per row.
    clustering.train(x, gpu_index)
    _, nearest = gpu_index.search(x, 1)

    stats = clustering.iteration_stats
    losses = np.array([stats.at(step).obj for step in range(stats.size())])
    print('k-means loss evolution: {0}'.format(losses))

    assignments = [int(row[0]) for row in nearest]
    return assignments, losses[-1]


def compute_features(dataloader, model, N, batch_size):
    """Run ``model`` over ``dataloader`` and stack the outputs row by row.

    Args:
        dataloader: Iterable yielding ``(inputs, _)`` batches; the second
            element is discarded. Row ``i`` of the result is the ``i``-th
            sample produced by the loader, so the loader must not shuffle if
            rows are to be matched back to dataset indexes.
        model: Module mapping a batch of inputs to a batch of features. It is
            switched to eval mode and left in that mode.
        N: Total number of samples the loader will yield.
        batch_size: Unused; kept for backward compatibility. Rows are placed
            with a running offset, so the result no longer depends on this
            value matching the loader's real batch size.

    Returns:
        ``float32`` array of shape ``(N, D)`` where ``D`` is the flattened
        per-sample output size of the model.

    Raises:
        ValueError: If the loader yields no batches, or if the number of
            samples it yields differs from ``N``.
    """
    model.eval()

    # Send inputs to wherever the model lives; for a parameterless model keep
    # the historical behaviour of using CUDA whenever it is available.
    parameters = getattr(model, 'parameters', None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    features = None
    offset = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for inputs, _ in dataloader:
            outputs = model(inputs.to(device)).detach().cpu().numpy()
            # One flattened row per sample; the width is inferred rather than
            # hard-coded to 1280.
            batch = outputs.reshape(outputs.shape[0], -1).astype('float32')

            if features is None:
                features = np.zeros((N, batch.shape[1]), dtype='float32')

            end = offset + batch.shape[0]
            if end > N:
                raise ValueError(
                    'dataloader yielded more than N={0} samples'.format(N))
            features[offset:end] = batch
            offset = end

    if features is None:
        raise ValueError('dataloader yielded no batches')
    if offset != N:
        raise ValueError(
            'dataloader yielded {0} samples but N={1}'.format(offset, N))

    return features


class Kmeans(object):
    """k-means wrapper that remembers which samples landed in each cluster."""

    def __init__(self, k):
        # Number of clusters to fit.
        self.k = k

    def cluster(self, data, pca):
        """Preprocess ``data``, run k-means and record the assignments.

        Args:
            data: Feature matrix with one row per sample.
            pca: Output dimensionality passed to ``preprocess_features``.

        Returns:
            A tuple ``(loss, label)``: the final k-means loss and a CUDA
            tensor holding the cluster id of every sample.

        Side effects:
            Sets ``self.images_lists`` to ``k`` lists of sample indexes and
            prints the label tensor and the elapsed time.
        """
        start = time.time()

        # PCA-reducing, whitening and L2-normalization
        xb = preprocess_features(data, pca)

        # cluster the data
        assignments, loss = run_kmeans(xb, self.k)

        self.images_lists = [[] for _ in range(self.k)]
        label = []
        for sample_idx in range(len(data)):
            cluster_id = assignments[sample_idx]
            label.append(cluster_id)
            self.images_lists[cluster_id].append(sample_idx)

        label = torch.tensor(label).cuda()
        print(label)
        print('k-means time: {0:.0f} s'.format(time.time() - start))

        return loss, label

In [45]:
## label0 nature
# Load the label-0 network, which was saved as a whole pickled module.
convnet = torch.load('/home/haoying/res_zl12_effnet_b0_9.7km/label0_pretrained.pt')
# Replace `_fc` and `_swish` with identity mappings (presumably so the forward
# pass returns features instead of class scores - confirm against the model).
convnet._fc = nn.Identity()
convnet._swish = nn.Identity()
# Alternative kept for reference:
# model = nn.Sequential(*(list(model.children())[:-3])) # strips off last linear layer
convnet = torch.nn.DataParallel(convnet).cuda()

# Deterministic preprocessing used while extracting features.
cluster_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [46]:
# Index the label-0 images and extract one feature row per image.
clusterset = GPSDataset(
    '/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled0.csv',
    '/home/haoying/data_zl12/',
    transform1=cluster_transform,
)
# shuffle=False keeps the feature rows in the same order as the CSV rows.
clusterloader = torch.utils.data.DataLoader(
    clusterset, batch_size=10, shuffle=False, num_workers=0)
deepcluster = Kmeans(5)
features = compute_features(
    clusterloader, convnet, len(clusterset), clusterloader.batch_size)
features.shape

(7794, 1280)

In [47]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps 80% of the variance; the second dimension of the
# displayed shape is the number of components that this required.
X_ = features
pca = PCA(n_components=0.80).fit(X_)
reduced_X = pca.transform(X_)
reduced_X.shape

(7794, 29)

In [48]:
# Cluster the label-0 features. pca=29 is the component count reported by the
# 80%-variance scikit-learn PCA run on these features (shape (7794, 29)).
clustering_loss, p_label = deepcluster.cluster(features, pca=29)
labels = p_label.tolist()

# Re-read the image names (first CSV column) in file order. The `with` block
# closes the file even if parsing fails; blank rows are skipped so a stray
# empty line cannot raise IndexError.
with open('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled0.csv', 'r',
          encoding='utf-8', newline='') as f:
    rdr = csv.reader(f)
    next(rdr, None)  # drop the header row
    images = [line[0] for line in rdr if line]

# Names and labels are paired by position, so a length mismatch would mislabel
# images silently - fail loudly instead.
if len(images) != len(labels):
    raise ValueError('{0} image names but {1} cluster labels'.format(
        len(images), len(labels)))

# Nature clusters keep their raw ids (no offset).
nature_cluster = [[name, label] for name, label in zip(images, labels)]

k-means loss evolution: [3811.59912109 2174.92114258 2114.59790039 2084.4362793  2063.80126953
 2054.59082031 2050.62060547 2047.18481445 2043.0802002  2039.55114746
 2035.70031738 2030.75158691 2025.87719727 2022.05273438 2019.81616211
 2018.20996094 2016.88012695 2016.07922363 2015.67749023 2015.49743652]
tensor([2, 2, 1,  ..., 2, 3, 4], device='cuda:0')
k-means time: 1 s


In [74]:
## label1 rural
# Load the label-1 network, which was saved as a whole pickled module.
convnet = torch.load('/home/haoying/res_zl12_effnet_b0_9.7km/label1_pretrained.pt')
# Replace `_fc` and `_swish` with identity mappings (presumably so the forward
# pass returns features instead of class scores - confirm against the model).
convnet._fc = nn.Identity()
convnet._swish = nn.Identity()
# Alternative kept for reference:
# model = nn.Sequential(*(list(model.children())[:-3])) # strips off last linear layer
convnet = torch.nn.DataParallel(convnet).cuda()

# Deterministic preprocessing used while extracting features.
cluster_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [75]:
# Index the label-1 images and extract one feature row per image.
clusterset = GPSDataset(
    '/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled1.csv',
    '/home/haoying/data_zl12/',
    transform1=cluster_transform,
)
# shuffle=False keeps the feature rows in the same order as the CSV rows.
clusterloader = torch.utils.data.DataLoader(
    clusterset, batch_size=10, shuffle=False, num_workers=0)
deepcluster = Kmeans(5)
features = compute_features(
    clusterloader, convnet, len(clusterset), clusterloader.batch_size)
features.shape

(21672, 1280)

In [76]:
# Fit a PCA that keeps 80% of the variance; the second dimension of the
# displayed shape is the number of components that this required.
X_ = features
pca = PCA(n_components=0.80).fit(X_)
reduced_X = pca.transform(X_)
reduced_X.shape

(21672, 12)

In [77]:
# Cluster the label-1 features. pca=12 is the component count reported by the
# 80%-variance scikit-learn PCA run on these features (shape (21672, 12)).
clustering_loss, p_label = deepcluster.cluster(features, pca=12)
labels = p_label.tolist()

# Re-read the image names (first CSV column) in file order. The `with` block
# closes the file even if parsing fails; blank rows are skipped so a stray
# empty line cannot raise IndexError.
with open('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled1.csv', 'r',
          encoding='utf-8', newline='') as f:
    rdr = csv.reader(f)
    next(rdr, None)  # drop the header row
    images = [line[0] for line in rdr if line]

# Names and labels are paired by position, so a length mismatch would mislabel
# images silently - fail loudly instead.
if len(images) != len(labels):
    raise ValueError('{0} image names but {1} cluster labels'.format(
        len(images), len(labels)))

# Offset by 5 so these ids do not collide with the ids used for the nature
# clusters in the combined table.
rurul_cluster = [[name, label + 5] for name, label in zip(images, labels)]

k-means loss evolution: [14894.32519531  9277.61425781  8862.14355469  8615.33496094
  8441.54492188  8320.85546875  8249.43652344  8207.28320312
  8176.56982422  8156.80322266  8146.30566406  8140.29882812
  8136.93554688  8134.04736328  8130.34667969  8123.66699219
  8109.65625     8078.0859375   8038.23876953  8013.61132812]
tensor([0, 2, 0,  ..., 0, 1, 0], device='cuda:0')
k-means time: 1 s


In [78]:
## label2 city
# Load the label-2 network, which was saved as a whole pickled module.
convnet = torch.load('/home/haoying/res_zl12_effnet_b0_9.7km/label2_pretrained.pt')
# Replace `_fc` and `_swish` with identity mappings (presumably so the forward
# pass returns features instead of class scores - confirm against the model).
convnet._fc = nn.Identity()
convnet._swish = nn.Identity()
# Alternative kept for reference:
# model = nn.Sequential(*(list(model.children())[:-3])) # strips off last linear layer
convnet = torch.nn.DataParallel(convnet).cuda()

# Deterministic preprocessing used while extracting features.
cluster_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [79]:
# Index the label-2 images and extract one feature row per image.
clusterset = GPSDataset(
    '/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled2.csv',
    '/home/haoying/data_zl12/',
    transform1=cluster_transform,
)
# shuffle=False keeps the feature rows in the same order as the CSV rows.
clusterloader = torch.utils.data.DataLoader(
    clusterset, batch_size=10, shuffle=False, num_workers=0)
deepcluster = Kmeans(3)
features = compute_features(
    clusterloader, convnet, len(clusterset), clusterloader.batch_size)
features.shape

(9500, 1280)

In [80]:
# Fit a PCA that keeps 80% of the variance; the second dimension of the
# displayed shape is the number of components that this required.
X_ = features
pca = PCA(n_components=0.80).fit(X_)
reduced_X = pca.transform(X_)
reduced_X.shape

(9500, 37)

In [81]:
# Cluster the label-2 features. pca=37 is the component count reported by the
# 80%-variance scikit-learn PCA run on these features (shape (9500, 37)).
clustering_loss, p_label = deepcluster.cluster(features, pca=37)
labels = p_label.tolist()

# Re-read the image names (first CSV column) in file order. The `with` block
# closes the file even if parsing fails; blank rows are skipped so a stray
# empty line cannot raise IndexError.
with open('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled2.csv', 'r',
          encoding='utf-8', newline='') as f:
    rdr = csv.reader(f)
    next(rdr, None)  # drop the header row
    images = [line[0] for line in rdr if line]

# Names and labels are paired by position, so a length mismatch would mislabel
# images silently - fail loudly instead.
if len(images) != len(labels):
    raise ValueError('{0} image names but {1} cluster labels'.format(
        len(images), len(labels)))

# Offset by 10 so these ids do not collide with the ids used for the nature
# and rural clusters in the combined table.
city_cluster = [[name, label + 10] for name, label in zip(images, labels)]

k-means loss evolution: [8856.29003906 5882.54736328 5577.25244141 5484.26269531 5430.47412109
 5407.3671875  5401.48046875 5399.93017578 5399.75976562 5399.71289062
 5399.70361328 5399.70507812 5399.70507812 5399.70507812 5399.70507812
 5399.70507812 5399.70507812 5399.70507812 5399.70507812 5399.70507812]
tensor([1, 1, 2,  ..., 1, 1, 2], device='cuda:0')
k-means time: 1 s


In [3]:
# def extract_city_cluster():
#     convnet = torch.load('/home/haoying/res_zl12_effnet_b0_9.7km/label2_pretrained.pt')
#     convnet = torch.nn.DataParallel(convnet)    
#     convnet.cuda()
#     cluster_transform =transforms.Compose([
#                       transforms.Resize(256),
#                       transforms.CenterCrop(224),
#                       transforms.ToTensor(),
#                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])    
    
#     clusterset = GPSDataset('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled2.csv', '/home/haoying/data_zl12/', cluster_transform)
#     clusterloader = torch.utils.data.DataLoader(clusterset, batch_size=10, shuffle=False, num_workers=0)
    
#     deepcluster = Kmeans(5)
#     features = compute_features(clusterloader, convnet, len(clusterset), 10) 
#     clustering_loss, p_label = deepcluster.cluster(features,pca=11)
#     labels = p_label.tolist()
#     f = open('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled2.csv', 'r', encoding='utf-8')
#     images = []
#     rdr = csv.reader(f)
#     for line in rdr:
#         images.append(line[0])
#     f.close()
#     images.pop(0)    
#     city_cluster = []
#     for i in range(0, len(images)):
#         city_cluster.append([images[i], labels[i]]) 
        
#     return city_cluster

# def extract_rural_cluster():
#     convnet = torch.load('/home/haoying/res_zl12_effnet_b0_9.7km/label1_pretrained.pt')
#     convnet = torch.nn.DataParallel(convnet)    
#     convnet.cuda()
#     cluster_transform =transforms.Compose([
#                       transforms.Resize(256),
#                       transforms.CenterCrop(224),
#                       transforms.ToTensor(),
#                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])    
    
#     clusterset = GPSDataset('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled1.csv', '/home/haoying/data_zl12/', cluster_transform)
#     clusterloader = torch.utils.data.DataLoader(clusterset, batch_size=10, shuffle=False, num_workers=0)
    
#     deepcluster = Kmeans(5)
#     features = compute_features(clusterloader, convnet, len(clusterset), 10) 
#     clustering_loss, p_label = deepcluster.cluster(features,pca=6)
#     labels = p_label.tolist()
#     f = open('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled1.csv', 'r', encoding='utf-8')
#     images = []
#     rdr = csv.reader(f)
#     for line in rdr:
#         images.append(line[0])
#     f.close()
#     images.pop(0)    
#     rural_cluster = []
#     for i in range(0, len(images)):
#         rural_cluster.append([images[i], labels[i] + 5])
        
#     return rural_cluster

# def extract_nature_cluster():
#     convnet = torch.load('/home/haoying/res_zl12_effnet_b0_9.7km/label0_pretrained.pt')
#     convnet = torch.nn.DataParallel(convnet)    
#     convnet.cuda()
#     cluster_transform =transforms.Compose([
#                       transforms.Resize(256),
#                       transforms.CenterCrop(224),
#                       transforms.ToTensor(),
#                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])    
    
#     clusterset = GPSDataset('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled0.csv', '/home/haoying/data_zl12/', cluster_transform)
#     clusterloader = torch.utils.data.DataLoader(clusterset, batch_size=10, shuffle=False, num_workers=0)
    
#     deepcluster = Kmeans(8)
#     features = compute_features(clusterloader, convnet, len(clusterset), 10) 
#     clustering_loss, p_label = deepcluster.cluster(features,pca=10)
#     labels = p_label.tolist()
#     f = open('/home/haoying/res_zl12_effnet_b0_9.7km/nightlights_labeled0.csv', 'r', encoding='utf-8')
#     images = []
#     rdr = csv.reader(f)
#     for line in rdr:
#         images.append(line[0])
#     f.close()
#     images.pop(0)    
#     nature_cluster = []
#     for i in range(0, len(images)):
#         nature_cluster.append([images[i], labels[i] + 10])
        
#     return nature_cluster

In [63]:
# city_cluster = extract_city_cluster()
# rural_cluster = extract_rural_cluster()
# nature_cluster = extract_nature_cluster()

In [83]:
total_cluster = city_cluster + rurul_cluster + nature_cluster
# Total number of cluster ids across the three groups: 5 + 5 + 3.
cnum = 13
cluster_dir = '/home/haoying/res_zl12_effnet_b0_9.7km/data/'

# One directory per cluster id 0 .. cnum-1. The previous range(0, cnum + 1)
# also created an extra, never-used directory `13`, which disagreed with the
# per-cluster CSV writer that iterates range(0, cnum).
# exist_ok=True makes the cell safe to re-run; makedirs also creates
# `cluster_dir` itself when it is missing.
for i in range(cnum):
    os.makedirs(os.path.join(cluster_dir, str(i)), exist_ok=True)

In [84]:
# Combine the three per-group [image, cluster_id] lists into a single table.
total_cluster = [*city_cluster, *rurul_cluster, *nature_cluster]
df = pd.DataFrame(data=total_cluster, columns=['y_x', 'cluster_id'])

In [85]:
# Number of images assigned to each cluster id.
df.groupby('cluster_id')['y_x'].agg('count')

cluster_id
0       411
1      1008
2      5423
3       539
4       413
5     11769
6      3335
7      2561
8      2205
9      1802
10     1396
11     3682
12     4422
Name: y_x, dtype: int64

In [86]:
# Write the rows of every cluster id into that cluster's own directory.
for cluster_id in range(cnum):
    members = df[df['cluster_id'] == cluster_id]
    path = '/home/haoying/res_zl12_effnet_b0_9.7km/data/' + str(cluster_id) + '/cluster.csv'
    members.to_csv(path, index=False)

In [87]:
# Save the complete image -> cluster_id table as a single file.
df.to_csv(path_or_buf='/home/haoying/res_zl12_effnet_b0_9.7km/data/unified.csv', index=False)