This script is used to generate hierarchically named tags to be used in bundled heirarchincal graph visualisation.

In [1]:
import numpy as np
import pandas as pd
import json
import feather
import pickle

In [2]:
dat = feather.read_dataframe("./label_members.feather")

In [3]:
dat.head()

Unnamed: 0,label_id,label_name,count,G0,G1,G2
0,0,Games,859519,0,0,0
1,1,Vehicle,678257,3,1,0
2,2,Video game,518981,0,0,0
3,3,Concert,494707,2,2,1
4,4,Car,371391,3,1,0


In [4]:
assoc = pickle.load(open("./co_mat.pkl", 'rb'))

In [5]:
n = 200
dense = assoc[:n, :n].todense()
diag = np.diag(dense)
total = np.repeat(np.matrix(diag), diag.shape[0], axis=0) + np.repeat(np.matrix(diag).T, diag.shape[0], axis=1)
dense = (dense - diag * np.eye(dense.shape[0], dtype="uint32")) / total
dense = (dense >= 0.1) * 1

In [6]:
dense

matrix([[0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [7]:
dat.shape

(200, 6)

In [8]:
dense.shape

(200, 200)

In [9]:
dat['G3'] = 'G3_0'
dat['G2'] = dat['G2'].apply(lambda s: 'G2_' + str(s))
dat['G1'] = dat['G1'].apply(lambda s: 'G1_' + str(s))
dat['G0'] = dat['G0'].apply(lambda s: 'G0_' + str(s))

In [10]:
dat.head()

Unnamed: 0,label_id,label_name,count,G0,G1,G2,G3
0,0,Games,859519,G0_0,G1_0,G2_0,G3_0
1,1,Vehicle,678257,G0_3,G1_1,G2_0,G3_0
2,2,Video game,518981,G0_0,G1_0,G2_0,G3_0
3,3,Concert,494707,G0_2,G1_2,G2_1,G3_0
4,4,Car,371391,G0_3,G1_1,G2_0,G3_0


In [11]:
feather.write_dataframe(dat, "label_by_group.feather")

In [25]:
fullnames = dat.ix[:, ['G3', 'G2', 'G1', 'G0', 'label_name']].apply(lambda s: ".".join(s.astype(str)), axis=1)

In [26]:
exp = pd.DataFrame({'name': fullnames, 'size':dat['count']})

In [27]:
exp.head()

Unnamed: 0,name,size
0,G3_0.G2_G2_0.G1_G1_0.G0_G0_0.Games,859519
1,G3_0.G2_G2_0.G1_G1_1.G0_G0_3.Vehicle,678257
2,G3_0.G2_G2_0.G1_G1_0.G0_G0_0.Video game,518981
3,G3_0.G2_G2_1.G1_G1_2.G0_G0_2.Concert,494707
4,G3_0.G2_G2_0.G1_G1_1.G0_G0_3.Car,371391


In [28]:
imports = []
for i in range(exp.shape[0]):
    imports += [list(exp.ix[np.ravel(dense[i, :] > 0), 'name'])]

In [29]:
exp['imports'] = imports

In [30]:
exp.head()

Unnamed: 0,name,size,imports
0,G3_0.G2_G2_0.G1_G1_0.G0_G0_0.Games,859519,"[G3_0.G2_G2_0.G1_G1_0.G0_G0_0.Video game, G3_0..."
1,G3_0.G2_G2_0.G1_G1_1.G0_G0_3.Vehicle,678257,"[G3_0.G2_G2_0.G1_G1_1.G0_G0_3.Car, G3_0.G2_G2_..."
2,G3_0.G2_G2_0.G1_G1_0.G0_G0_0.Video game,518981,"[G3_0.G2_G2_0.G1_G1_0.G0_G0_0.Games, G3_0.G2_G..."
3,G3_0.G2_G2_1.G1_G1_2.G0_G0_2.Concert,494707,"[G3_0.G2_G2_1.G1_G1_2.G0_G0_2.Musician, G3_0.G..."
4,G3_0.G2_G2_0.G1_G1_1.G0_G0_3.Car,371391,"[G3_0.G2_G2_0.G1_G1_1.G0_G0_3.Vehicle, G3_0.G2..."


In [31]:
exp.to_json("plot_cluster_0_1.json", orient="records")