Note: All code in this script is sourced from various internet sources, from mentor Tomasz Popiel, and from Alan Chalk at Sabre Insurance.

## Distance Matrix

Contents:

 - Start_: import modules, set directories, load data ('02_df_all.pickle') 
 
 -  Sample non-scaled data
 
 - Scaling (numerical and histogram variables)
 
 - Distance matrix components (Numerical, Categorical, Histogram)
 
 -  Combining into final distance matrix
 
 -  Save ('03a_distancematrix_40%_exclude_target.hkl')

Notes:

 - When more than ?% of the data is used, the kernel 'dies' (the exact threshold was not recorded; this notebook uses a 40% sample)
 - Exclude target for target analysis

### Start_:

Import modules

In [1]:
import os
import pickle
import gc
import sys

import pandas as pd
import numpy as np

from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.spatial.distance import jensenshannon

from sklearn.preprocessing import normalize

Set directories

In [2]:
# Show the working directory so the relative paths below can be sanity-checked.
print(os.getcwd())

# Project folders, all given relative to the working directory.
dirRawData, dirPData, dirPOutput = "../RawData/", "../PData/", "../POutput/"

C:\Users\hu121\Desktop\GROUP PROJECT\PCode


Load data

In [3]:
# Read the pickled bundle from the processed-data folder.
# NOTE: pickle can execute code while loading; only open files this project produced.
with open(dirPData + '02_df_all.pickle', 'rb') as f:
    dict_ = pickle.load(f)

# The bundle is a dict; the full data frame sits under 'df_all'.
df_all = dict_['df_all']
df_all.shape

(65196, 643)

### Sample non-scaled data

In [4]:
# Keep a 40% sample of the data before the columns of df_all are rescaled in
# place further down. The later downsampling step uses the same frac and
# random_state (presumably so both samples contain the same rows - confirm).
df_all_2 = df_all.sample(frac=0.4, random_state=123)

In [5]:
# Preview the first rows of the unscaled sample.
df_all_2.head()

Unnamed: 0,tq_db11_mean,tq_db12_mean,tq_db13_mean,tq_db14_mean,tq_dt1_mean,tq_dt2_mean,tq_dt3_mean,tq_dt4_mean,tq_db15_mean,tq_db16_mean,...,a_v7_8__97_431,a_v7_8__97_46,a_v7_8__97_647,a_v7_8__97_770,a_v7_8__97_875,a_v7_8__97_909,a_v7_8__99_37,a_v7_8__99_995,a_v7_8__99_996,a_v7_8_other
12437,15.9672,7.3934,1.4918,0.0164,0.0984,0.0328,0.0,0.0164,0.0984,0.0984,...,0,0,0,0,0,0,0,0,0,0
32372,16.3125,4.4375,1.1875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
26155,20.0,6.8462,1.1538,0.0769,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
50441,16.5,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
45622,19.125,9.875,1.75,0.0,0.125,0.0,0.0,0.0,0.375,0.375,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# (rows, columns) of the unscaled sample.
df_all_2.shape

(26078, 643)

### Scaling (numerical and histogram variables)

Numerical variables: normalise with sklearn's `normalize` (by default this rescales each row, i.e. each sample, to unit L2 norm)

In [7]:
# Normalise the numeric columns. sklearn's `normalize` returns a plain array
# and, with its defaults (norm='l2', axis=1), rescales each ROW to unit norm.
# NOTE(review): confirm row-wise scaling (rather than per-column scaling) is
# what is wanted before these values feed a cityblock distance.
df_scaled = normalize(df_all[dict_['vars_numeric']])

# Rebuild a frame that carries df_all's own index. Without `index=` the frame
# gets a fresh 0..n-1 index, and the column-by-column copy back into df_all
# (which aligns on index labels) would silently misalign or produce NaN
# whenever df_all is not indexed 0..n-1.
df_scaled = pd.DataFrame(
    df_scaled,
    columns=df_all[dict_['vars_numeric']].columns,
    index=df_all.index,
)

In [8]:
# Overwrite every numeric column of df_all with its normalised counterpart.
numeric_cols = dict_['vars_numeric']
for name in numeric_cols:
    df_all[name] = df_scaled[name]

Histogram variables : divide by 100 so they sum to 1

In [9]:
# Rescale each histogram column by a factor of 1/100.
for hist_col in dict_['vars_hist']:
    df_all[hist_col] = df_all[hist_col].div(100)

In [10]:
# The scaled values have been copied into df_all, so drop the temporary frame
# and ask the garbage collector to reclaim it.
del df_scaled
gc.collect()

40

### Distance matrix components

Create distance matrices for the numerical, categorical and histogram variables separately, then add them together later to get the final distance matrix

#### Set weights for numerical, categorical and histogram

In [11]:
# Relative weights of the numerical, categorical and histogram components in
# the final distance matrix. All equal to 1 for now.
weight_num = weight_cat = weight_his = 1

#### Downsample data

In [12]:
# Randomly sample 40% of the (now scaled) dataset. frac and random_state match
# the earlier unscaled sample stored in df_all_2.
df_all = df_all.sample(frac=0.4, random_state=123)

In [13]:
# Columns left out of the clustering: the id and the tq_dt1-4 mean/std columns.
vars_notToUse = [
    'id',
    'tq_dt1_mean',
    'tq_dt2_mean',
    'tq_dt3_mean',
    'tq_dt4_mean',
    'tq_dt1_std',
    'tq_dt2_std',
    'tq_dt3_std',
    'tq_dt4_std',
]

# Everything else is kept, in the original column order.
excluded = set(vars_notToUse)
vars_toUse = [name for name in df_all.columns if name not in excluded]

In [14]:
# Working frame for the distance calculations: only the columns in vars_toUse.
df_ds = df_all[vars_toUse]

In [15]:
# (rows, columns) of the frame that feeds the distance calculations.
df_ds.shape

(26078, 634)

### Numerical Variables

Using the 'pdist' function and distance metric "cityblock"   

Considering different types of numerical variables separately (mean, std, mode, NMiss and tq_v3, a_v6, a_v9, a_v10, a_v11, a_v12)

In [16]:
# Column-name endings of the "mean" numeric variables. Despite the "prefix"
# name these are suffixes: they are matched with str.endswith.
vars_num_prefix_1 = ('_mean','_Mean')

In [17]:
# Column-name endings of the standard-deviation numeric variables (suffixes,
# matched with str.endswith).
vars_num_prefix_2 = ('_std','_StdDev')

In [18]:
# Column-name endings of the mode and missing-count variables (suffixes). Each
# entry becomes its own group of numeric columns.
vars_num_prefix_3 = ('_Mode', '_NMiss')

In [19]:
# Not prefixes but complete column names: this list is used as-is as the final
# group of numeric columns.
vars_num_prefix_4 = ['tq_v3','a_v6', 'a_v9', 'a_v10', 'a_v11', 'a_v12']

In [20]:
# Split the numeric columns into groups, leaving the target (tq_dt1-4) columns
# out of the mean and std groups.
target_stems = ('tq_dt1', 'tq_dt2', 'tq_dt3', 'tq_dt4')
non_target_numeric = [
    name for name in dict_['vars_numeric'] if not name.startswith(target_stems)
]

# Groups 1 and 2: mean-type and std-type columns.
vars_num_separate = [
    [name for name in non_target_numeric if name.endswith(vars_num_prefix_1)],
    [name for name in non_target_numeric if name.endswith(vars_num_prefix_2)],
]

# One further group per suffix in vars_num_prefix_3 (no target filter here).
vars_num_separate.extend(
    [name for name in dict_['vars_numeric'] if name.endswith(suffix)]
    for suffix in vars_num_prefix_3
)

# Last group: the explicitly listed columns.
vars_num_separate.append(vars_num_prefix_4)

In [21]:
# We have a total of 5 groups of numerical variables: two built from the
# mean/std suffix tuples, one per suffix in vars_num_prefix_3, and the explicit
# column list.
len(vars_num_separate)

5

Combine the distance matrices of the 5 types of numerical variables into one distance matrix 'num_dist_mat', applying a weight to each. All weights are currently 1 (i.e. unweighted); this can be adjusted.

In [22]:
# Accumulate one weighted cityblock distance vector per numeric group into
# num_dist_mat (pdist's condensed 1-D form).
num_dist_mat = None
# One weight per group; all 1 for now. Replace with an explicit list to re-weight.
weights = [1] * len(vars_num_separate)

for idx, number in enumerate(vars_num_separate):
    each_distance_matrix = pdist(df_ds[number].values, metric='cityblock')
    print(each_distance_matrix)
    #print(sys.getsizeof(each_distance_matrix)/1e9)
    # Scale and add in place. The previous np.sum([a, b], axis=0) first stacked
    # both condensed matrices into a new 2-row array, multiplying peak memory
    # for no benefit; the element-wise result is the same.
    each_distance_matrix *= weights[idx]
    if num_dist_mat is None:
        num_dist_mat = each_distance_matrix
    else:
        num_dist_mat += each_distance_matrix
    del each_distance_matrix
    gc.collect()

[0.57837191 0.68884797 0.96088635 ... 2.8401851  2.45359398 2.06431128]
[0.29191958 0.31139068 1.23138086 ... 1.09042186 1.33707156 0.27702029]
[2.30695339 2.09398696 1.39625035 ... 2.69667838 2.41657921 2.45853488]
[0.21567934 0.30144201 0.3539212  ... 1.86786511 1.94797556 0.08143097]
[0.30694788 0.16572437 0.20764071 ... 0.5990497  0.58536194 0.24742121]


In [23]:
# Display the combined numeric distances (1-D array as returned by pdist).
num_dist_mat

array([3.69987209, 3.56139199, 4.15007947, ..., 9.09420014, 8.74058224,
       5.12871862])

### Categorical Variables

Using the 'pdist' function and distance metric "hamming"   

Considering different types of categorical variables separately ('a_v1','a_v2','a_v3','a_v4','a_v5','a_v7_8')

In [24]:
# Column-name prefixes of the categorical variable groups (matched with
# str.startswith).
# NOTE(review): 'a_v1' would also match names such as 'a_v10'/'a_v11'/'a_v12';
# those appear in the numeric list above, so presumably they are not in
# dict_['vars_cat'] - confirm.
vars_cat_prefix = ('a_v1','a_v2','a_v3','a_v4','a_v5','a_v7_8')

In [25]:
# One list of categorical columns per prefix, in prefix order.
vars_cat_separate = [
    [name for name in dict_['vars_cat'] if name.startswith(stem)]
    for stem in vars_cat_prefix
]

In [26]:
# We have a total of 6 categorical groups, one per entry of vars_cat_prefix.
len(vars_cat_separate)

6

Combine the distance matrices of the 6 types of categorical variables into one distance matrix 'cat_dist_mat', applying a weight to each. All weights are currently 1 (i.e. unweighted); this can be adjusted.

In [27]:
# Accumulate one weighted hamming distance vector per categorical group into
# cat_dist_mat (pdist's condensed 1-D form).
cat_dist_mat = None
# One weight per group; all 1 for now. Replace with an explicit list to re-weight.
weights = [1] * len(vars_cat_separate)

for idx, category in enumerate(vars_cat_separate):
    #print(idx)
    each_distance_matrix = pdist(df_ds[category].values, metric='hamming')
    print(each_distance_matrix)
    # Scale and add in place. The previous np.sum([a, b], axis=0) first stacked
    # both condensed matrices into a new 2-row array, multiplying peak memory
    # for no benefit; the element-wise result is the same.
    each_distance_matrix *= weights[idx]
    if cat_dist_mat is None:
        cat_dist_mat = each_distance_matrix
    else:
        cat_dist_mat += each_distance_matrix
    del each_distance_matrix
    gc.collect()

[0.         0.         0.         ... 0.66666667 0.         0.66666667]
[0.         0.66666667 0.         ... 0.         0.66666667 0.66666667]
[0.66666667 0.         0.66666667 ... 0.         0.66666667 0.66666667]
[0.         0.28571429 0.28571429 ... 0.         0.28571429 0.28571429]
[0.4 0.4 0.4 ... 0.4 0.4 0. ]
[0.00692042 0.00692042 0.00692042 ... 0.00692042 0.00692042 0.00692042]


In [28]:
# Display the combined categorical distances (1-D array as returned by pdist).
cat_dist_mat

array([1.07358708, 1.35930137, 1.35930137, ..., 1.07358708, 2.02596803,
       2.2926347 ])

#### Combine num_dist_mat and cat_dist_mat, weights 1
Combine num_dist_mat and cat_dist_mat to avoid storing both

In [29]:
# Mean pairwise distance of each component; used below to bring the
# categorical term onto the numeric term's scale.
a = num_dist_mat.mean()
b = cat_dist_mat.mean()
print((a, b))

(4.935663857783519, 1.35038143386032)


In [30]:
print(num_dist_mat)
print(cat_dist_mat)
# Start from the weighted numeric distances. This is a new array, so
# num_dist_mat itself is left untouched.
dist_mat = weight_num * num_dist_mat
# Add the categorical distances, rescaled by a / b so their mean matches the
# numeric mean. Adding in place avoids the stacked temporary that
# np.sum([x, y], axis=0) builds; the element-wise result is the same.
dist_mat += (weight_cat * a / b) * cat_dist_mat
print(dist_mat)

[3.69987209 3.56139199 4.15007947 ... 9.09420014 8.74058224 5.12871862]
[1.07358708 1.35930137 1.35930137 ... 1.07358708 2.02596803 2.2926347 ]
[ 7.62384855  8.52965834  9.11834582 ... 13.0181766  16.14552499
 13.50833193]


In [31]:
# After rescaling by a / b, the categorical term has (up to rounding) the same
# mean as the numeric term.
(weight_cat * a / b * cat_dist_mat).mean()

4.935663857783513

In [32]:
# Mean of the weighted numeric term, for comparison with the rescaled
# categorical term.
(weight_num * num_dist_mat).mean()

4.935663857783519

In [33]:
# Both components are already folded into dist_mat, so release them before the
# histogram distances are computed.
del num_dist_mat, cat_dist_mat
gc.collect()

120

### Histogram Variables

Using the 'pdist' function and distance metric "jensenshannon"   

Considering different histogram variables separately

In [34]:
# Column-name prefixes of the histogram variables; each prefix selects the
# columns belonging to one histogram.
vars_hist_prefix = (
    'tq_db24', 'tq_db25', 'tq_db27', 'tq_db28', 'tq_db29',
    'tq_db30', 'tq_db31', 'tq_v4', 'tq_v5', 'tq_db32',
    'tq_db33', 'tq_db34', 'tq_da12', 'tq_da13', 'tq_da14',
    'tq_db35', 'tq_db36', 'tq_db37', 'tq_db38', 'tq_db39',
    'tq_db40', 'tq_db41', 'tq_db42', 'tq_db43', 'tq_db45',
    'tq_db46', 'tq_db47', 'tq_db48', 'tq_db49', 'tq_db50',
)

In [35]:
# One list of histogram columns per prefix, in prefix order.
vars_hist_separate = [
    [name for name in dict_['vars_hist'] if name.startswith(stem)]
    for stem in vars_hist_prefix
]

In [36]:
# We have a total of 30 histograms, one per entry of vars_hist_prefix.
len(vars_hist_separate)

30

Combine the distance matrices of the 30 histogram variables into one distance matrix 'hist_dist_mat', applying a weight to each. All weights are currently 1 (i.e. unweighted); this can be adjusted.

In [37]:
# Accumulate one weighted Jensen-Shannon distance vector per histogram into
# hist_dist_mat (pdist's condensed 1-D form).
# NOTE(review): a histogram row that is all zeros would presumably produce NaN
# distances - confirm none exist in the data.
hist_dist_mat = None
# One weight per histogram; all 1 for now. Replace with an explicit list to re-weight.
weights = [1] * len(vars_hist_separate)

for idx, histogram in enumerate(vars_hist_separate):
    #print(idx)
    each_distance_matrix = pdist(df_ds[histogram].values, metric='jensenshannon')
    print(each_distance_matrix)
    # Scale and add in place. The previous np.sum([a, b], axis=0) first stacked
    # both condensed matrices into a new 2-row array, multiplying peak memory
    # for no benefit; the element-wise result is the same.
    each_distance_matrix *= weights[idx]
    if hist_dist_mat is None:
        hist_dist_mat = each_distance_matrix
    else:
        hist_dist_mat += each_distance_matrix
    del each_distance_matrix
    gc.collect()

[0.32110426 0.02630462 0.7266391  ... 0.09179598 0.59808206 0.67061198]
[0.         0.         0.         ... 0.04931695 0.04931695 0.        ]
[0.17577947 0.10663291 0.42821022 ... 0.2586753  0.2586753  0.        ]
[0.11384739 0.07656987 0.36037072 ... 0.24880716 0.22703097 0.43901618]
[0.30412519 0.29689756 0.44052665 ... 0.46797386 0.3820038  0.65037642]
[0.65673232 0.64414187 0.78745939 ... 0.69200416 0.65425322 0.76307865]
[0.2189326  0.25761886 0.54726255 ... 0.18085451 0.48195524 0.55821913]
[0.30858726 0.30550598 0.72655756 ... 0.55198623 0.78528805 0.69115949]
[0.07468223 0.10663291 0.07468223 ... 0.03226217 0.03226217 0.        ]
[0.05780056 0.15466943 0.48424546 ... 0.5852847  0.68530015 0.79189789]
[0.10758877 0.06914959 0.10758877 ... 0.13305994 0.12591579 0.17637815]
[0.10758877 0.06914959 0.10758877 ... 0.15203749 0.13142379 0.14322511]
[0.49549307 0.5092181  0.70161271 ... 0.49438697 0.75834965 0.67061198]
[0.25468087 0.07946667 0.74024851 ... 0.06622751 0.56127941 0.50

In [38]:
# Display the combined histogram distances (1-D array as returned by pdist).
hist_dist_mat

array([ 7.11066234,  4.95312266, 11.57415155, ...,  9.48103102,
       11.7086109 , 12.95463409])

### Combining into final distance matrix
Combining the numerical, categorical and histogram distance matrices into one distance matrix D_3 (apply weights above)

In [39]:
# c is the mean histogram distance; a and b are the numeric and categorical
# means computed earlier. Printed together for comparison.
c = hist_dist_mat.mean()
print((a, b, c))

(4.935663857783519, 1.35038143386032, 8.00606922742677)


In [40]:
# Add the histogram distances, rescaled by a / c so their mean matches the
# numeric mean. Adding in place avoids the stacked temporary that
# np.sum([x, y], axis=0) builds; the element-wise result is the same.
dist_mat += (weight_his * a / c) * hist_dist_mat

In [41]:
# After rescaling by a / c, the histogram term has (up to rounding) the same
# mean as the numeric term.
(weight_his * a / c * hist_dist_mat).mean()

4.935663857783545

In [42]:
# The histogram component is now part of dist_mat, so release it.
del hist_dist_mat
gc.collect()

100

In [43]:
# Rename the final combined distances to D_3. Only the name dist_mat is
# removed; the array itself lives on under D_3.
D_3 = dist_mat
del dist_mat
gc.collect()

20

### Save

In [44]:
import hickle as hkl

# Add the results to the bundle loaded at the start.
dict_['df_all_2'] = df_all_2  # 40% of non scaled dataset
dict_['df_all'] = df_all  # 40% of scaled dataset
dict_['D_3'] = D_3 # a condensed distance matrix

fname = dirPData + '03a_distancematrix_40%_exclude_target.hkl'

# hickle writes an HDF5 file, so give it the path and mode directly instead of
# a handle opened in text mode with open(fname, 'w'). How hickle treats a
# text-mode handle may depend on the installed version (TODO confirm); a path
# is always accepted. mode='w' overwrites any existing file.
hkl.dump(dict_, fname, mode='w')

del dict_, fname, df_all

