<a href="https://colab.research.google.com/github/Hackathorn/CVA-SBERT/blob/main/notebooks/CVA-SBERT-SDIclustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analyzes cluster similarity within Source-Definition-Item hierarchy

1. Set up the analysis environment and load dataframes
1. Create working df as 3-level grouping using only Target=1
1. Compute counts/mean/var aggregations for top levels
1. Analyze those cluster similarity aggregations

# Setup Environment

## Set Notebook Parameters

In [1]:
# Data source switch:
#   True  -> read train_data, etc. that you previously saved to your gDrive
#   False -> read the pre-generated data from the repository (default)
USE_GDRIVE = False

# Set to True to save the analysis plots (useful if you customized them)
SAVE_PLOT = False

## Import various packages


In [2]:
import pandas as pd
import numpy as np

import os.path
from os import path
from time import strftime, localtime
from google.colab import drive

## Clone CVA-SBERT GitHub or mount gDrive

In [3]:
if USE_GDRIVE:
    drive.mount('/content/drive')               # mount YOUR gDrive

    # Path to data -- change for YOUR specific Analysis folder
    path = '/content/drive/MyDrive/CVA-SBERT/Analysis-20221203-190207' ### CHANGE!!!

else:
    !git clone https://github.com/Hackathorn/CVA-SBERT  # clone repos

    # Path to data in repository
    path = '/content/CVA-SBERT/data/SetUp_Data'

path

Cloning into 'CVA-SBERT'...
remote: Enumerating objects: 391, done.[K
remote: Counting objects: 100% (229/229), done.[K
remote: Compressing objects: 100% (108/108), done.[K
remote: Total 391 (delta 150), reused 188 (delta 121), pack-reused 162[K
Receiving objects: 100% (391/391), 115.89 MiB | 13.98 MiB/s, done.
Resolving deltas: 100% (251/251), done.
Checking out files: 100% (22/22), done.


'/content/CVA-SBERT/data/SetUp_Data'

Load the dataframes and create the working dataframe, named simply `df`

In [4]:
# Read the two dataframes that the SetUp notebook pickled
CVA_df = pd.read_pickle(path + '/CVA_df.pkl')
token_df = pd.read_pickle(path + '/token_df.pkl')

# Build the working frame in a single chain:
#   1. keep only the 'good' rows (Target == 1)
#   2. drop the columns this analysis does not need
#   3. rename the remaining columns to short, consistent names
df = (
    CVA_df[CVA_df['Target'] == 1]
    .drop(columns=['Target', 'Definition', 'Item', 'is_train'])
    .rename(columns={
        "Source": "S", "Def_token": "D", "Item_token": 'I',
        "Cos_Sim": "Csim", "Euc_Sim": "Esim",
    })
)

df

Unnamed: 0,Index,S,D,I,Csim,Esim
0,0,2978,7060,2240,0.185577,1.276263
6,6,3169,5361,5119,0.414065,1.082529
8,8,2367,9846,4760,0.253170,1.222154
13,13,12426,9358,7035,0.488197,1.011734
18,18,13903,7165,4199,0.240013,1.232872
...,...,...,...,...,...,...
28069,28069,1915,2576,2574,0.624945,0.866089
28071,28071,12822,2404,11294,0.169479,1.288814
28072,28072,3350,6839,8420,0.583409,0.912789
28074,28074,2361,6453,10551,0.383094,1.110771


RESULTS...
- Note that only about half of the rows remain after eliminating the Target=0 Def-Items
- Nice and compact

# Group as 3-level S-D-I hierarchy

## Cluster items by Source and Definition

In [22]:
# Cluster the rows by their (Source, Definition) pair; as_index=False keeps
# S and D as ordinary columns in the aggregated result.
SD_clus = df.groupby(by=['S', 'D'], as_index=False)

# Number of items (rows) in each S-D cluster
SD_clus['I'].size()

Unnamed: 0,S,D,size
0,317,1976,18
1,317,7325,18
2,353,9148,1
3,354,130,4
4,354,6809,20
...,...,...,...
2892,14032,10876,3
2893,14038,6622,4
2894,14038,8978,3
2895,14046,8673,2


In [15]:
# group Source/Definition items into cluster
SD_clus = df.groupby(['S', 'D'], as_index=False)

# Summary statistics of both similarity measures for every S-D cluster.
# The columns are selected with a LIST: the tuple form SD_clus['Csim', 'Esim']
# is deprecated in pandas 1.x and raises an error in pandas 2.x.
# percentiles=[] drops the default 25%/75% rows, leaving only the median (50%).
SD_clus[['Csim', 'Esim']].describe(percentiles=[])

  SD_clus['Csim', 'Esim'].describe(percentiles=[])


Unnamed: 0_level_0,Csim,Csim,Csim,Csim,Csim,Csim,Esim,Esim,Esim,Esim,Esim,Esim
Unnamed: 0_level_1,count,mean,std,min,50%,max,count,mean,std,min,50%,max
0,18.0,0.246400,0.115493,0.098793,0.222941,0.522491,18.0,1.223931,0.098666,0.977250,1.246642,1.342540
1,18.0,0.324193,0.109385,0.150702,0.323953,0.572357,18.0,1.158767,0.096931,0.924817,1.162717,1.303302
2,1.0,0.322452,,0.322452,0.322452,0.322452,1.0,1.164086,,1.164086,1.164086,1.164086
3,4.0,0.076997,0.105993,-0.014949,0.046689,0.229559,4.0,1.356913,0.079947,1.241323,1.380792,1.424745
4,20.0,0.183167,0.075195,0.065572,0.187732,0.316475,20.0,1.276854,0.059032,1.169209,1.274555,1.367061
...,...,...,...,...,...,...,...,...,...,...,...,...
2892,3.0,0.156425,0.142639,0.009886,0.164572,0.294816,3.0,1.295804,0.109843,1.187589,1.292616,1.407205
2893,4.0,0.578196,0.083418,0.475173,0.581915,0.673783,4.0,0.915098,0.090952,0.807734,0.914065,1.024527
2894,3.0,0.658978,0.090310,0.590200,0.625485,0.761249,3.0,0.820599,0.113979,0.691015,0.865465,0.905318
2895,2.0,0.305916,0.104846,0.231778,0.305916,0.380053,2.0,1.176519,0.089116,1.113505,1.176519,1.239534


In [10]:
# group Source/Definition items into cluster
SD_clus = df.groupby(['S', 'D'], as_index=False)

# Mean cosine and Euclidean similarity for every S-D cluster.
# The columns are selected with a LIST: the tuple form SD_clus['Csim', 'Esim']
# is deprecated in pandas 1.x and raises an error in pandas 2.x.
SD_clus[['Csim', 'Esim']].mean()

  SD_clus['Csim', 'Esim'].mean()


Unnamed: 0,S,D,Csim,Esim
0,317,1976,0.246400,1.223931
1,317,7325,0.324193,1.158767
2,353,9148,0.322452,1.164086
3,354,130,0.076997,1.356913
4,354,6809,0.183167,1.276854
...,...,...,...,...
2892,14032,10876,0.156425,1.295804
2893,14038,6622,0.578196,0.915098
2894,14038,8978,0.658978,0.820599
2895,14046,8673,0.305916,1.176519


RESULTS...
- TODO: summarize the insights from the cluster aggregations above

# Save analysis results to your gDrive - OPTIONAL

Mount gDrive and create timestamped Experiment Folder

In [None]:
drive.mount('/content/drive')   # ignore warning if already mounted

BASE_PATH = '/content/drive/MyDrive/CVA-SBERT/'
# Timestamped name, so every run gets its own experiment folder
EXP_PATH = BASE_PATH + 'Analysis-' + strftime("%Y%m%d-%H%M%S", localtime())

# Create the experiment folder (and BASE_PATH, if it is missing) in one call.
# Do NOT use `path.exists(...)` here: the clone/mount cell rebinds the name
# `path` to the data-directory string, so `path.exists` would raise
# AttributeError. exist_ok=True makes the cell safe to re-run.
os.makedirs(EXP_PATH, exist_ok=True)

Save dataframes or other results to Experiment Folder

In [None]:
# Persist the two initial dataframes into the experiment folder
for _frame, _suffix in ((CVA_df, '/CVA_df.pkl'), (token_df, '/token_df.pkl')):
    _frame.to_pickle(EXP_PATH + _suffix)

# ...or other saving of other results, like plots
#