In [6]:
import pandas as pd
import pickle
from scipy import spatial


def load_df(path):
    """Read the CSV file located at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

def get_distance(point1, point2):
    """Return the cosine distance between the vectors ``point1`` and ``point2``.

    The computation is delegated to ``scipy.spatial.distance.cosine``.
    """
    cosine = spatial.distance.cosine
    return cosine(point1, point2)

def to_floats(vector_str):
    """Parse a whitespace-separated string of numbers into a list of floats.

    Args:
        vector_str: Text such as ``"0.1 0.2 0.3"``.

    Returns:
        A list with one float per token, in order of appearance. An empty or
        all-whitespace string yields an empty list.

    Raises:
        ValueError: If any token cannot be converted by ``float``.
    """
    # split() with no argument collapses runs of whitespace and ignores leading
    # and trailing whitespace, so stray spaces do not produce empty tokens that
    # float() would reject.
    return [float(token) for token in vector_str.split()]
    

In [7]:
# Load the vectorization CSV, turn each row's 'vector' string into a list of
# floats via to_floats, and order the rows by entity.
df = load_df("../pythonminer/out/intellij-community/generated_data/vectorization_4_packs_4_samples.csv")
df['vector'] = df['vector'].map(to_floats)
df = df.sort_values(by=['entity'])

print(df.describe())

            entity  time_bucket
count  4818.000000  4818.000000
mean    389.778539     4.561021
std     260.498384     2.859181
min       0.000000     0.000000
25%     145.000000     2.000000
50%     385.500000     5.000000
75%     640.000000     7.000000
max     827.000000     9.000000


In [8]:
# Build one record per (time bucket, entity pair) holding the cosine distance
# between the two entities' vectors within that bucket.
max_bucket = df["time_bucket"].max()

dist_data = []

for bucket_index in range(max_bucket + 1):
    cur_bucket = df[df["time_bucket"] == bucket_index]

    entities = sorted(cur_bucket['entity'].unique().tolist())
    print(len(entities))

    # If an entity has several rows in this bucket, the last row's vector wins.
    vector_per_entity = dict(zip(cur_bucket['entity'], cur_bucket['vector']))

    # entities is sorted and duplicate-free, so pairing each entity with the
    # ones after it visits every pair with e1 < e2 exactly once.
    for position, e1 in enumerate(entities):
        vec1 = vector_per_entity[e1]
        for e2 in entities[position + 1:]:
            vec2 = vector_per_entity[e2]
            dist_data.append([bucket_index, (e1, e2), get_distance(vec1, vec2), vec1, vec2])

dist_data_names = ['bucket', 'entities', 'distance', 'vec1', 'vec2']
dist_df = pd.DataFrame(dist_data, columns=dist_data_names)
dist_df = dist_df.sort_values(by=['entities'])

all_entity_pairs = dist_df['entities'].unique()
print(all_entity_pairs)

457
476
465
474
488
491
496
495
485
491
[(0, 1) (0, 2) (0, 3) ... (824, 826) (824, 827) (826, 827)]


In [9]:
# For every entity pair, collect its distance in each bucket in bucket order
# (None where the pair does not occur in that bucket), and record the mean
# pairwise distance of each bucket.
dist_trends = {ep: [] for ep in all_entity_pairs}
avg_dist = []
for bucket_index in range(max_bucket + 1):
    print("processing bucket ", bucket_index)
    cur_bucket_df = dist_df[dist_df["bucket"] == bucket_index]

    # Lookup from entity pair to its distance within the current bucket.
    dist_per_entitypair = dict(zip(cur_bucket_df['entities'], cur_bucket_df['distance']))

    for entity_pair, trend in dist_trends.items():
        # .get() yields None for pairs missing from this bucket.
        trend.append(dist_per_entitypair.get(entity_pair))

    avg_dist.append(cur_bucket_df['distance'].mean())


avg_dist

processing bucket  0
processing bucket  1
processing bucket  2
processing bucket  3
processing bucket  4
processing bucket  5
processing bucket  6
processing bucket  7
processing bucket  8
processing bucket  9


[0.2024593307507462,
 0.1957280218324127,
 0.18382959179458003,
 0.24629094642812627,
 0.2698225082517665,
 0.24331426072771842,
 0.23900700740813224,
 0.26667742887135903,
 0.26547714883267803,
 0.24190131132733617]

In [10]:
# Display the mapping from entity pair to its list of per-bucket distances
# (None marks a bucket in which the pair does not occur).
dist_trends

{(0, 1): [0.020243671180377287,
  0.00263595224501878,
  0.012254308681454162,
  0.026617761969194076,
  0.02676927777819782,
  0.05034835676481342,
  0.03337446827807922,
  0.07474341222765069,
  0.016121806364665203,
  0.051007238856367376],
 (0, 2): [0.07677701730924413,
  0.08120677727558201,
  0.014087324568914927,
  0.02633667227906189,
  0.05241123392482916,
  0.0938331131175707,
  0.018662964649756697,
  0.10624933287885974,
  0.04332224033991039,
  0.09162797951525137],
 (0, 3): [0.021599228941560633,
  0.007547687311835416,
  0.009509337692273911,
  0.04524123267063862,
  0.007278023693995328,
  0.03446392361347694,
  0.01116750120628529,
  0.07063409731214099,
  0.038000018751234044,
  0.11808177228732886],
 (0, 4): [0.03230453695571922,
  0.020205273249315425,
  0.009074758172905817,
  0.024648647508436983,
  0.025039289368104733,
  0.021144986164714052,
  0.014363915137906025,
  0.13157780240007877,
  0.09052235750904025,
  0.04085091891996606],
 (0, 5): [0.022205870706512

In [11]:
# with open("../pythonminer/out/intellij-community/generated_data/entity_dict.pkl", 'rb') as f:
#     entities_dict = pickle.load(f)
#     