In [2]:
import pandas as pd
import pickle
from scipy import spatial


def load_df(path):
    """Read the CSV at ``path`` into a DataFrame using pandas' default options."""
    return pd.read_csv(path)

def get_distance(point1, point2):
    """Return the cosine distance between two vectors, as computed by scipy."""
    cosine_distance = spatial.distance.cosine(point1, point2)
    return cosine_distance

def to_floats(vector_str):
    """Parse a whitespace-separated string of numbers into a list of floats.

    Splits on any run of whitespace, so repeated, leading or trailing spaces
    (and tabs/newlines) do not produce empty tokens that ``float`` would
    reject. A blank string yields an empty list.

    Raises:
        ValueError: if any token is not a valid float literal.
    """
    return [float(token) for token in vector_str.split()]
    

In [3]:
def load_run_data(run_index):
    """Build per-bucket cosine-distance trends for every entity pair of one run.

    Reads the vectorization CSV of run ``run_index`` and, for each time bucket
    ``0..max(time_bucket)``, computes the cosine distance between the vectors
    of every unordered pair of entities present in that bucket.

    Prints the number of entities per bucket, the array of all entity pairs,
    and a progress line per bucket.

    Args:
        run_index: Run number substituted into the ``run_0{}`` directory name.

    Returns:
        dict mapping ``(entity_a, entity_b)`` tuples (``entity_a < entity_b``)
        to a list with one entry per bucket: the pair's distance in that
        bucket, or ``None`` when the pair does not occur in it. Keys are in
        sorted pair order.
    """
    df = load_df("../pythonminer/out/intellij-community/generated_data/run_0{}/vectorization_4_packs_4_samples.csv".format(str(run_index)))
    df['vector'] = df['vector'].apply(to_floats)
    df = df.sort_values(by=['entity'])

    max_bucket = df["time_bucket"].max()

    # One record per (bucket, entity pair). The vectors themselves are not
    # needed once the distance is known, so they are not stored.
    dist_data = []
    for bucket_index in range(0, max_bucket + 1):
        cur_bucket = df[df["time_bucket"] == bucket_index]

        entities = cur_bucket['entity'].unique().tolist()
        entities.sort()
        print(len(entities))

        # If an entity occurs on several rows of a bucket, the last row wins.
        vector_per_entity = dict(zip(cur_bucket['entity'], cur_bucket['vector']))

        # `entities` is sorted and unique, so pairing each entity only with
        # the ones after it yields exactly the pairs with e2 > e1.
        for position, e1 in enumerate(entities):
            vec1 = vector_per_entity[e1]
            for e2 in entities[position + 1:]:
                dist_data.append([bucket_index, (e1, e2), get_distance(vec1, vector_per_entity[e2])])

    dist_df = pd.DataFrame(dist_data, columns=['bucket', 'entities', 'distance'])
    dist_df = dist_df.sort_values(by=['entities'])

    all_entity_pairs = dist_df['entities'].unique()
    print(all_entity_pairs)

    dist_trends = {ep: [] for ep in all_entity_pairs}
    for bucket_index in range(0, max_bucket + 1):
        print("processing bucket ", bucket_index)
        cur_bucket_df = dist_df[dist_df["bucket"] == bucket_index]
        dist_per_entitypair = dict(zip(cur_bucket_df['entities'], cur_bucket_df['distance']))

        # Every pair gets an entry for every bucket; None marks "pair absent".
        for entity_pair in all_entity_pairs:
            dist_trends[entity_pair].append(dist_per_entitypair.get(entity_pair))

    return dist_trends

In [4]:
# One trends dict per run (runs 1-3), loaded in order.
trends_01, trends_02, trends_03 = (load_run_data(run) for run in (1, 2, 3))

298
293
302
304
311
327
306
331
325
323
[(1, 2) (1, 3) (1, 4) ... (1125, 1128) (1125, 1137) (1128, 1137)]
processing bucket  0
processing bucket  1
processing bucket  2
processing bucket  3
processing bucket  4
processing bucket  5
processing bucket  6
processing bucket  7
processing bucket  8
processing bucket  9
298
293
302
304
311
327
306
331
325
323
[(1, 2) (1, 3) (1, 4) ... (1125, 1128) (1125, 1137) (1128, 1137)]
processing bucket  0
processing bucket  1
processing bucket  2
processing bucket  3
processing bucket  4
processing bucket  5
processing bucket  6
processing bucket  7
processing bucket  8
processing bucket  9
298
293
302
304
311
327
306
331
325
323
[(1, 2) (1, 3) (1, 4) ... (1125, 1128) (1125, 1137) (1128, 1137)]
processing bucket  0
processing bucket  1
processing bucket  2
processing bucket  3
processing bucket  4
processing bucket  5
processing bucket  6
processing bucket  7
processing bucket  8
processing bucket  9


In [5]:
# Combine the three runs: pair -> [run-1 trend, run-2 trend, run-3 trend].
# Pairs are taken from run 1; a pair absent from run 2 or 3 raises KeyError.
trends_summary = {
    pair: [run[pair] for run in (trends_01, trends_02, trends_03)]
    for pair in trends_01
}

# Display only a few pairs: the full dict has one entry per entity pair and
# rendering all of it floods the notebook output.
PREVIEW_PAIRS = 3
dict(list(trends_summary.items())[:PREVIEW_PAIRS])

{(1,
  2): [[0.260188169981885,
   0.5257789977032736,
   None,
   0.6263422785817252,
   None,
   None,
   None,
   0.010533414547182707,
   None,
   None], [0.14277841700529637,
   0.3651872907563751,
   None,
   0.071799269998019,
   None,
   None,
   None,
   0.023239294558869084,
   None,
   None], [0.15638311020265894,
   0.7453684853418923,
   None,
   0.46643087374849346,
   None,
   None,
   None,
   0.09425311778289358,
   None,
   None]],
 (1,
  3): [[None,
   0.6205624662672422,
   None,
   None,
   None,
   None,
   None,
   0.0034307696065706272,
   None,
   None], [None,
   0.5834310334871899,
   None,
   None,
   None,
   None,
   None,
   0.8653213472378698,
   None,
   None], [None,
   0.17211664300298057,
   None,
   None,
   None,
   None,
   None,
   0.24797992704280092,
   None,
   None]],
 (1,
  4): [[0.2076569884066647,
   0.5171326044184412,
   None,
   0.5222711095175532,
   None,
   None,
   None,
   0.004192495709120725,
   None,
   None], [0.178110181700615

In [14]:
def get_diff(list_with_nans):
    """Return the differences between consecutive items, keeping gaps as gaps.

    Entry ``k`` of the result is ``list_with_nans[k + 1] - list_with_nans[k]``,
    or ``None`` when either neighbour is ``None``. The result has one item
    fewer than the input and is empty for inputs shorter than two items.
    """
    return [
        later - earlier if earlier is not None and later is not None else None
        for earlier, later in zip(list_with_nans, list_with_nans[1:])
    ]
        
# Per pair, the bucket-to-bucket distance changes of each of the three runs.
trend_diffs = {
    pair: [get_diff(runs[run_no]) for run_no in range(3)]
    for pair, runs in trends_summary.items()
}

# Split the per-pair triples into one {pair: diffs} dict per run.
diff_1, diff_2, diff_3 = (
    {pair: diffs[run_no] for pair, diffs in trend_diffs.items()}
    for run_no in range(3)
)

def count_sign_match(diff_a, diff_b):
    """Count positions where two diff series change in the same direction.

    A position is a "match" when the product of the two diffs is strictly
    positive (both rising or both falling). Any other comparable position,
    i.e. opposite signs or at least one diff equal to zero, is a "mismatch".
    Positions where either series holds ``None`` are skipped, so a gap in
    only one of the series no longer raises ``TypeError``.

    Args:
        diff_a: Sequence of numbers or ``None``.
        diff_b: Sequence of numbers or ``None``, compared position by position
            with ``diff_a``; positions beyond the shorter sequence are ignored.

    Returns:
        dict with integer counts under the keys ``"match"`` and ``"mismatch"``.
    """
    match_count = 0
    mismatch_count = 0
    for change_a, change_b in zip(diff_a, diff_b):
        if change_a is None or change_b is None:
            continue
        if change_a * change_b > 0:
            match_count += 1
        else:
            mismatch_count += 1
    return {"match": match_count, "mismatch": mismatch_count}

def compare_diff_trends(trend_a, trend_b):
    """Sum the per-pair sign match/mismatch counts of two diff-trend dicts.

    For every key of ``trend_a``, the diff series from ``trend_a`` and
    ``trend_b`` are compared with ``count_sign_match`` and the counts are
    accumulated. A key missing from ``trend_b`` raises ``KeyError``.

    Returns:
        dict with the total counts under the keys ``"match"`` and ``"mismatch"``.
    """
    totals = {"match": 0, "mismatch": 0}
    for pair, diffs_a in trend_a.items():
        pair_counts = count_sign_match(diffs_a, trend_b[pair])
        for outcome in totals:
            totals[outcome] += pair_counts[outcome]
    return totals

# Sign agreement of bucket-to-bucket changes for each pairing of runs.
for left, right in ((diff_1, diff_2), (diff_2, diff_3), (diff_1, diff_3)):
    print(compare_diff_trends(left, right))


        

{'match': 170435, 'mismatch': 151913}
{'match': 209200, 'mismatch': 113148}
{'match': 178985, 'mismatch': 143363}
