In [26]:
import pandas as pd
import pickle
from scipy import spatial


def load_df(path):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

def get_distance(point1, point2):
    """Return the cosine distance between two vectors, as computed by scipy."""
    cosine_distance = spatial.distance.cosine(point1, point2)
    return cosine_distance

def to_floats(vector_str):
    """Parse a whitespace-separated string of numbers into a list of floats.

    Splitting on arbitrary whitespace (rather than on a single space) keeps
    doubled, leading or trailing separators from producing empty tokens,
    which ``float`` would reject with a ValueError.

    Args:
        vector_str: string such as ``"0.1 0.2 0.3"``.

    Returns:
        A list with one float per token, in the order they appear.

    Raises:
        ValueError: if a token is not a valid float literal.
    """
    return [float(token) for token in vector_str.split()]
    

In [27]:
def load_run_data(run_index,
                  data_dir="../pythonminer/out/intellij-community/generated_data_6x_1Ksamples"):
    """Load one run's vectors and compute pairwise distance trends over buckets.

    Reads the run's CSV (columns used: ``entity``, ``time_bucket``, ``vector``),
    parses each ``vector`` string into floats and, for every time bucket from 0
    to the largest one present, computes the distance between every pair of
    entities active in that bucket.

    Args:
        run_index: number of the run; it is interpolated into the directory
            name as ``run_0<run_index>``.
        data_dir: directory that contains the ``run_0<N>`` sub-directories.
            Defaults to the location the notebook has always read from.

    Returns:
        A dict mapping ``(entity1, entity2)`` tuples (``entity1 < entity2``,
        keys in ascending order) to a list with one item per bucket: the
        distance between the two entities in that bucket, or ``None`` when the
        pair is not present in that bucket.
    """
    # NOTE(review): for run_index >= 10 this produces e.g. "run_010" — confirm
    # that this matches the directory naming before using more than 9 runs.
    data_path = "{}/run_0{}/vectorization_4_packs_1000_samples.csv".format(data_dir, str(run_index))
    print(data_path)
    df = load_df(data_path)

    df['vector'] = df['vector'].apply(to_floats)
    df = df.sort_values(by=['entity'])

    max_bucket = df["time_bucket"].max()

    # One dict per bucket: (entity1, entity2) -> distance within that bucket.
    distances_per_bucket = []
    for bucket_index in range(0, max_bucket+1):
        cur_bucket = df[df["time_bucket"] == bucket_index]

        entities = sorted(cur_bucket['entity'].unique().tolist())
        print("{} entities active in bucket {} of {}".format(len(entities), bucket_index+1, max_bucket+1))

        # If an entity occurs on several rows of the bucket, the last row wins.
        vector_per_entity = dict(zip(cur_bucket['entity'], cur_bucket['vector']))

        pair_distances = {}
        for position, e1 in enumerate(entities):
            vec1 = vector_per_entity[e1]
            # `entities` is sorted and unique, so everything after `position`
            # is strictly greater than e1: each unordered pair is visited once.
            for e2 in entities[position + 1:]:
                pair_distances[(e1, e2)] = get_distance(vec1, vector_per_entity[e2])
        distances_per_bucket.append(pair_distances)

    # Every pair seen in at least one bucket, in ascending order.
    all_entity_pairs = sorted(set().union(*distances_per_bucket))

    # dict.get yields None for buckets in which the pair is absent.
    dist_trends = {
        entity_pair: [pair_distances.get(entity_pair) for pair_distances in distances_per_bucket]
        for entity_pair in all_entity_pairs
    }
    return dist_trends

In [28]:
n_runs = 6
n_buckets = 10

# One dict per run: {(pair of entities) -> [distances, one per bucket]}.
# Runs are numbered starting from 1.
trends = [load_run_data(run_number) for run_number in range(1, n_runs + 1)]
    
def get_diff(list_with_nans):
    """Return the differences between consecutive items of a list.

    Item ``k`` of the result is ``list_with_nans[k+1] - list_with_nans[k]``,
    or ``None`` when either of the two neighbours is ``None``. The result is
    one item shorter than the input (empty for inputs of length 0 or 1).
    """
    diffs = []
    for previous, current in zip(list_with_nans, list_with_nans[1:]):
        if previous is None or current is None:
            diffs.append(None)
        else:
            diffs.append(current - previous)
    return diffs

# One dict per run: {(pair of entities) -> [distance changes between consecutive buckets]}.
diff_trends = [
    {pair: get_diff(trends[run][pair]) for pair in trends[run].keys()}
    for run in range(n_runs)
]
    



../pythonminer/out/intellij-community/generated_data_6x_1Ksamples/run_01/vectorization_4_packs_1000_samples.csv
27 entities active in bucket 1 of 10
38 entities active in bucket 2 of 10
47 entities active in bucket 3 of 10
53 entities active in bucket 4 of 10
63 entities active in bucket 5 of 10
71 entities active in bucket 6 of 10
74 entities active in bucket 7 of 10
83 entities active in bucket 8 of 10
87 entities active in bucket 9 of 10
83 entities active in bucket 10 of 10
../pythonminer/out/intellij-community/generated_data_6x_1Ksamples/run_02/vectorization_4_packs_1000_samples.csv
27 entities active in bucket 1 of 10
38 entities active in bucket 2 of 10
47 entities active in bucket 3 of 10
53 entities active in bucket 4 of 10
63 entities active in bucket 5 of 10
71 entities active in bucket 6 of 10
74 entities active in bucket 7 of 10
83 entities active in bucket 8 of 10
87 entities active in bucket 9 of 10
83 entities active in bucket 10 of 10
../pythonminer/out/intellij-commun

In [29]:

def count_sign_match(diff_a, diff_b):
    """Count positions at which two difference series move in the same direction.

    Positions where either series holds ``None`` are skipped, so a gap in
    ``diff_b`` no longer raises a TypeError on the multiplication. A position
    counts as a match when the product of both values is strictly positive
    (same non-zero sign); anything else, including a zero in either series,
    counts as a mismatch. Only the common prefix is compared when the two
    series differ in length.

    Args:
        diff_a: list of numbers or ``None``.
        diff_b: list of numbers or ``None``.

    Returns:
        ``{"match": <int>, "mismatch": <int>}``.
    """
    match_count = 0
    mismatch_count = 0
    for delta_a, delta_b in zip(diff_a, diff_b):
        if delta_a is None or delta_b is None:
            continue
        if delta_a * delta_b > 0:
            match_count += 1
        else:
            mismatch_count += 1
    return {"match": match_count, "mismatch": mismatch_count}


def compare_diff_trends(trend_a, trend_b):
    """Sum the sign match/mismatch counts over every pair of ``trend_a``.

    Each pair's series in ``trend_a`` is compared with the series stored under
    the same key in ``trend_b`` via ``count_sign_match``; the per-pair counts
    are added up and returned as ``{"match": <int>, "mismatch": <int>}``.
    """
    totals = {"match": 0, "mismatch": 0}
    for pair, diffs_a in trend_a.items():
        pair_counts = count_sign_match(diffs_a, trend_b[pair])
        for outcome in ("match", "mismatch"):
            totals[outcome] += pair_counts[outcome]
    return totals


# Lookup table indexed by entity; get_name reads red[entity]['names'].
# NOTE(security): pickle.load can execute arbitrary code while unpickling —
# only load this file when it comes from a trusted source.
# The `with` block closes the file handle as soon as the dict is loaded.
with open("../pythonminer/out/intellij-community/generated_data_6x_1Ksamples/reversed_entity_dict.pkl", 'rb') as entity_dict_file:
    red = pickle.load(entity_dict_file)

def get_name(entity):
    """Return the first name recorded for ``entity`` in the ``red`` lookup table."""
    known_names = red[entity]['names']
    return known_names[0]

In [30]:
def extract_trend_stats(pair, diff_trends):
    """Aggregate one entity pair's distance changes across all runs.

    For every bucket edge (position in the difference series) the values of
    all runs are combined. An edge is treated as missing when ANY run has
    ``None`` there; previously only the first run was checked, so a gap in a
    later run raised a TypeError on ``None > 0``. Whenever all runs have their
    gaps at the same positions the result is unchanged.

    Args:
        pair: ``(entity1, entity2)`` tuple the series belong to.
        diff_trends: list with one difference series per run; each series is a
            list of numbers or ``None``, indexed by bucket edge.

    Returns:
        A dict with keys
        ``entities`` (the pair), ``names`` (names of both entities via
        ``get_name``), ``na_count`` (number of missing edges), and the
        per-edge lists ``negative_counts``, ``positive_counts`` and ``sums``
        (``None`` at missing edges). A change of exactly zero is counted as
        negative.
    """
    positive_counts = []
    negative_counts = []
    sums = []
    for edge_index in range(len(diff_trends[0])):
        edge_diffs = [run_diffs[edge_index] for run_diffs in diff_trends]
        if any(d is None for d in edge_diffs):
            positive_counts.append(None)
            negative_counts.append(None)
            sums.append(None)
            continue

        pos_count = 0
        neg_count = 0
        # Plain left-to-right accumulation; named `total` so the builtin
        # `sum` is no longer shadowed.
        total = 0
        for d in edge_diffs:
            if d > 0:
                pos_count += 1
            else:
                neg_count += 1
            total += d
        positive_counts.append(pos_count)
        negative_counts.append(neg_count)
        sums.append(total)

    na_count = len([s for s in sums if s is None])
    return {
        'entities': pair,
        'names': (get_name(pair[0]), get_name(pair[1])),
        'na_count': na_count,
        'negative_counts': negative_counts,
        'positive_counts': positive_counts,
        'sums': sums
    }
            

def get_diff_stats(diff_trends_per_run):
    """Collect cross-run statistics for every pair that has at least one value.

    The pairs are taken from the first run. For each pair the difference
    series of all runs are handed to ``extract_trend_stats``; pairs whose
    every bucket edge is missing are dropped. The number of edges is taken
    from the series itself instead of the global ``n_buckets``, so the filter
    stays correct when the data has a different number of buckets.

    Args:
        diff_trends_per_run: list with one dict per run, each mapping an
            entity pair to its list of bucket-to-bucket differences.

    Returns:
        List of the dicts produced by ``extract_trend_stats``, one per kept
        pair, in the key order of the first run.
    """
    diff_stats = []
    for pair in diff_trends_per_run[0].keys():
        diff_trends_for_pair = [run_diffs[pair] for run_diffs in diff_trends_per_run]
        stats_for_pair = extract_trend_stats(pair, diff_trends_for_pair)
        # Keep the pair only if at least one bucket edge has a value.
        if stats_for_pair['na_count'] < len(stats_for_pair['sums']):
            diff_stats.append(stats_for_pair)
    return diff_stats
        

In [31]:
# Cross-run statistics for every entity pair returned by get_diff_stats.
trend_stats = get_diff_stats(diff_trends_per_run=diff_trends)

def extract_bucket_stats(trend_stats, bucket_edge_index):
    """Gather the summed distance change of every pair at one bucket edge.

    Rows of ``trend_stats`` whose ``sums`` entry at ``bucket_edge_index`` is
    ``None`` are left out. The remaining entries are ordered by ascending
    ``distance_diff``.

    Returns:
        A dict with ``bucket1`` / ``bucket2`` (1-based numbers of the two
        buckets the edge connects) and ``entries`` (the sorted list of dicts
        with keys ``entities``, ``names`` and ``distance_diff``).
    """
    entries = [
        {
            'entities': row['entities'],
            'names': row['names'],
            'distance_diff': row['sums'][bucket_edge_index]
        }
        for row in trend_stats
        if row['sums'][bucket_edge_index] is not None
    ]
    entries.sort(key=lambda entry: entry['distance_diff'])

    return {
        'bucket1': bucket_edge_index + 1,
        'bucket2': bucket_edge_index + 2,
        'entries': entries
    }

# Report, for every pair of consecutive buckets, the entity pairs with the
# smallest (most negative) summed distance change.
for bucket in range(n_buckets-1):
    bs = extract_bucket_stats(trend_stats, bucket)
    print("Most prominent relative distance changes between buckets {} and {}, summed across {} runs".format(bs['bucket1'], bs['bucket2'], n_runs))
    # Slice instead of indexing 0..49 so that an edge with fewer than 50
    # entries prints what it has rather than raising an IndexError.
    for entry in bs['entries'][:50]:
        print("{}(entity {}) and {}({}): {}".format(
            entry['names'][0],
            entry['entities'][0],
            entry['names'][1],
            entry['entities'][1],
            entry['distance_diff']
        ))
    print("\n")
    
    

Most prominent relative distance changes between buckets 1 and 2, summed across 6 runs
Sascha Weinreuter(entity 66.0) and Anton.Makeev(241.0): -1.3043667025002124
Ilya Sergey(entity 88.0) and Anton.Makeev(241.0): -0.8315174777679042
Sascha Weinreuter(entity 66.0) and cdr(182.0): -0.8010693888987315
Sascha Weinreuter(entity 66.0) and Nikolay Chashnikov(262.0): -0.7876153008810901
Dmitry Krasilschikov(entity 239.0) and Anton.Makeev(241.0): -0.7720220049537126
Sascha Weinreuter(entity 66.0) and Bas Leijdekkers(378.0): -0.7647479867993368
Ilya Sergey(entity 88.0) and Bas Leijdekkers(378.0): -0.7180392284833962
anna(entity 19.0) and Ilya Sergey(88.0): -0.7123809528900423
Sascha Weinreuter(entity 66.0) and peter.gromov(276.0): -0.6966448908488599
Sascha Weinreuter(entity 66.0) and animaru(403.0): -0.6443699427317698
Ilya Sergey(entity 88.0) and peter.gromov(276.0): -0.622071167811265
Ilya Sergey(entity 88.0) and Mike Aizatsky(503.0): -0.6181192518403197
anna(entity 19.0) and Dmitry Krasilsch