In [13]:
import collections
import copy
import itertools
import json
import datetime
import random
import time


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm

In [7]:
with open("data/results/all__post_stats.json") as handle:
    post_stats = json.load(handle)
sorted_post_stats = sorted(post_stats, key=lambda item:  datetime.datetime.strptime(item['date'], "%Y-%m-%dT%H:%M:%S.%f"))

with open("data/results/ip__libs_first_dates.json") as handle:
    libs_first_dates = json.load(handle)

In [46]:
libs_already_available = []
randomized_post_stats = []

pairs_first_dates = {}
pairs_dates = collections.defaultdict(list)

for stat in tqdm.tqdm(sorted_post_stats):
    randomized_stat = copy.deepcopy(stat)
    imports = stat["imports"]
    randomized_stat["imports"] = []

    vintages = []
    for imp in imports:
        if libs_first_dates[imp]["id"] == stat["id"]:
            vintages.append(imp)
            randomized_stat["imports"].append(imp)
        else:
            randomized_stat["imports"].append(random.choice(libs_already_available))
    libs_already_available.extend(vintages)
    randomized_post_stats.append(randomized_stat)

    # Artificially generate pairs_first_dates
    for imp1, imp2 in itertools.combinations(randomized_stat["imports"], 2):
        canonical = "|".join(sorted([imp1, imp2]))
        pairs_dates[canonical].append(stat["date"])
        if canonical in pairs_first_dates:
            continue
        pairs_first_dates[canonical] = {
            "id": stat["id"],
            "poster_id": stat["poster_id"],
            "date": stat["date"][:10],  # TODO: this is a hack, get the date properly
        }


100%|██████████████████████████████| 1469300/1469300 [00:29<00:00, 50390.30it/s]


In [47]:
for i in range(10):
    print(sorted_post_stats[i]["imports"], " <-> ", randomized_post_stats[i]["imports"])

['MySQLdb', 'cgi', 'cgitb']  <->  ['MySQLdb', 'cgi', 'cgitb']
['sys']  <->  ['sys']
['select', 'cgi', 'termios', 'tty']  <->  ['select', 'cgi', 'termios', 'tty']
['msvcrt']  <->  ['msvcrt']
['MySQLdb']  <->  ['MySQLdb']
['ftplib']  <->  ['ftplib']
['foo']  <->  ['foo']
['termios']  <->  ['ftplib']
['serial']  <->  ['serial']
['re', 'urllib']  <->  ['re', 'urllib']


In [48]:
len(sorted_post_stats), len(randomized_post_stats)

(1469300, 1469300)

In [49]:
sorted_post_stats = randomized_post_stats  # HACK to reuse the original code

In [50]:
VALUE_THRESHOLDS = [5, 10, 25]#[25, 50, 100, 200]
VALUE_TIME_WINDOW = datetime.timedelta(days=365)
BIN_BORDERS = [1, 10, 25, 50, 100, 200, np.infty]  


user_posts = collections.defaultdict(set)
novel_post_stats = []
post_rank_count = collections.defaultdict(int)
novel_post_rank_count = collections.defaultdict(int)
valuable_novel_post_rank_count = {
    k: collections.defaultdict(int) for k in VALUE_THRESHOLDS
}
stats = collections.defaultdict(int)
for post_stat in tqdm.tqdm(sorted_post_stats):
    try:
        user_posts[post_stat["poster_id"]].add(post_stat["id"])
        post_rank = len(user_posts[post_stat["poster_id"]])
        novelty_count = 0
        is_valuable = {k: False for k in VALUE_THRESHOLDS}
        for imp1, imp2 in itertools.combinations(post_stat["imports"], 2):
            canonical_pair_name = "|".join(sorted([imp1, imp2]))
            if pairs_first_dates[canonical_pair_name]["id"] == post_stat["id"]:
                novelty_count += 1
                for value_threshold in VALUE_THRESHOLDS:
                    # Decide if the pair in the post received enough traction within the prescribed time window.
                    relevant_dates = [
                        dt for dt in pairs_dates[canonical_pair_name]if (
                            datetime.datetime.strptime(dt, "%Y-%m-%d") <
                            datetime.datetime.strptime(pairs_first_dates[canonical_pair_name]["date"], "%Y-%m-%d") + VALUE_TIME_WINDOW
                        )
                    ]
                    if len(relevant_dates) >= value_threshold:
                        is_valuable[value_threshold] = True
                novel_post_stats.append(
                    {
                        "poster_id": post_stat["poster_id"],
                        "id": post_stat["id"],
                        "post_rank": len(user_posts[post_stat["poster_id"]]),
                        "canonical_pair_name": canonical_pair_name,
                        "pair_total_use_within_window": len(relevant_dates),
                        "date": datetime.datetime.strptime(post_stat["date"], "%Y-%m-%dT%H:%M:%S.%f"),
                    }
                )
        if novelty_count > 0:
            stats["novelty_post_count"] += 1
            novel_post_rank_count[post_rank] += 1

            # Decide if the post is valuable
            for value_threshold in VALUE_THRESHOLDS:
                if is_valuable[value_threshold]:
                    valuable_novel_post_rank_count[value_threshold][post_rank] += 1
        else:
            stats["non_novel_post_count"] += 1
        stats["total_post_count"] += 1
        post_rank_count[post_rank] += 1
    except Exception as exc:
        stats[exc] += 1

print("STATS:")
for k, v in stats.items():
    print(f"\t{k}: {v}")

# Calculate it for the richest domain of the valuable ratios, that is, for the smallest threshold.
novel_ratio = {
    k: novel_post_rank_count.get(k, 0) / post_rank_count.get(k, 1)
    for k in sorted(valuable_novel_post_rank_count[min(VALUE_THRESHOLDS)].keys())
}

valuable_novel_ratio = {}
for value_threshold in VALUE_THRESHOLDS:
    valuable_novel_ratio[value_threshold] = {
        k: valuable_novel_post_rank_count[value_threshold].get(k, 0) / post_rank_count.get(k, 1)
        for k in sorted(valuable_novel_post_rank_count[value_threshold].keys())
    }

100%|██████████████████████████████| 1469300/1469300 [00:17<00:00, 82857.87it/s]


STATS:
	unconverted data remains: T20:07:05.290: 1
	non_novel_post_count: 800032
	total_post_count: 800032
	unconverted data remains: T02:54:54.577: 1
	unconverted data remains: T13:35:19.970: 1
	unconverted data remains: T17:58:28.733: 1
	unconverted data remains: T09:39:56.927: 1
	unconverted data remains: T15:38:22.330: 1
	unconverted data remains: T09:39:08.247: 1
	unconverted data remains: T16:08:37.920: 1
	unconverted data remains: T16:52:29.453: 1
	unconverted data remains: T17:43:06.413: 1
	unconverted data remains: T23:16:16.610: 1
	unconverted data remains: T00:35:29.807: 1
	unconverted data remains: T14:39:04.763: 1
	unconverted data remains: T18:02:33.027: 1
	unconverted data remains: T22:52:47.723: 1
	unconverted data remains: T01:20:52.157: 1
	unconverted data remains: T12:03:20.513: 1
	unconverted data remains: T06:55:13.317: 1
	unconverted data remains: T15:20:52.277: 1
	unconverted data remains: T17:40:47.417: 1
	unconverted data remains: T06:10:58.127: 1
	unconverted 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

