In [1]:
import hashlib
import re
import requests
import json
from datetime import date
import datetime
from dateutil.relativedelta import relativedelta
from html.parser import HTMLParser
from os import path
import matplotlib.pyplot as plt
import csv
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
MODE = os.getenv("MODE")  # no graphical output iff MODE=text
# MODE = "haha"
url = os.getenv("URL")

In [3]:
class MyHTMLParser(HTMLParser):
    js_encountered = False

    def handle_starttag(self, tag, attrs):
#         if tag == "a":
        
        print("Encountered a start tag:", tag, attrs)

    def handle_endtag(self, tag):
        print("Encountered an end tag :", tag)

    def handle_data(self, data):
        print("Encountered some data  :", data)

In [4]:
def filter_by_date(points):
    timediff = date.today() - relativedelta(days=14)
    timediff = datetime.datetime(timediff.year, timediff.month, timediff.day)
    
#     ans = []
#     for p in points:
#         if p[2] > timediff:
#             ans.append(p)
#             print(p[2])
#     return ans
    return [p for p in points if p[2] > timediff]


In [5]:
commit2state = {}
for page in range(130):
    # fetch Buildkite data
    buildkite_url = "https://buildkite.com/llvm-project/llvm-main/builds?page={}".format(page)
    buildkite_filename = str(int(hashlib.sha256(buildkite_url.encode('utf-8')).hexdigest()[:16],
                      16)-2**63)

    if not path.exists("../../data/" + buildkite_filename):
        payload = {}

        response = requests.request("GET", buildkite_url, data=payload)

        with open("../../data/" + buildkite_filename, "w") as f:
            f.write(response.text)

    with open("../../data/" + buildkite_filename, "r") as f:
        with open("../../data/buildkite_history.txt", "a") as f2:
            for line in f:
                if "var store = new BuildStore({\"id\":" in line:
                    line = line.strip()
                    line = line.replace("var store = new BuildStore(", "[")
                    line = line[:-1] + "]"
                    buildkite_data = json.loads(line)[1]

                    for build in buildkite_data:
                        commit2state[build['commit_id']] = build['state']
                        f2.write("{} {}\n".format(build['commit_id'], build['state']))
                    break

In [6]:
def remove_noise(points):
    # remove noise using Inter Quatile Range
    # reason: we can't gurantee datasets for all benchmarks are normal, so we can't use z-score/normal distribution.
    #         IQR is the most stable algorithm I can think of
    Q1 = int(len(points)*0.1)  # quatile chosen to be 0.1 because I don't want to lose too much of data
    Q3 = int(len(points)*0.9)
    return points[Q1:Q3]

In [7]:
filename = str(int(hashlib.sha256(url.encode('utf-8')).hexdigest()[:16],
                  16)-2**63)
# print("cached filename:", filename)

# fetch lnt data
if not path.exists("../../data/" + filename):
    payload = {}

    response = requests.request("GET", url, data=payload)

    with open("../../data/" + filename, "w") as f:
        f.write(response.text)

with open("../../data/" + filename, "r") as f:
    # parser = MyHTMLParser()
    # parser.feed(f.read())
    for line in f:
        if "overview_plots" in line:
            line = line.strip()
#             print(line)
            values = re.findall(r'var.*?=\s*(.*?);', line, re.DOTALL |
                                re.MULTILINE)
#             print((values[0][:10]))
            points = json.loads(values[0])
            points = points[0]["data"]  # a list of data points
            
            # convert string dates to manipulatable datetimes
            temp_points = []

            for p in points:
                if commit2state.get(p[2]["label"], "failed") == "passed":
                    p[2]["date"] = datetime.datetime.strptime(p[2]["date"], '%Y-%m-%d %H:%M:%S')
                    p.insert(2, p[2]["date"])
                    del p[3]["date"]
                    temp_points.append(p)
                    
            points = temp_points
            points.sort(key=lambda p: p[1])
            times = [p[1] for p in points]
   
            if MODE != "text":
                fig = plt.figure(figsize=(20,12))
                plt.hist(times, bins=15)
                plt.xlabel('frequency')
                plt.ylabel('execution time')
                plt.grid(True)
                plt.show()
            
            points = remove_noise(points)
            
            with open("../../data/" + filename + ".csv", 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows([["sequence", "execution time", "date", "metadata"]] + points)

#             points_df = pd.DataFrame(points,columns=["sequence", "execution time", "date", "metadata"])  # points_df is a dataframe
            
#             points_filtered = filter_by_date(points)
            points_filtered = points
            if MODE != "text":
#                 display(points_df)
            
                print(points_filtered[0])
                print(points_filtered[1])
                print(points_filtered[2])
                print("...", len(points_filtered), "points found ...")
            break


In [8]:
def binary_search(arr, x):
    low = 0
    high = len(arr) - 1
    mid = 0
 
    while low <= high:
 
        mid = (high + low) // 2
 
        # If x is greater, ignore left half
        if arr[mid][0] < x:
            low = mid + 1
 
        # If x is smaller, ignore right half
        elif arr[mid][0] > x:
            high = mid - 1
 
        # means x is present at mid
        else:
            return mid
 
    # If we reach here, then the element was not present
    return -1


def plot_seq():
    # points sort by sequence number
    fig = plt.figure(figsize=(20,12))
    points_filtered.sort(key=lambda p: p[0])
    xs = [p[0] for p in points_filtered]
    ys = [p[1] for p in points_filtered]

    
    max_diff = float("-inf")
    last = None
    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > max_diff:
                max_diff = abs(last - ys[i])

    last = None

    if MODE != "text":
        
        plt.plot(xs, ys)
        plt.xlabel('sequence')
        plt.ylabel('execution time')
        plt.grid(True)
        plt.show()

    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > 0.8 * max_diff:
                print("{}, {}".format(points_filtered[binary_search(points_filtered, xs[i - 1])][3]["label"], points_filtered[binary_search(points_filtered, xs[i])][3]["label"]))

plot_seq()

00dc97f16708aad67834552285c0af01b37303d6, 00bfde723b643b4b79b2cf4193fb65ef0b9d47ee
1c6d1e57c15b59114a05b20e667517872510aaa9, dc1e7b73b81811b74a4fbde4c5cfab5a3ea7f820
1ac98044dfc250a0454762eb49f138b1f3f035e7, 1e6630311c4634bcaa2c5b091a9c31874f3309a3
c3f95e9197643b699b891ca416ce7d72cf89f5fc, f3c577ed38e55dca46692313f5b76688a115861a
bdd4dda58b0de08bd3474fb8d4589a9ba9349e88, 5f3c99085d4c2ebf57fd0586b013b02e32a8e20b
df6fb4d392e530fbf9d4e331711c500d47980dcc, 06d06f2f6403066415df7b8854e6aff7586a92df
6d31ee1cea7554fec9f3be6c4a40fc10e1595879, 6c097f73ca032e73b2eb4ec21ee9d0773c86d4ed
d7b0c19823892b2c94a9e347dec880a3531980ff, 7c5222e4d1a3a14f029e5f614c9aefd0fa505f1e
d8a4a2cb93212c493b7b47663c1cda1103de0bfc, dbf8cc7b6625ed3547e49b2faecdde3337b1ea6d
8ab8b3fad7a6e08452e30aaa3a75d6ec89ca5bf2, 8b9df70bf7e7b812715a3dc9772719188e0df06c
551a697c5cf33275b66add4fc467fcf59084cffb, 584e9b6e4b4987b882719923e640eed854613d91
584e9b6e4b4987b882719923e640eed854613d91, dddd590fd0bcc7e3f0cc9769fe3bad8556aa5c8a
d292

<Figure size 1440x864 with 0 Axes>

In [9]:
# Iterative Binary Search Function
# It returns index of x in given array arr if present,
# else returns -1
def binary_search(arr, x):
    low = 0
    high = len(arr) - 1
    mid = 0
 
    while low <= high:
 
        mid = (high + low) // 2
 
        # If x is greater, ignore left half
        if arr[mid][2] < x:
            low = mid + 1
 
        # If x is smaller, ignore right half
        elif arr[mid][2] > x:
            high = mid - 1
 
        # means x is present at mid
        else:
            return mid
 
    # If we reach here, then the element was not present
    return -1

def plot_date():
    # points sort by dates
    points_filtered.sort(key=lambda p: p[2])
    dates = [p[2] for p in points_filtered]
    ys = [p[1] for p in points_filtered]
#     print(dates, ys)

    max_diff = float("-inf")
    last = None
    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > max_diff:
                max_diff = abs(last - ys[i])

    if MODE != "text":
        fig = plt.figure(figsize=(20,12))
        plt.plot(dates, ys)
        plt.xlabel('date')
        plt.ylabel('execution time')
        plt.grid(True)
        plt.show()
        
    last = None
    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > 0.8 * max_diff:
                print("{}, {}".format(points_filtered[binary_search(points_filtered, dates[i - 1])][3]["label"], points_filtered[binary_search(points_filtered, dates[i])][3]["label"]))
    
plot_date()

2b9a834c43cb1f93d33958c14b695896bb4e9c1e, 25531a1d9657897e648d93f776a3abb70e9816ef
b3d7e761e347d562333893652dcf3837fa55d777, 8b9df70bf7e7b812715a3dc9772719188e0df06c
1ac98044dfc250a0454762eb49f138b1f3f035e7, 6c097f73ca032e73b2eb4ec21ee9d0773c86d4ed
efb1cb752bf12149af9c773f702a757ba9721649, 5f3c99085d4c2ebf57fd0586b013b02e32a8e20b
b94c215592bdba915455895b2041398dfb2ac44a, 3475159122b656ff098e2f44af32dc56f3beb610
ea1a1ebbc673d810f1abf6cb58a40b5ec916ff07, 4444b343d7e208e6e6f7ee885d380d90df1c231d
4e1c487004a29ec9bc56fd47fc30336d033c57dd, b627802e81ee5ab176f9e8b318e01f73c1e961b9
b26c953f55d659ed5148f38e34716efb696b5016, 84094fb4fd0bc0c1f621757efc328a6ebcf8b847
84094fb4fd0bc0c1f621757efc328a6ebcf8b847, dddd590fd0bcc7e3f0cc9769fe3bad8556aa5c8a
dddd590fd0bcc7e3f0cc9769fe3bad8556aa5c8a, cca3167de0b6f95916fa9d2338beccb74132e526
d7b0c19823892b2c94a9e347dec880a3531980ff, 7c5222e4d1a3a14f029e5f614c9aefd0fa505f1e
504eee28fe0ff4cf582f623b2ee83e9940e33024, 06d06f2f6403066415df7b8854e6aff7586a92df
63cf

In [10]:
def binary_search(arr, x):
    low = 0
    high = len(arr) - 1
    mid = 0
 
    while low <= high:
 
        mid = (high + low) // 2
 
        # If x is greater, ignore left half
        if arr[mid][0] < x:
            low = mid + 1
 
        # If x is smaller, ignore right half
        elif arr[mid][0] > x:
            high = mid - 1
 
        # means x is present at mid
        else:
            return mid
 
    # If we reach here, then the element was not present
    return -1


def plot_ma_seq():
    
    window = 10  # size of the sliding window
    points_filtered.sort(key=lambda p: p[0])
    ys = []
    for p in range(0, len(points_filtered) - window, 5):
        sample = points_filtered[p:p+window]
        ys.append(sum([s[1] for s in sample])/len(sample))
        
    xs = [i for i in range(len(ys))]
#     ys = points_df["execution time"].groupby(np.arange(len(points_df))//10).mean()
#     xs = [10*i for i in range(len(ys))]
#     plt.style.use('seaborn-dark')
#     plt.style.use("tableau-colorblind10")

    if MODE != "text":

        fig = plt.figure(figsize=(20,12))
        ax1 = plt.plot(xs, ys)
        ax1 = plt.title("Execution Time Moving Average", fontsize=22)
        ax1 = plt.xlabel("", fontsize=18)
        ax1 = plt.ylabel("Moving Average", fontsize=18)
        # ax1 = plt.legend(["100 day SMA"],prop={"size":20}, loc="upper left")
        plt.grid(True)
        plt.show()

    max_diff = float("-inf")
    last = None
    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > max_diff:
                max_diff = abs(last - ys[i])
                
    if path.exists("results/analysis/" + filename + "_analysis_seq.txt"):
        os.remove("results/analysis/" + filename + "_analysis_seq.txt")
        
    last = None
    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > 0.8 * max_diff:
#                 print(xs[i])
#                 print("{}, {}".format(xs[i - 1], xs[i])) TODO
                last_group = [points_filtered[x] for x in range(5*xs[i-1], 5*xs[i-1] + window)]
                curr_group = [points_filtered[x] for x in range(5*xs[i], 5*xs[i] + window)]
                
                with open("results/analysis/" + filename + "_analysis_seq.txt", "a") as f:
                    #. write analysis
                    f.write("first group: {}\nsecond group: {}\n\n".format([p[3]["label"] for p in last_group], [p[3]["label"] for p in curr_group]))
                    
                if last - ys[i] > 0:
                    # graph trending downward
                    last_group_min = max(last_group, key=lambda x: x[1])
                    curr_group_min = min(curr_group, key=lambda x: x[1])
                else:
                    # graph trending upward
                    last_group_min = min(last_group, key=lambda x: x[1])
                    curr_group_min = max(curr_group, key=lambda x: x[1])
                    
                print("{}, {}".format(last_group_min[3]["label"], curr_group_min[3]["label"]))
                
                
plot_ma_seq()

081ae5fe1aa3ead6d9da75747d3698f09ff89cb9, 139a36454f842c7936ba461279e862488e894637
081ae5fe1aa3ead6d9da75747d3698f09ff89cb9, 285dd08b56215840d721961add4a355b60d673a7
dddd590fd0bcc7e3f0cc9769fe3bad8556aa5c8a, 853a2649160c1c80b9bbd38a20b53ca8fab704e8


In [11]:
def plot_ma_date():

    window = 10  # size of the sliding window
    points_filtered.sort(key=lambda p: p[2])
    ys = []
    for p in range(0, len(points_filtered) - window, 5):
        sample = points_filtered[p:p+window]
        ys.append(sum([s[1] for s in sample])/len(sample))
        
    xs = [i for i in range(len(ys))]
    
    fig = plt.figure(figsize=(20,12))
    ax1 = plt.plot(xs, ys)
    ax1 = plt.title("Execution Time Moving Average", fontsize=22)
    ax1 = plt.xlabel("Sequence", fontsize=18)
    ax1 = plt.ylabel("Moving Average", fontsize=18)
    # ax1 = plt.legend(["100 day SMA"],prop={"size":20}, loc="upper left")
    plt.grid(True)
    plt.show()

    max_diff = float("-inf")
    last = None
    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > max_diff:
                max_diff = abs(last - ys[i])
                
    if path.exists("results/analysis/" + filename + "_analysis_time.txt"):
        os.remove("results/analysis/" + filename + "_analysis_time.txt")

    last = None
    for i in range(len(ys)):
        if not last:
            last = ys[i]
        else:
            if abs(last - ys[i]) > 0.8 * max_diff:
#                 print(xs[i])
#                 print("{}, {}".format(xs[i - 1], xs[i])) TODO
                last_group = [points_filtered[x] for x in range(5*xs[i-1], 5*xs[i-1] + window)]
                curr_group = [points_filtered[x] for x in range(5*xs[i], 5*xs[i] + window)]
                
                with open("results/analysis/" + filename + "_analysis_time.txt", "a") as f:
                    #. write analysis
                    f.write("first group: {}\nsecond group: {}\n\n".format([p[3]["label"] for p in last_group], [p[3]["label"] for p in curr_group]))
                    
                if last - ys[i] > 0:
                    # graph trending downward
                    last_group_min = max(last_group, key=lambda x: x[1])
                    curr_group_min = min(curr_group, key=lambda x: x[1])
                else:
                    # graph trending upward
                    last_group_min = min(last_group, key=lambda x: x[1])
                    curr_group_min = max(curr_group, key=lambda x: x[1])
                    
                print("{}, {}".format(last_group_min[3]["label"], curr_group_min[3]["label"]))
                
if MODE != "text":
    plot_ma_date()