In [1]:
import os
import pandas as pd
from google.cloud import bigquery as bq

# ref: https://drive.google.com/file/d/1IHYxjtUo5hQjj81CLBH6RNoQM05oojBF/view

""" Dependencies of note:
- google-cloud
- db-dtypes
"""

' Dependencies of note:\n- google-cloud\n- db-dtypes\n'

In [2]:
# CREDENTIAL
""" google cloud credentials--> https://console.cloud.google.com/projectselector2/apis/credentials?supportedpurview=project&authuser=3
- https://developers.google.com/workspace/guides/create-credentials#google-cloud-console
- gcloud account --> make a project --> service account --> api key for service account
Credential Loc:
https://console.cloud.google.com/iam-admin/serviceaccounts/details/113699365795205553034/keys?authuser=3&project=codegen-404518&supportedpurview=project
--> download (https://pypi.org/project/google-cloud-bigquery/)
"""

cwd = os.getcwd()
secret_dir = "secret/"
api_key = cwd + "/" + secret_dir + os.listdir(secret_dir)[0]
assert api_key[-5:] == ".json"  # confirm that it was found
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = api_key

In [3]:
# From Google BigQuery  (https://pypi.org/project/google-cloud-bigquery/)
client = bq.Client()

# use api key for the session
def run_query(query_string):
    print("Running Query:")
    print(query_string)
    print()
    dataframe = (
        client.query(query_string)
        .result()
        .to_dataframe(
            create_bqstorage_client=True,
        )
    )
    print(dataframe.head())
    return dataframe

## Prelim Tests

In [4]:
# query documentaiton: https://googleapis.dev/nodejs/bigquery/latest/BigQuery.html

# query_string="""
# SELECT f.repo_name, f.path, c.copies, c.size, c.content
#  FROM `bigquery-public-data.github_repos.files` AS f 
#  JOIN `bigquery-public-data.github_repos.contents` AS c 
#  ON f.id = c.id 
#  WHERE 
#  NOT c.binary 
#  AND ((f.path LIKE '%.cbl') 
#  AND (c.size BETWEEN 1
#  AND 500))
# """

query_string="""SELECT f.repo_name, c.content
FROM `bigquery-public-data.github_repos.files` AS f
JOIN `bigquery-public-data.github_repos.contents` AS c
ON f.id = c.id
WHERE
NOT c.binary
AND f.path LIKE '%.py'
AND REGEXP_CONTAINS(c.content, r'(?m)^\s*assert ')
LIMIT 100000"""


# query = """
#     SELECT corpus AS title, COUNT(word) AS unique_words
#     FROM `bigquery-public-data.samples.shakespeare`
#     GROUP BY title
#     ORDER BY unique_words
#     DESC LIMIT 10
# """
# results = bqclient.query(query)

In [51]:
df = run_query(query_string)
init_len = len(df)
df

Running Query:
SELECT f.repo_name, c.content
FROM `bigquery-public-data.github_repos.files` AS f
JOIN `bigquery-public-data.github_repos.contents` AS c
ON f.id = c.id
WHERE
NOT c.binary
AND f.path LIKE '%.py'
AND REGEXP_CONTAINS(c.content, r'(?m)^\s*assert ')
LIMIT 100000

                          repo_name  \
0                        tqchen/tvm   
1                    Lujeni/ansible   
2  lukas-hetzenecker/home-assistant   
3              schnoebe/fedora-mock   
4                  samstav/fastfood   

                                             content  
0  # Licensed to the Apache Software Foundation (...  
1  # (c) 2017 Red Hat Inc.\n#\n# This file is par...  
2  """The tests for the Pilight sensor platform."...  
3  import fcntl\nimport glob\nimport grp\nimport ...  
4  # -*- coding: utf-8 -*-\n# Copyright 2015 Rack...  


Unnamed: 0,repo_name,content
0,tqchen/tvm,# Licensed to the Apache Software Foundation (...
1,Lujeni/ansible,# (c) 2017 Red Hat Inc.\n#\n# This file is par...
2,lukas-hetzenecker/home-assistant,"""""""The tests for the Pilight sensor platform.""..."
3,schnoebe/fedora-mock,import fcntl\nimport glob\nimport grp\nimport ...
4,samstav/fastfood,# -*- coding: utf-8 -*-\n# Copyright 2015 Rack...
...,...,...
99995,aESeguridad/GERE,"# -*- coding: utf-8 -*-\n""""""\n flask.ctx\n ..."
99996,danimajo/pineapple_pdf,"# -*- coding: utf-8 -*-\n""""""\n flask.ctx\n ..."
99997,DebrahR/lab4,"# -*- coding: utf-8 -*-\n""""""\n flask.ctx\n ..."
99998,antotodd/project2,"# -*- coding: utf-8 -*-\n""""""\n flask.ctx\n ..."


In [52]:
# df[df.duplicated(subset=["content"], keep="first")]
df.drop_duplicates(subset=["content"], keep="first", inplace=True) # deleting duplicates
df

Unnamed: 0,repo_name,content
0,tqchen/tvm,# Licensed to the Apache Software Foundation (...
1,Lujeni/ansible,# (c) 2017 Red Hat Inc.\n#\n# This file is par...
2,lukas-hetzenecker/home-assistant,"""""""The tests for the Pilight sensor platform.""..."
3,schnoebe/fedora-mock,import fcntl\nimport glob\nimport grp\nimport ...
4,samstav/fastfood,# -*- coding: utf-8 -*-\n# Copyright 2015 Rack...
...,...,...
99737,raphaelm/django-i18nfield,from i18nfield.admin import I18nModelAdmin\nfr...
99738,fniephaus/alfred-rworkflow,# The MIT License (MIT)\n#\n# Copyright (c) 20...
99748,bgris/ODL_bgris,# -*- coding: utf-8 -*-\r\n#\r\n# Copyright © ...
99751,chrsrds/scikit-learn,"""""""\nTesting for the base module (sklearn.ense..."


In [53]:
print("duplicate ratio = ", (len(df)/init_len))

duplicate ratio =  0.33793


In [11]:
conditionals = dict([[cond, i] for i, cond in enumerate(["==", "!=", "<=", ">=", "<", ">"])])
compounding_statements = ["and"]  # TODO: properly account for OR
bad_statements = ["or"]  # TODO: check for more than just 1 index

def get_assertions(func, is_split=True):
    """
    Format: "assert [expression], [return_string]"
    """
    out = []
    lines = [temp.strip() for temp in func.split('\n') if "assert" in temp and bad_statements[0] not in temp]
    # TODO: maybe save some context around the assertion?
    ind = 0
    while ind < len(lines):
        data = lines[ind].strip()
        # TODO: handle or & and statements!
        start = data.find('assert')
        if start != -1:
            # account for combination statements
            for statement in compounding_statements:
                add_statement = data.find(statement)
                if add_statement != -1:
                    extra_line = data[add_statement+len(statement):]
                    lines.insert(ind+1, "assert "+extra_line)
                    data = data[:add_statement].strip()
            
            com = data.find(',')   # parsing out return_string
            if com != -1:
                data = data[:com]

            if is_split:
                data = [var.strip() for var in data.split()]
                assert data[0] == "assert", "something was found before the assertion in this line"
                data = data[1:]
                
                condition = True  # assertion [variable] == condition by default
                if data[0] == "not":  # accounting for not
                    condition = False
                    data = data[1:]
                    
                assert len(data) >= 1, "empty assertion found?: " + data
                if len(data) == 1:  # adding == to simlify
                    data = data + ["==", str(condition)]
                
                for i in range(len(data)):
                    if data[i] == "is":  # simplifying is to ==
                        data[i] = "=="
                    if data[i] in conditionals.keys():  # com
                        data = [' '.join(data[:i]), data[i], ' '.join(data[i+1:])]  # conditionals[data[i]]
                        break
            
            if len(data) != 3:
                print("Weird assertion found:\n", data, '\n', '\n'.join(lines[ind-1:ind+2]))
                print()
#             assert len(data) == 3, "found conditional-less assertion:\n" + str(data) + '\n' + str(lines[ind-1:ind+2])
            else:
                out.append(data)
        ind += 1
    return out

df["assertions"] = df["content"].apply(lambda code: get_assertions(code))
df

Weird assertion found:
 ['"Setup', 'of', 'domain', 'minio', 'took"', 'in', 'caplog.text'] 
 

Weird assertion found:
 ['"Setup', 'of', 'domain', 'minio', 'took"', 'in', 'caplog.text'] 
 assert minio_client.remove_object.call_args == call("some_bucket", "some_key")
assert "Setup of domain minio took" in caplog.text
assert 1 == len(events)



AssertionError: something was found before the assertion in this line

In [None]:
banned_vars = ['', '*', 'self']
def get_variables(func, verbose=False):
    out = []
    for line in func.split('\n'):
        line = line.strip()
        if "def " in line:  # add params if its a function
            start = line.find('(')
            end = line.find(')')
            for new_param in line[start+1:end].split(','):
                default = new_param.find("=")
                if default != -1:
                    new_param = new_param[:default]
                new_param = new_param.strip()
                if new_param not in out and new_param not in banned_vars:
                    if verbose:
                        print("*Found  {", new_param, "}  at:\n", line, '\n')
                    out.append(new_param)
        else: # add variables if equals operation
            find_var = line.find(' = ')
            if find_var != -1:
                new_var = line[:find_var].strip()
                
                if ',' in new_var: # handle tuple equalities edge case (ex: a, b, c = fn_output())
                    var_list = [tuple_var.strip() for tuple_var in new_var.split(',')]
                else:
                    var_list = [new_var]
                for new_var in var_list:
                    if new_var not in out and new_var not in banned_vars:
                        if verbose:
                            print("**Found  {", new_var, "}  at:\n", line, '\n')
                        out.append(new_var)
            # TODO: handle indexing
    return out

# out = get_variables(df.sample()["content"].iloc[0])
get_vars = lambda code: get_variables(code)
df["variables"] = df["content"].apply(get_vars)
df

## TODO: figure out filtering constraints that make for good assertions
- Goodness Criteria: (what is a good reference to optimize LLMs with
    - addative --> look at some edge case
    - comprehensive --> > 1 assertion in code block
    - fits our schema
        - assert [A,B,C, (int)] [==, >=, <=, !=] [A,B,C, (int)]
    - 

## Observations:
    - columns = 'repo_name', 'ref', 'path', 'mode', 'id', 'symlink_target', 'id_1', 'size', 'content', 'binary', 'copies'
    - variable data for comparison
        - len()
        - element index  (ex: myData[ind])
        - boolean function  (ex: myClass.isValid())
        
        
        
### Ideas: (to improve quality of asserted code collected)
    - order results by quantity of asserted lines to lines of code in the repository
        - order results by optimal variable complexity? 2 < #vars < 6
        - split results by 
    - weight asserted lines by how neatly they fit our schedma (assert a [cond] b)

In [49]:
# save work
save_dir = "BigQuery/PythonAssertions100k.csv"
df.to_csv(save_dir, index=False)
print("Dataframe Saved To", save_dir)
df

Dataframe Saved To BigQuery/VerilogAssertions-ALL.csv


Unnamed: 0,repo_name,content
0,18-341/Router,"`default_nettype none\n`include ""RouterPkg.pkg..."
1,swallat/yosys,"module top (\n input clk, rst,\n output reg ..."
2,xuwenyihust/MapReduce_NoC,/******************FIFO_MUX*******************...
3,TheClams/SystemVerilog,module assertions_test #(parameter SIZE = pa_t...
4,mda-ut/Tempest,// (C) 2001-2013 Altera Corporation. All right...
...,...,...
924,litex-hub/pythondata-cpu-blackparrot,"\n`include ""bp_common_defines.svh""\n`include ""..."
925,litex-hub/pythondata-cpu-blackparrot,"\n`include ""bp_common_defines.svh""\n`include ""..."
927,litex-hub/pythondata-cpu-blackparrot,/**\n * Name:\n * bp_lce_req.v\n *\n * De...
928,litex-hub/pythondata-cpu-blackparrot,"\n`include ""bp_common_defines.svh""\n`include ""..."


In [None]:
pd.read_csv(save_dir)