In [9]:
%%writefile ./SQL/Github_code.sql

# Tabs vs. spaces: for every file in the public GitHub sample, decide whether
# its indented lines mostly start with a tab or with a space, then summarise
# the per-file winners by file extension.

# break out individual lines of code into an array, one element per line
WITH lines_of_code AS (
    SELECT 
        SPLIT(content, "\n") AS line,
        sample_path,
        sample_repo_name
    FROM `bigquery-public-data.github_repos.sample_contents`
),

# let's flatten the array so we can parse it more easily (one row per line)
flattened_lines_of_code AS (
    SELECT 
        flattened_line,
        sample_path,
        sample_repo_name
    FROM lines_of_code, UNNEST(line) AS flattened_line
),

# parse the first character from every line of code
parse_first_character AS (
    SELECT 
        SUBSTR(flattened_line, 1, 1) AS first_character,
        flattened_line,
        sample_path,
        sample_repo_name
    FROM flattened_lines_of_code
),

# keep only the code lines that begin with a tab or a space, and flag which
# of the two it was (exactly one of tab_count / space_count is 1 per row)
tabs_or_spaces AS(
    SELECT 
        first_character,
        IF(REGEXP_CONTAINS(first_character, r"[\t]"), 1, 0) AS tab_count,
        IF(REGEXP_CONTAINS(first_character, r"[ ]"), 1, 0) AS space_count,
        flattened_line,
        sample_path,
        sample_repo_name
    FROM parse_first_character
    WHERE REGEXP_CONTAINS(first_character, r"[ \t]")
),

# aggregate per code file (path + repository):
#   lines        - number of lines that start with a tab or a space
#   tab_winner   - 1 when tab-led lines strictly outnumber space-led lines
#   space_winner - 1 when space-led lines strictly outnumber tab-led lines
#                  (a tie leaves both flags at 0)
#   extension    - text after the last "." in the path, NULL when there is none
# files with at most 10 tab-led lines and at most 10 space-led lines are dropped
tabs_or_spaces_count AS (
    SELECT 
        COUNT(flattened_line) AS lines,
        SUM(tab_count) AS tabs_count,
        SUM(space_count) AS space_count,
        IF(SUM(tab_count) > SUM(space_count), 1, 0) AS tab_winner,
        IF(SUM(tab_count) < SUM(space_count), 1, 0) AS space_winner,
        REGEXP_EXTRACT(sample_path, r"\.([^\.]*)$") AS extension,
        sample_path,
        sample_repo_name
    FROM tabs_or_spaces 
    GROUP BY sample_path, sample_repo_name
    HAVING tabs_count > 10 OR space_count > 10
),

# aggregate all files by code extension (.java etc.)
# lratio is the natural log of (space-winning files + 1) / (tab-winning files + 1);
# the +1 keeps the ratio defined when either side is zero. Positive means
# spaces dominate, negative means tabs dominate.
# ORDER BY + LIMIT here only selects the 100 extensions with the most files;
# it does not fix the order of the final result.
tabs_or_spaces_by_extension AS (
    SELECT 
        extension,
        COUNT(lines) AS files,
        SUM(lines) AS lines,
        SUM(tab_winner) AS tabs,
        SUM(space_winner) AS spaces,
        LOG((SUM(space_winner)+1)/(SUM(tab_winner)+1)) AS lratio
    FROM tabs_or_spaces_count
    GROUP BY extension
    ORDER BY files DESC
    LIMIT 100
)

# Format() for demo readability on screen, don't use otherwise.
# Leave that for Data Studio
SELECT
    extension,
    FORMAT("%d", files) AS files,
    FORMAT("%d", lines) AS lines,
    FORMAT("%d", tabs) AS tabs,
    FORMAT("%d", spaces) AS spaces,
    ROUND(lratio, 5) AS lratio
FROM tabs_or_spaces_by_extension AS by_extension
# Row order is only guaranteed by an ORDER BY on the outermost query. The
# column is qualified with the table alias so the sort uses the numeric file
# count, not the FORMAT()-ed string alias of the same name.
ORDER BY by_extension.files DESC

Writing ./SQL/github_code.sql


In [10]:
%%bash

# Feed the saved query to the bq CLI on stdin, using standard (non-legacy) SQL.
SQL_FILE=./SQL/Github_code.sql
bq query --use_legacy_sql=false < "$SQL_FILE"

+-----------+--------+----------+-------+--------+----------+
| extension | files  |  lines   | tabs  | spaces |  lratio  |
+-----------+--------+----------+-------+--------+----------+
| java      | 238766 | 32464537 | 51501 | 186910 |  1.28901 |
| h         | 123414 | 16480082 | 27431 | 95737  |  1.24991 |
| js        | 115425 | 51585507 | 20478 | 94919  |  1.53363 |
| c         | 108459 | 62654021 | 59154 | 49146  | -0.18535 |
| php       | 82305  | 12923886 | 18628 | 63613  |  1.22811 |
| html      | 81238  | 20811351 | 8924  | 72292  |  2.09187 |
| cs        | 77569  | 11744735 | 15010 | 62523  |  1.42677 |
| json      | 68544  | 15289837 | 3662  | 64882  |   2.8743 |
| py        | 66701  | 11823103 | 2870  | 63827  |  3.10153 |
| cpp       | 65296  | 19463475 | 17467 | 47779  |  1.00624 |
| xml       | 53975  | 15366142 | 9649  | 44303  |  1.52412 |
| rb        | 35732  | 3089709  | 446   | 35286  |  4.36871 |
| cc        | 23304  | 6762131  | 1666  | 21623  |  2.56278 |
| go    

Waiting on bqjob_r7a69d05c17ba7834_00000177eb8d1ea4_1 ... (0s) Current status: DONE   


In [11]:
import os

# Point the Google Cloud client libraries at the service-account key file.
# The path is built from the current user's home directory instead of a
# hardcoded /home/<user> prefix, so the notebook is not tied to one account.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
    os.path.expanduser("~"), ".GCP_SA", "lemon79_key.json"
)

In [12]:
from google.cloud import bigquery

# Build the BigQuery client for this cell.
client = bigquery.Client()

# Fetch every dataset up front (this is the API request), then note the
# project the client is bound to.
datasets = list(client.list_datasets())
project = client.project

# Report the empty case first; otherwise print a header followed by one
# tab-indented dataset id per line.
if not datasets:
    print("{} project does not contain any datasets.".format(project))
else:
    print("Datasets in project {}".format(project))
    print("\n".join("\t{}".format(entry.dataset_id) for entry in datasets))

Datasets in project healthy-spark-305704
	babynames


In [20]:
from google.cloud import bigquery

# Build the BigQuery client for this cell.
client = bigquery.Client()

# Load the SQL text that was saved to disk earlier in the notebook.
with open('./SQL/Github_code.sql', 'r') as sql_file:
    query = sql_file.read()

# Submit the query (this is the API request).
query_job = client.query(query)

# Columns to print for each result row, in display order.
columns = ("extension", "files", "lines", "tabs", "spaces", "lratio")

print("The query data:")
for row in query_job:
    # Row values can be looked up by field name; print them space-separated.
    print(*(row[column] for column in columns))

The query data:
java 238766 32464537 51501 186910 1.28901
h 123414 16480082 27431 95737 1.24991
js 115425 51585507 20478 94919 1.53363
c 108459 62654021 59154 49146 -0.18535
php 82305 12923886 18628 63613 1.22811
html 81238 20811351 8924 72292 2.09187
cs 77569 11744735 15010 62523 1.42677
json 68544 15289837 3662 64882 2.8743
py 66701 11823103 2870 63827 3.10153
cpp 65296 19463475 17467 47779 1.00624
xml 53975 15366142 9649 44303 1.52412
rb 35732 3089709 446 35286 4.36871
cc 23304 6762131 1666 21623 2.56278
go 22872 4382140 22424 436 -3.938
None 20726 5341188 8087 12618 0.44482
m 19729 2502925 1796 17922 2.29997
md 19563 1608360 1885 17662 2.23701
txt 16921 2852997 2552 14361 1.72732
hpp 14857 2357425 2235 12606 1.72956
css 13477 4800271 3303 10164 1.12382
svg 13470 3983778 2405 11064 1.52582
scala 10291 1045848 131 10159 4.34341
yml 9394 1295352 0 9394 9.14793
swift 6774 723382 379 6394 2.8231
htm 6313 821552 945 5368 1.73615
ts 6293 3701710 806 5487 1.917
csproj 6162 803362 15 6147 5