# Sample Data Extraction
This notebook shows how the sample data was extracted.

In [None]:
#@title Please input your project id
import numpy as np
from google.cloud import bigquery
from google.colab import auth
from google.cloud.bigquery import magics

# Authenticate the Colab user (google.colab.auth) before any BigQuery object is created.
auth.authenticate_user()
print('Authenticated')
# Google Cloud project used for the magics context and the client below;
# the trailing #@param annotation exposes it as an editable Colab form field.
project_id = 'cluster-scheduling-437114' #@param {type: "string"}
# Make the %bigquery cell magic default to the same project.
magics.context.project = project_id

# Client bound to that project; the later cells pass their SQL strings to
# client.query(...) and convert the result with .to_dataframe().
client = bigquery.Client(project=project_id)

Authenticated


## Data extraction and preprocessing

The complete database description is in [Google cluster-usage traces v3](https://drive.google.com/file/d/10r6cnJ5cJ89fPWCgj7j4LtLBqYN9RiI9/view).

To compute the critical path length (CPLen) described in the [Graphene](https://www.usenix.org/system/files/conference/osdi16/osdi16-grandl-graphene.pdf) paper, we need the `collection_events` table.



### Extract the users that have the most complex DAGs

In [None]:
# Rank users by the total number of `start_after_collection_ids` entries across
# their rows in collection_events, and keep the 20 largest totals. The total is
# used here as the measure of how complex a user's DAGs are.
#
# Plain (non-f) string: the query has nothing to interpolate, and an f-string
# would break as soon as the SQL contained a literal brace.
# `user` is added as a secondary sort key so that LIMIT 20 picks the same rows
# on every run even when two users have the same total.
sql_usage = '''
SELECT
    user,
    SUM(ARRAY_LENGTH(start_after_collection_ids)) AS total_start_after_count
FROM `google.com:google-cluster-data`.clusterdata_2019_a.collection_events
WHERE start_after_collection_ids IS NOT NULL
GROUP BY user
ORDER BY total_start_after_count DESC, user
LIMIT 20;
    '''

df_usage = client.query(sql_usage).to_dataframe()
# Last expression of the cell: rendered with the notebook's rich table display
# (at most 20 rows, so showing the whole frame is fine).
df_usage

### One Query Returns ALL Needed Information from Google Cluster

In [None]:
# Build one row per collection owned by TARGET_USER and write it to
# SAMPLE_combination.csv. Each row carries:
#   * start_time / end_time / time_duration from the collection's event times,
#   * average CPU and memory usage from instance_usage,
#   * record_count and the collection's start_after_collection_ids.
#
# The user is supplied as a BigQuery query parameter (@target_user) instead of
# being pasted into the SQL text, so a different user can be extracted by
# changing this one value and no manual quoting is involved.
TARGET_USER = 'oK6/v6yt7HllNcQmV3P6j+1evVLKuGv2+id3XJ0UoAk='

# Collections that have any event with time = 0 are excluded entirely.
# NOTE(review): presumably these are collections whose events predate the
# trace window -- confirm against the trace documentation.
#
# NOTE(review): the join matches on collection_id only, so every event row is
# paired with every usage row of the same collection. record_count therefore
# counts joined rows, not events. It is left as-is so the extracted sample
# stays reproducible; MIN/MAX of events.time are not affected by the duplication.
sql_usage = '''
SELECT
    events.collection_id,
    MIN(events.time) AS start_time,
    MAX(events.time) AS end_time,
    MAX(events.time) - MIN(events.time) AS time_duration,
    AVG(usage.average_usage.cpus) AS avg_cpu_usage,
    AVG(usage.average_usage.memory) AS avg_memory_usage,
    COUNT(events.collection_id) AS record_count,
    ANY_VALUE(events.start_after_collection_ids) AS start_after_collection_ids
FROM `google.com:google-cluster-data`.clusterdata_2019_a.collection_events AS events
JOIN `google.com:google-cluster-data`.clusterdata_2019_a.instance_usage AS usage
    ON events.collection_id = usage.collection_id
WHERE events.collection_id NOT IN (
    SELECT collection_id
    FROM `google.com:google-cluster-data`.clusterdata_2019_a.collection_events
    WHERE time = 0
)
AND events.user = @target_user
GROUP BY
    events.collection_id;
    '''

# Bind @target_user as a STRING parameter for this query job.
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter('target_user', 'STRING', TARGET_USER),
    ]
)

df_usage = client.query(sql_usage, job_config=job_config).to_dataframe()
df_usage.to_csv('SAMPLE_combination.csv', index=False)
# Kept as the last expression so the preview is actually rendered; in the
# middle of the cell its result would be discarded.
df_usage.head()