# Imports

In [1]:
import SciServer
from SciServer import CasJobs  # Communicate between SciServer Compute and CasJobs
from SciServer import Authentication  # Authenticate users
from decouple import config
import pandas as pd
import numpy as np
import datetime
import os

# Authentication

If you have the token, you can authenticate with the following code:

In [4]:
# Look up the stored SDSS_TOKEN setting and register it with SciServer so
# that later CasJobs calls are made as an authenticated user.
token = config('SDSS_TOKEN')
Authentication.setToken(token)

In [5]:
# Assemble the SQL from its clauses; joining on a single space yields the
# same one-line statement as concatenating space-terminated fragments.
myquery = " ".join(
    [
        "select top 10 objid, ra, dec, r",
        "from galaxy",
        "where clean = 1",
    ]
)

# Run the query against DR18 and display the result
df = CasJobs.executeQuery(sql=myquery, context="DR18")
df

Exception: Error when executing query. Http Response from CasJobs API returned status code 401:
{"Error Code":401,"Error Type":"Unauthorized","Error Message":"Authentication failed: Response status code does not indicate success: 500 ().","LogMessageID":"8b2cf32c-6a18-47f5-b7e5-ecd56aad50fc"}

If you don't have a token, one can be generated by logging in:

In [2]:
# Read the credentials from the SDSS_USERNAME / SDSS_PASSWORD settings rather
# than hardcoding them, then log in; `login` returns the session token.
user_name = config('SDSS_USERNAME')
password = config('SDSS_PASSWORD')
token = Authentication.login(user_name, password)

Note that you don't need to set the token separately if you log in this way. If the user name and password are not passed to `login`, you will be prompted to enter them, which is more secure.

In [None]:
# No credentials passed: the user is prompted for them instead
token = Authentication.login()

# Some Functions

Here, some functions can be found to help with the data download.

In [52]:
def job_describer(job_description):
    """Print a CasJobs job-status dictionary in a human-readable form.

    A job that has not started yet (status code 0) only produces a one-line
    notice. For every other status a full summary is printed. Timestamps that
    are missing or ``None`` (for example ``TimeEnd`` of a job that is still
    running) no longer break the duration arithmetic: the affected duration
    is reported as ``N/A``.

    Parameters
    ----------
    job_description : dict
        Dictionary of job status information returned by CasJobs. The key
        ``"Status"`` is required. The keys ``"JobID"``, ``"Target"``,
        ``"Message"``, ``"Created_Table"``, ``"Rows"``, ``"TimeSubmit"``,
        ``"TimeStart"`` and ``"TimeEnd"`` are read if present.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    code_to_status = {
        0: "Not Started",
        1: "Started",
        2: "Cancelling",
        3: "Cancelled",
        4: "Failed",
        5: "Completed",
    }

    status_code = job_description["Status"]
    if status_code == 0:
        print("The Job has not started yet.")
        return

    # Codes outside the table are reported as "Unknown" instead of raising
    status = code_to_status.get(status_code, "Unknown")

    def _parse_time(key):
        # Return the timestamp stored under `key`, or None when it is absent/empty
        raw = job_description.get(key)
        if raw is None:
            return None
        parsed = pd.to_datetime(raw)
        return None if pd.isna(parsed) else parsed

    def _seconds_between(begin, end):
        # Format the elapsed time, or "N/A" when either endpoint is unknown
        if begin is None or end is None:
            return "N/A"
        return f"{(end - begin).total_seconds()} seconds"

    job_ID = job_description.get("JobID")
    target = job_description.get("Target")
    message = job_description.get("Message")
    create_table = job_description.get("Created_Table")
    rows = job_description.get("Rows")
    submit_time = _parse_time("TimeSubmit")
    start_time = _parse_time("TimeStart")
    end_time = _parse_time("TimeEnd")

    print(f"Job ID: {job_ID}")
    print(f"Status: {status}")
    print(f"Target: {target}")
    print(f"Create Table: {create_table}")
    print(f"Rows: {rows}")
    print(f"Message: {message}")
    print(f"Submit Time: {submit_time}")
    print(f"Start Time: {start_time}")
    print(f"End Time: {end_time}")
    print(f"Wait Time: {_seconds_between(submit_time, start_time)}")
    print(f"Run Time: {_seconds_between(start_time, end_time)}")

In [4]:
def tables_format(tables):
    """Summarise the table list returned by `CasJobs.getTables()` as a DataFrame.

    Parameters
    ----------
    tables : iterable of dict
        One dictionary per table, each providing the keys ``"Name"``,
        ``"Rows"``, ``"Size"`` and ``"Date"``. ``"Date"`` is interpreted as a
        count of 100-nanosecond ticks elapsed since 0001-01-01 00:00:00.

    Returns
    -------
    pandas.DataFrame
        One row per table, ordered alphabetically by table name, with the
        columns ``Name``, ``Rows``, ``Size (KB)`` and ``Created`` (creation
        time formatted as ``YYYY-MM-DD HH:MM:SS``). The input is not modified.
    """
    ticks_per_microsecond = 10  # one tick is 100 ns
    first_day = datetime.datetime(1, 1, 1, 0, 0)  # origin of the tick count

    def _created(ticks):
        # Integer microseconds keep the conversion exact; going through float
        # seconds loses precision at this magnitude and can round a timestamp
        # up into the next second.
        elapsed = datetime.timedelta(
            microseconds=int(ticks) // ticks_per_microsecond
        )
        return (first_day + elapsed).strftime("%Y-%m-%d %H:%M:%S")

    ordered = sorted(tables, key=lambda k: k["Name"])  # alphabetize by table name

    return pd.DataFrame(
        {
            "Name": [table["Name"] for table in ordered],
            "Rows": [table["Rows"] for table in ordered],
            "Size (KB)": [table["Size"] for table in ordered],
            "Created": [_created(table["Date"]) for table in ordered],
        }
    )


# Working With CasJobs

## Contexts

CasJobs allows you to search many different datasets, referred to as <strong>contexts</strong> (they are known as contexts rather than databases, so that they can be described independently of the databases in which they are stored). Each context consists of one or more tables containing data or metadata related to a single aspect of the full dataset.

To get a list of contexts, you will need to log in to <a href="http://skyserver.sdss.org/casjobs/" target="_blank">CasJobs</a>. Once you know what context you want to search, you can use the <strong>CasJobs.getTables(context)</strong> function to show the data tables in that context. The function returns a list of dictionaries, one per table, each with:
<ul>
<li><em>Date:</em> the time the table was created, given as the number of 100-nanosecond intervals since 0001-01-01 00:00:00</li>
<li><em>Name:</em> the name of the table</li>
<li><em>Rows:</em> the number of rows in the table</li>
<li><em>Size:</em> the size of the table in kilobytes</li>
</ul>

Let's look at an example:

In [19]:
this_context = 'DR18'   # SDSS Data Release 18

# Fetch the raw table metadata for the chosen context
tables = CasJobs.getTables(context=this_context)

# Convert the raw metadata into a readable DataFrame (note: rebinds `tables`)
tables = tables_format(tables)
tables

Unnamed: 0,Name,Rows,Size (KB),Created
0,AtlasOutline,1222390340,800358592,2017-03-13 16:42:32
1,DBColumns,14012,2552,2022-12-22 18:06:39
2,DBObjects,700,392,2011-12-06 15:44:51
3,DBViewCols,234,72,2011-12-06 15:45:07
4,DataConstants,744,600,2023-01-17 14:57:54
...,...,...,...,...
209,zooMirrorBias,91303,21104,2011-12-06 15:44:48
210,zooMonochromeBias,91303,17968,2011-12-06 15:44:48
211,zooNoSpec,225268,33136,2011-12-06 15:44:48
212,zooSpec,667944,96688,2011-12-06 15:44:48


## Small Queries

Now that you know what contexts (datasets) are available to you, and you know what tables can be found in those contexts, you are ready to write and submit a query to that context. A query is a request for data, written in SQL (Structured Query Language), a programming language designed for efficient database searches.

Once you have written a query, you can get results by running (executing) it in CasJobs. To run a query in CasJobs directly from a Code cell in SciServer Compute, use the <strong><code>CasJobs.executeQuery(sql,...)</code></strong> function. The function takes as input a string containing a properly-formatted SQL query (and optional parameters listed below), and returns a table containing query results (in one of several formats with a default of a <a href="https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html" target="_blank">pandas dataframe</a>).

The <em>sql</em> parameter is required. The <em>context</em> parameter is recommended to explicitly state the context to which the query will be submitted; the default value is 'MyDB'.

Let's look at an example:

In [6]:
# Quick query: ten clean primary photometric objects with position, type and u/g/r columns
q = """select top 10 objid, ra, dec, type, u, g, r
from PhotoPrimary
where clean = 1"""
# No `format` given, so the result comes back in the default format
df = CasJobs.executeQuery(sql=q, context="DR18")
df

Unnamed: 0,objid,ra,dec,type,u,g,r
0,1237660750333018120,66.418117,22.681579,3,23.24054,23.63439,21.88248
1,1237660750333018127,66.300065,22.684366,6,18.28234,16.73006,16.0216
2,1237660750333018129,66.343521,22.685372,6,18.95296,17.18355,16.34626
3,1237660750333018131,66.329857,22.694544,6,17.6304,16.08851,15.35523
4,1237660750333018134,66.304717,22.69495,6,18.76969,16.18015,15.0016
5,1237660750333018135,66.300755,22.695914,3,23.397,24.15585,22.46271
6,1237660750333018138,66.427928,22.697852,6,19.25323,17.68925,16.92312
7,1237667725359906851,82.86383,9.533257,6,22.42331,19.43157,17.79978
8,1237667725359906852,82.864155,9.529703,6,26.47945,25.12807,19.42809
9,1237667725359906853,82.861126,9.519941,6,21.55026,19.91881,19.09043


For schema, have a look [here](https://skyserver.sdss.org/CasJobs/SchemaBrowser.aspx).

We can change the `format` parameter to change the format of result returned.

In [7]:
# Same query as before, but request the result in FITS format
q = """select top 10 objid, ra, dec, type, u, g, r
from PhotoPrimary
where clean = 1"""
fits = CasJobs.executeQuery(sql=q, context="DR18", format="fits")

The `fits` object has a `.read()` method that returns its contents as bytes, which can then be written to a file.

In [12]:
# Write to a path relative to the working directory: a user-specific absolute
# path would fail on any other machine.
fits_path = "res.fits"
with open(fits_path, "wb") as f:  # binary mode, since `read()` yields bytes
    f.write(fits.read())

## Large Queries

The example above shows a quick query. Quick queries are limited to 60 seconds of processing time. For longer queries, CasJobs has a system of <strong>jobs</strong>. When you submit a query to CasJobs, the system creates a job for your query, with a unique <code>jobId</code>. CasJobs then runs this job in the background as server resources permit. When a job completes, it writes the query results into your MyDB personal database space, into a table that you specify. You can later retrieve and use the query results associated with the job by querying that table in your MyDB (that is, by executing a query with <code>context='MyDB'</code>).

To submit a job, use `CasJobs.submitJob(sql, context)`. `context` is the database the query runs against; the results are saved to the table named in the query's `INTO` clause, so the query must include one. Note that the job fails if a table with that name already exists in your MyDB. See below for an example:

In [71]:
# Names used to build the job: the query runs against `context`, and its
# results are written to `table_name` inside `mydb`.
mydb = "MyDB"
context = "DR18"
table_name = "mytable"
# The INTO clause names the table that will hold the job's results
q = f"""--
SELECT TOP 10000 objid, ra, dec, u, g, r, i, z, err_u, err_g, err_r, err_i, err_z, petror90_r
INTO {mydb}.{table_name}
FROM PhotoPrimary
WHERE clean = 1"""
# Show the final SQL text that will be submitted
print(q)

--
SELECT TOP 10000 objid, ra, dec, u, g, r, i, z, err_u, err_g, err_r, err_i, err_z, petror90_r
INTO MyDB.mytable
FROM PhotoPrimary
WHERE clean = 1


In [72]:
# Submit the query as a background job; keep the returned id for status checks
job_id = CasJobs.submitJob(sql=q, context=context)

### Checking Status

When you submit a job to CasJobs, it runs in the background. The time required to finish a job is hard to predict, because it can vary widely based on the efficiency of your query and the current load on the servers that power CasJobs. There are some functions which can be used to check the progress.

The function `CasJobs.waitForJob` blocks while the job is still running and, once the job has finished, returns a dictionary with metadata related to the job.

In [64]:
# Wait for the submitted job and keep the metadata dictionary it returns
job_description = CasJobs.waitForJob(jobId=job_id)

In [65]:
# Display the raw job metadata dictionary
job_description

{'JobID': 60004888,
 'Rows': None,
 'WebServicesID': 1676402349,
 'TimeSubmit': '2023-04-01T07:24:13.337',
 'TimeStart': '2023-04-01T07:24:20.323',
 'TimeEnd': '2023-04-01T07:24:20.443',
 'SendEmail': 0,
 'Status': 4,
 'AutoComplete': 0,
 'Estimate': 500,
 'TaskName': 'SciScript-Python.CasJobs.submitJob',
 'OutputLoc': '',
 'HostIP': 'DR18_long      ',
 'Message': "There is already an object named 'mytable' in the database.",
 'Query': 'DROP TABLE IF EXISTS mytable\nSELECT TOP 10000 objid, ra, dec, u, g, r, i, z, err_u, err_g, err_r, err_i, err_z, petror90_r\nINTO MyDB.mytable\nFROM PhotoPrimary\nWHERE clean = 1',
 'ModifiedQuery': 'DROP TABLE IF EXISTS mytable\nSELECT TOP 10000 objid, ra, dec, u, g, r, i, z, err_u, err_g, err_r, err_i, err_z, petror90_r\n\nFROM PhotoPrimary\nWHERE clean = 1/*CASJOBS_INTO_TABLE:[mydbsql].mydb_1676402349.webuser.mytable*/',
 'Target': 'DR18',
 'OutputType': 'QUERY',
 'Params': '{\r\n  "token": "<redacted>"\r\n}',
 'Created_Table': 

The `CasJobs.getJobStatus` function returns the same dictionary. Both of these functions take the job ID as a parameter.

In [73]:
# Query the current status of the job and display the metadata dictionary
job_description= CasJobs.getJobStatus(job_id)
job_description

{'JobID': 60004924,
 'Rows': 10000,
 'WebServicesID': 1676402349,
 'TimeSubmit': '2023-04-01T07:33:53.97',
 'TimeStart': '2023-04-01T07:33:57.697',
 'TimeEnd': '2023-04-01T07:33:58.15',
 'SendEmail': 0,
 'Status': 5,
 'AutoComplete': 0,
 'Estimate': 500,
 'TaskName': 'SciScript-Python.CasJobs.submitJob',
 'OutputLoc': '',
 'HostIP': 'DR18_long      ',
 'Message': 'Query Complete',
 'Query': '--\nSELECT TOP 10000 objid, ra, dec, u, g, r, i, z, err_u, err_g, err_r, err_i, err_z, petror90_r\nINTO MyDB.mytable\nFROM PhotoPrimary\nWHERE clean = 1',
 'ModifiedQuery': 'SELECT TOP 10000 objid, ra, dec, u, g, r, i, z, err_u, err_g, err_r, err_i, err_z, petror90_r\n\nFROM PhotoPrimary\nWHERE clean = 1/*CASJOBS_INTO_TABLE:[mydbsql].mydb_1676402349.webuser.mytable*/',
 'Target': 'DR18',
 'OutputType': 'QUERY',
 'Params': '{\r\n  "token": "<redacted>"\r\n}',
 'Created_Table': 'mytable'}

As we already wrote a function to help us understand the message, we'll use it:

In [74]:
# Print the job metadata as a readable summary
job_describer(job_description=job_description)

Job ID: 60004924
Status: Completed
Target: DR18
Create Table: mytable
Rows: 10000
Message: Query Complete
Submit Time: 2023-04-01 07:33:53.970000
Start Time: 2023-04-01 07:33:57.697000
End Time: 2023-04-01 07:33:58.150000
Wait Time: 3.727 seconds
Run Time: 0.453 seconds


Now that the query is complete, you can load the dataframe easily:

In [75]:
# Preview the first ten rows of the table the job wrote
q = f"""SELECT TOP 10
*
FROM {table_name}
"""
# The table lives in MyDB, so the query is run with that context
info = CasJobs.executeQuery(sql=q, context=mydb)
info

Unnamed: 0,objid,ra,dec,u,g,r,i,z,err_u,err_g,err_r,err_i,err_z,petror90_r
0,1237660750333018120,66.418117,22.681579,23.24054,23.63439,21.88248,22.34249,21.89589,0.628364,0.541069,0.170031,0.369091,0.743648,2.376178
1,1237660750333018127,66.300065,22.684366,18.28234,16.73006,16.0216,15.70814,15.554,0.012328,0.003998,0.004061,0.00411,0.005306,1.42038
2,1237660750333018129,66.343521,22.685372,18.95296,17.18355,16.34626,15.9623,15.7317,0.017384,0.004475,0.004213,0.004178,0.005673,1.429133
3,1237660750333018131,66.329857,22.694544,17.6304,16.08851,15.35523,15.01609,14.83132,0.009032,0.003612,0.003785,0.003845,0.004392,1.422237
4,1237660750333018134,66.304717,22.69495,18.76969,16.18015,15.0016,14.47354,14.14539,0.015774,0.003674,0.003754,0.003786,0.003928,1.416606
5,1237660750333018135,66.300755,22.695914,23.397,24.15585,22.46271,20.93971,20.07823,0.420967,0.466979,0.167785,0.062532,0.101894,1.724525
6,1237660750333018138,66.427928,22.697852,19.25323,17.68925,16.92312,16.55576,16.35126,0.020776,0.005259,0.004661,0.00458,0.00721,1.439882
7,1237667725359906851,82.86383,9.533257,22.42331,19.43157,17.79978,17.1843,16.7574,1.357948,0.042483,0.010455,0.008886,0.012112,1.703876
8,1237667725359906852,82.864155,9.529703,26.47945,25.12807,19.42809,18.32331,17.85698,1.859318,3.827728,0.039324,0.02112,0.027512,1.393616
9,1237667725359906853,82.861126,9.519941,21.55026,19.91881,19.09043,18.64154,18.53338,0.641612,0.067125,0.030501,0.028958,0.050236,2.158507
