In [72]:
from urllib.parse import quote

import pandas as pd
import requests

In [73]:
# Path of the observations catalogue text file (relative to the working
# directory); presumably the input meant for makeAstDataFrame.
OBS_CAT = r"../data/AS_observations_cat_Sept2018.txt"
# Path of the publications catalogue text file; presumably the input
# meant for makeBibDataFrame.
BIB_CAT = r"../data/AS_publications2019-21.txt"

In [74]:
def byteIndexDict(path:str, lines:list, byteC:list, labelC:list)->dict:
    """
    Reads the README file for the byte-by-byte description of the
    dat files and makes a dictionary of data labels and their indices.
    To be used inside a function to retrieve data from dat file and
    make a DataFrame object out of it.

    Lines whose byte field is blank (continuation lines, empty lines)
    and lines whose label is "---" are skipped. Both fields are
    compared after stripping, so the test does not depend on the
    width of the byteC/labelC slices.

    path - path to the README file
    lines - list containing starting and ending index of relevant lines
    byteC - [start, end] character positions of the byte-index field
    labelC - [start, end] character positions of the label field

    Returns a dict mapping each label to [first] or [first, last]
    (1-based byte positions, as written in the README). If a label
    occurs more than once the last occurrence wins.
    Raises ValueError if a non-blank byte field is not made of integers
    separated by "-".
    """
    with open(path,"r") as f:
        descLines = f.readlines()[lines[0]:lines[1]]

    byteDict = {}
    for line in descLines:
        byteField = line[byteC[0]:byteC[1]].strip()
        label = line[labelC[0]:labelC[1]].strip()
        # No byte range means the line only continues an explanation;
        # "---" marks a column that carries no label.
        if not byteField or label == "---":
            continue
        byteDict[label] = [int(byt) for byt in byteField.split("-")]
    return byteDict

In [75]:
# Build the label -> byte-range mapping from ReadMe lines 64..129 (0-based),
# reading the byte field from characters 1..7 and the label from
# characters 23..31 of each line.
testDict = byteIndexDict(r"../data/ReadMe", [64,130], [1,8], [23,32])

In [76]:
def makeObsDataFrame(path:str, desc:dict) -> pd.DataFrame:
    """
    Takes in .dat file and a byte-by-byte description dict
    to yield a DataFrame equivalent of the dat file.

    Every field is cut out of the line by its 1-based inclusive byte
    range and stripped of surrounding whitespace; all values are kept
    as strings. The desc dict is only read, never modified.

    path - path to the dat file
    desc - dict mapping each label to [first] or [first, last]
           byte positions (as produced by byteIndexDict)

    Returns a DataFrame with one row per line of the file and one
    column per label, in the order of desc.
    """
    columns = list(desc)
    # Resolve each label to a 0-based half-open slice once. A single
    # index [b] means the one-byte field b..b; it is expanded here
    # without touching the caller's list.
    spans = []
    for label in columns:
        index = desc[label]
        last = index[1] if len(index) > 1 else index[0]
        spans.append((index[0]-1, last))

    dataArray = []
    with open(path, 'r') as f:
        for line in f:
            # Remove only the line terminator, so the last byte of the
            # line stays available and an unterminated final line is
            # not truncated.
            line = line.rstrip("\r\n")
            dataArray.append([line[start:stop].strip() for start, stop in spans])
    return pd.DataFrame(data=dataArray, columns=columns)

In [77]:
# Parse both catalogue .dat files with the same byte layout (testDict).
# NOTE(review): presumably the low-mass and high-mass X-ray binary
# catalogues, judging by the file names - confirm.
lmxb = makeObsDataFrame(r"../data/lmxbcat.dat",testDict);
hmxb = makeObsDataFrame(r"../data/hmxbcat.dat",testDict);

In [9]:
def _coordsQuery(data: pd.DataFrame, n: int, rad: float, timeout: float = 60.0):
    """
    Transient function to be used inside coordinatesQuery and
    _coordinatesQuery for abstraction purposes. Sends one coordinate
    query to SIMBAD for the n-th row of data and returns the raw
    ASCII response text.

    data - DataFrame with the columns RAh, RAm, RAs, DE-, DEd, DEm, DEs
    n - positional row index of the object to query
    rad - the radius used in SIMBAD queries
    timeout - seconds to wait for the server before requests raises
              a Timeout exception (without it a stalled connection
              would block forever)
    """
    line = data.iloc[n]
    PREFIX = r"http://simbad.u-strasbg.fr/simbad/sim-coo?output.format=ASCII&Coord="

    def encoded(key):
        # Percent-encode each field; in particular a "+" declination
        # sign must be sent as %2B, since a bare "+" in a query string
        # is decoded as a space.
        return quote(str(line[key]), safe="")

    coord = r"{}%20{}%20{}%20{}{}%20{}%20{}".format(
        *(encoded(key) for key in ("RAh", "RAm", "RAs", "DE-", "DEd", "DEm", "DEs")))
    query = requests.get("{}{}&Radius={}".format(PREFIX, coord, rad), timeout=timeout)
    return query.text

In [10]:
def coordinatesQuery(data:pd.DataFrame, n:int, maxIter:int=100):
    """
    Queries SIMBAD for data at n-th row in data object
    by choosing a suitable radius of query so as to
    obtain the unique result.

    The radius starts at 0.01. It grows by two steps of 0.005 when
    nothing is found and shrinks by one step when several objects
    are listed. Those two moves can alternate without ever settling,
    so the number of re-queries is capped by maxIter.

    data - DataFrame object of the observation data
    n - row index of object to query
    maxIter - maximum number of radius adjustments before giving up

    Returns the response text of the last query. If maxIter is
    exhausted this may still be a "not found" or multi-object
    response.
    """
    rad = 0.01
    STEP = 0.005

    def isFiltered(inp):
        # Signed number of STEPs to move the radius by; 0 means the
        # response needs no further refinement.
        if inp.startswith("!! No astronomical object found :"):
            return 2
        sections = inp.split("\n\n")
        # A response with fewer than three sections cannot be a
        # multi-object listing; hand it back unchanged.
        if len(sections) > 2 and sections[2].startswith("Number of objects :"):
            return -1
        return 0

    query = _coordsQuery(data, n, rad)
    for _ in range(maxIter):
        adjust = isFiltered(query)
        if not adjust:
            break
        rad += STEP*adjust
        query = _coordsQuery(data, n, rad)
    return query

In [40]:
def _idQuery(id:str, timeout:float=60.0):
    """
    Transient function to be used inside _coordinatesQuery
    for abstraction purposes. Sends one identifier query to SIMBAD
    and returns the raw ASCII response text.

    id - identifier of the object to look up (the parameter name
         shadows the builtin but is kept so existing callers work)
    timeout - seconds to wait for the server before requests raises
              a Timeout exception
    """
    PREFIX = r"http://simbad.u-strasbg.fr/simbad/sim-id?"
    FORMAT = r"output.format=ASCII&Ident="
    # Identifiers may contain spaces, "+", "&" or "#"; percent-encode
    # them so the whole identifier reaches the server as one value.
    query = requests.get(PREFIX+FORMAT+quote(id, safe=""), timeout=timeout)
    return query.text

In [60]:
def _coordinatesQuery(data:pd.DataFrame, n:int, flag) -> str:
    """
    Queries SIMBAD for data at n-th row in data object, adapting the
    query radius until the response is neither a "!" message nor a
    listing whose third section starts with "N", and returns that
    response text.

    The radius starts at 0.01 and moves in multiples of STEP (0.001):
    up after a response starting with "!", down after a response whose
    third blank-line-separated section starts with "N". The first three
    moves in a direction use one STEP; later ones use the square of
    that direction's running count. When a downward move is due and
    more than 15 of them have been counted, an identifier is read from
    the current listing and looked up with _idQuery instead.

    data - DataFrame object of the observation data
    n - row index of object to query
    flag - when truthy, the new radius is printed before each re-query
    """
    rad = 0.01
    STEP = 0.001
    query = _coordsQuery(data, n, rad)
    # count[0]: responses starting with "!"; count[1]: responses whose
    # third section starts with "N".
    count = [0,0]
    factor = 0
    shdLoop = True
    while(shdLoop):
        # Presumably the "!! No astronomical object found :" reply that
        # coordinatesQuery tests for - widen the radius.
        if query[0]=="!":
            count[0]+=1
            if count[0] > 3:
                factor = count[0]**2
            else:
                factor = 1
        # Presumably the "Number of objects :" header of a multi-object
        # listing - narrow the radius.
        elif query.split("\n\n")[2][0]=="N":
            count[1]+=1
            if count[1] > 3:
                # Unary minus binds looser than **: this is -(count[1]**2).
                factor = -count[1]**2
            else:
                factor = -1
        else:
            # Neither pattern matched: keep this response. The continue
            # re-tests the loop condition, which is now False.
            shdLoop = False
            continue
        if count[1] > 15 and factor < 0:
            # Fourth section, third line, third "|"-separated column.
            # NOTE(review): presumably the identifier of the first listed
            # object - confirm against SIMBAD's ASCII listing layout.
            identifier = query.split("\n\n")[3].split("\n")[2].split("|")[2].strip()
            query = _idQuery(identifier)
            break
        rad += STEP*factor
        # NOTE(review): multiplying a negative radius by the positive STEP
        # leaves it negative (only smaller in magnitude) - confirm intent.
        if rad < 0:
            rad = rad*STEP
        if flag:
            print(rad)
        query = _coordsQuery(data, n, rad)
    return query

In [9]:
def getIdentifier(data:pd.DataFrame, n:int):
    """
    Collects every identifier SIMBAD lists for the object of interest.

    The response of _coordinatesQuery is split on blank lines; in each
    section that starts with "Identifiers", every line after the first
    is read as three fixed-width cells of 32 characters. Empty cells
    are dropped.

    data - DataFrame object containing the observation data
    n - the object of observation in interest
    """
    CELL_WIDTH = 32
    CELLS_PER_LINE = 3
    response = _coordinatesQuery(data, n, 0)
    names = []
    for section in response.split("\n\n"):
        if not section.startswith("Identifiers"):
            continue
        for row in section.split("\n")[1:]:
            for cell in range(CELLS_PER_LINE):
                name = row[cell*CELL_WIDTH:(cell+1)*CELL_WIDTH].strip()
                if name != "":
                    names.append(name)
    return names

In [10]:
def getBibcodes(data:pd.DataFrame, n:int):
    """
    Collects every bibcode SIMBAD lists for the object of interest.

    The response of _coordinatesQuery is split on blank lines; in each
    section that starts with "Bibcodes", every line after the first
    is read as four fixed-width cells of 21 characters. Empty cells
    are dropped.

    data - DataFrame object containing the observation data
    n - the object of observation in interest
    """
    CELL_WIDTH = 21
    CELLS_PER_LINE = 4
    response = _coordinatesQuery(data, n, 0)
    bibcodes = []
    for section in response.split("\n\n"):
        if not section.startswith("Bibcodes"):
            continue
        for row in section.split("\n")[1:]:
            for cell in range(CELLS_PER_LINE):
                code = row[cell*CELL_WIDTH:(cell+1)*CELL_WIDTH].strip()
                if code != "":
                    bibcodes.append(code)
    return bibcodes

In [40]:
def makeAstDataFrame(path:str) -> pd.DataFrame:
    """
    Used to obtain a DataFrame object of the astrosat
    catalog of observations.

    Each non-blank line is split on tabs. The first tab field is
    dropped, fields 1-5 are kept, field 6 is split on ":: " and the
    remaining fields follow. When field 6 contains no ":: " an empty
    string is put in front of it, so that its text still lands in
    column 6 and the later columns stay aligned. (The previous check
    compared a string with None, which never matched, so such rows
    ended up shifted one column to the left.)

    path - path to the text file containing the catalog

    Returns a DataFrame with integer column labels and string values.
    Raises IndexError for a non-blank line with fewer than seven
    tab-separated fields.
    """
    astArray = []
    with open(path, "r") as f:
        content = f.read().strip()
    for line in content.split("\n"):
        line = line.strip()
        if not line:
            # Blank lines carry no record.
            continue
        datas = line.split("\t")
        parts = datas[6].split(":: ")
        if len(parts) == 1:
            parts.insert(0, "")
        astArray.append(datas[1:6] + parts + datas[7:])
    return pd.DataFrame(data=astArray)

In [None]:
def isObserved(data:pd.DataFrame, obs:pd.DataFrame, n:int) -> bool:
    """
    Used to check if the object of interest is observed by Astrosat.

    Returns True as soon as any identifier of the object appears in
    column 6 of obs, and False when none does (including when the
    object has no identifiers at all). Previously only the first
    identifier was ever compared.

    data - the DataFrame of observations
    obs - the DataFrame of observations by Astrosat
    n - the object of interest
    """
    # Build the lookup once instead of once per identifier.
    targets = set(obs[6])
    return any(identifier in targets for identifier in getIdentifier(data, n))

In [35]:
def makeBibDataFrame(path:str) -> pd.DataFrame:
    """
    Builds a DataFrame of the astrosat publication catalog.

    The file is read as UTF-8 and cut into papers at every run of two
    blank lines. Of each paper only the first three lines and the last
    line are used, giving the columns title, authors, bibcode and url.

    path - path to the text file containing the catalog
    """
    with open(path, "r", encoding='utf-8') as f:
        catalog = f.read().strip()

    records = []
    for entry in catalog.split("\n\n\n"):
        rows = entry.split("\n")
        picked = rows[:3]+[rows[-1]]
        # Drop the 7- and 9-character lead-ins of the first two lines.
        title = picked[0].strip()[7:].strip()
        authors = picked[1].strip()[9:].strip()
        bibcode = picked[2].strip().replace("Bibliographic Code:","").strip()
        # Remove the opening of the anchor tag, then the 26 characters
        # that follow the address at the end of the line.
        link = picked[3].strip().replace("URL: <a href=\"","")
        url = link[:-26].strip()
        records.append([title, authors, bibcode, url])
    return pd.DataFrame(data=records,columns=["title","authors","bibcode","url"])

In [None]:
def isReferred(data:pd.DataFrame, bib:pd.DataFrame, n:int) -> bool:
    """
    Used to check if the object of interest is referred in any
    of the papers in the catalog.

    Returns True as soon as any bibcode of the object appears in the
    "bibcode" column of bib, and False when none does (including when
    the object has no bibcodes at all). Previously only the first
    bibcode was ever compared.

    data - the DataFrame of observations
    bib - the DataFrame of publications referring astrosat data
    n - the object of interest
    """
    # Build the lookup once instead of once per bibcode.
    known = set(bib["bibcode"])
    return any(bibcode in known for bibcode in getBibcodes(data, n))

In [None]:
def combDataFrames(data:pd.DataFrame, bib:pd.DataFrame, obs:pd.DataFrame) -> pd.DataFrame:
    """
    Adds two boolean columns to the original data frame:
    "isObserved" (result of isObserved for each row) and
    "isReferred" (result of isReferred for each row).

    The columns are written into data itself, and that same
    object is returned.
    """
    observedFlags, referredFlags = [], []
    for row in range(len(data)):
        # Both checks run row by row, observation first.
        observedFlags.append(bool(isObserved(data, obs, row)))
        referredFlags.append(bool(isReferred(data, bib, row)))
    data["isObserved"] = observedFlags
    data["isReferred"] = referredFlags
    return data

In [None]:
# Load both catalogues through the path constants defined at the top of
# the file, so these paths agree with the "../data" location used for
# every other input file.
obs = makeAstDataFrame(OBS_CAT)
bib = makeBibDataFrame(BIB_CAT)