In [1]:
from bs4 import BeautifulSoup
import collections

#BELOW CODE FROM: https://stackoverflow.com/questions/22604564/create-pandas-dataframe-from-a-string
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

import pandas as pd

# Band labels, in the order the other cells use to index per-band page elements.
bands = list("griz")

In [2]:
def get_tf(txt, band):
    """Parse a points string into a two-column DataFrame.

    Every space in ``txt`` becomes a row break, and each resulting row is
    split on commas. The columns are named ``"<band> BJD"`` and
    ``"<band> Flux"``.

    Parameters
    ----------
    txt : str
        Space-separated rows of comma-separated values.
    band : str
        Prefix used for both column names.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    rows = txt.replace(' ', '\n')
    header = band + " BJD," + band + " Flux\n"

    # In-memory text buffer trick from:
    # https://stackoverflow.com/questions/22604564/create-pandas-dataframe-from-a-string
    buffer = StringIO(header + rows)
    return pd.read_csv(buffer, sep=",")

def save_csv(candidate, txts):
    """Parse one points string per band and write them side by side to CSV.

    The previous version called ``get_tf`` without its required ``band``
    argument and indexed the returned DataFrame with ``[0]``, so it could
    never run. Each string is now paired with the band label at the same
    position in the module-level ``bands`` list.

    Parameters
    ----------
    candidate : object
        Identifier used (via ``str``) as the output file name.
    txts : sequence of str
        Points strings, ordered like ``bands``. Entries beyond
        ``len(bands)`` are ignored.

    Raises
    ------
    ValueError
        If ``txts`` is empty, because there is nothing to write.
    """
    frames = [get_tf(txt, band) for txt, band in zip(txts, bands)]
    if not frames:
        raise ValueError("save_csv needs at least one points string")

    # A single column-wise concat keeps every band's columns next to each other.
    result = pd.concat(frames, axis=1)
    result.to_csv("data/"+str(candidate)+".csv")

In [4]:
# Read the saved candidate page from disk.
with open('htmls/DC21badcl.html', 'r') as file:  # r to open file in READ mode
    html = file.read()
# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# is installed (and warns), so the tree could differ between machines. The
# helper functions in this notebook already use 'html.parser'.
soup = BeautifulSoup(html, 'html.parser')

In [5]:
def find_ticks(soup):
    """Collect the tick-label elements of each plot, keyed by band.

    The ``div`` elements with class ``svgplotdiv`` under ``soup.body`` are
    assigned, in document order, to the bands g, r, i, z.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page; only ``soup.body.find_all`` is used.

    Returns
    -------
    dict
        Maps each of ``"g"``, ``"r"``, ``"i"``, ``"z"`` to the result of
        ``find_a_tick`` for its div, or to ``[]`` when the page has no div
        at that position.
    """
    divs = soup.body.find_all('div', attrs={'class': 'svgplotdiv'})
    band_names = ["g", "r", "i", "z"]
    ticks_by_band = {name: [] for name in band_names}

    # zip stops at the shorter sequence, so a page with more plot divs than
    # bands no longer raises IndexError; the surplus divs are ignored.
    for name, div in zip(band_names, divs):
        ticks_by_band[name] = find_a_tick(div)

    return ticks_by_band
#    return {"g": find_a_tick(divs[0]), "r": find_a_tick(divs[1]), "i": find_a_tick(divs[2]), "z": find_a_tick(divs[3])}

def find_a_tick(div):
    """Return every ``text`` element with class ``svgplotticklabel`` in ``div``.

    ``div`` is converted to a string and re-parsed with ``html.parser``
    before the search.
    """
    fragment = BeautifulSoup(str(div), 'html.parser')
    label_filter = {'class': 'svgplotticklabel'}
    return fragment.find_all('text', attrs=label_filter)

def find_scalings(band_ticks):
    """Fit a linear pixel-to-value map for each plot axis from its tick labels.

    Every element of ``band_ticks`` is read through ``.attrs["x"]``,
    ``.attrs["y"]`` (pixel coordinates) and ``.string`` (the printed value).
    Labels that share one ``y`` attribute are taken as the horizontal-axis
    labels, so their ``x`` attribute is the varying pixel position. Labels
    that share one ``x`` attribute are taken as the vertical-axis labels.
    The first two labels found on each axis define the line
    ``value = pixel * slope + intercept``.

    Unlike the earlier version, the second label is chosen from the labels
    on the same axis, not from whatever element happens to follow the first
    one in ``band_ticks``.

    Parameters
    ----------
    band_ticks : sequence
        Tick-label elements of one plot.

    Returns
    -------
    tuple
        ``((x_slope, x_intercept), (y_slope, y_intercept))``: the
        horizontal-axis fit first, then the vertical-axis fit.

    Raises
    ------
    ValueError
        If an axis has fewer than two labels, or its first two labels sit
        at the same pixel position.
    """
    def _fit_axis(shared_attr, moving_attr):
        # Labels of one axis all carry the same value of `shared_attr`.
        counts = collections.Counter(str(tick.attrs[shared_attr]) for tick in band_ticks)
        repeated = [value for value, count in counts.items() if count > 1]
        if not repeated:
            raise ValueError(
                "need at least two tick labels sharing the same '%s' attribute" % shared_attr
            )

        on_axis = [tick for tick in band_ticks if str(tick.attrs[shared_attr]) == repeated[0]]
        first, second = on_axis[0], on_axis[1]

        pixel1, value1 = float(first.attrs[moving_attr]), float(first.string)
        pixel2, value2 = float(second.attrs[moving_attr]), float(second.string)
        if pixel1 == pixel2:
            raise ValueError(
                "tick labels coincide at %s=%r; cannot derive a scale" % (moving_attr, pixel1)
            )

        slope = (value1 - value2) / (pixel1 - pixel2)
        intercept = value2 - slope * pixel2
        return slope, intercept

    x_scale = _fit_axis("y", "x")  # horizontal axis: labels share y, vary in x
    y_scale = _fit_axis("x", "y")  # vertical axis: labels share x, vary in y
    return x_scale, y_scale

In [6]:
#FIND DYS

def find_dys(soup):
    """Collect the error-bar elements of each plot, keyed by band.

    The ``div`` elements with class ``svgplotdiv`` under ``soup.body`` are
    assigned, in document order, to the bands g, r, i, z.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page; only ``soup.body.find_all`` is used.

    Returns
    -------
    dict
        Maps each of ``"g"``, ``"r"``, ``"i"``, ``"z"`` to the result of
        ``find_a_dy`` for its div, or to ``[]`` when the page has no div at
        that position.
    """
    divs = soup.body.find_all('div', attrs={'class': 'svgplotdiv'})
    band_names = ["g", "r", "i", "z"]
    bars_by_band = {name: [] for name in band_names}

    # zip stops at the shorter sequence, so a page with more plot divs than
    # bands no longer raises IndexError; the surplus divs are ignored.
    for name, div in zip(band_names, divs):
        bars_by_band[name] = find_a_dy(div)

    return bars_by_band
    
#    return {"g": find_a_dy(divs[0]), "r": find_a_dy(divs[1]), "i": find_a_dy(divs[2]), "z": find_a_dy(divs[3])}

def find_a_dy(div):
    """Return every ``line`` element with class ``errorbar0`` in ``div``.

    ``div`` is converted to a string and re-parsed with ``html.parser``
    before the search.
    """
    fragment = BeautifulSoup(str(div), 'html.parser')
    bar_filter = {'class': 'errorbar0'}
    return fragment.find_all('line', attrs=bar_filter)

def dy_to_np(dys):
    """Return ``y1 - y2`` for every element of ``dys`` as a NumPy array.

    Each element is read through ``.attrs["y1"]`` and ``.attrs["y2"]``,
    both converted with ``float``. The result keeps the sign of the
    difference.
    """
    starts = np.array([float(bar.attrs["y1"]) for bar in dys])
    ends = np.array([float(bar.attrs["y2"]) for bar in dys])
    return starts - ends

In [7]:
def get_dataset_strs(soup, band):
    """Parse the ``points`` attribute of one band's polyline into a DataFrame.

    The ``polyline`` elements with class ``dataset0`` under ``soup.body``
    are indexed by the position of ``band`` in the module-level ``bands``
    list. The selected element's ``points`` string is handed to ``get_tf``.
    """
    polylines = soup.body.find_all('polyline', attrs={'class': 'dataset0'})
    band_position = bands.index(band)
    return get_tf(polylines[band_position].attrs["points"], band)
    
def scale_data(ts, band, xscale, yscale):
    """Apply a linear rescale to the band's two columns, in place.

    ``"<band> BJD"`` becomes ``col * xscale[0] + xscale[1]`` and
    ``"<band> Flux"`` becomes ``col * yscale[0] + yscale[1]``. The same
    ``ts`` object is modified and returned.
    """
    for suffix, scale in ((" BJD", xscale), (" Flux", yscale)):
        column = band + suffix
        ts[column] = ts[column]*scale[0] + scale[1]

    return ts


In [8]:
import numpy as np

In [10]:
def main(Soup):
    """Extract every band's scaled light curve from a parsed page and save it.

    For each band that has both tick labels and error bars, this derives
    the axis scalings, rescales the polyline points, and appends a
    ``"<band> dy"`` column: the absolute error-bar length multiplied by the
    y-axis slope. All bands are joined column-wise and written to
    ``data/<candidate>.csv``, where ``<candidate>`` is the ``value``
    attribute of the ``input`` element with id ``showcand_candid``.

    Parameters
    ----------
    Soup : BeautifulSoup
        Parsed page. Every lookup uses this argument; the earlier version
        mixed it with the notebook-global ``soup``.

    Returns
    -------
    pandas.DataFrame
        The combined table (empty when no band has data).

    Raises
    ------
    ValueError
        If the page has no ``input`` element with id ``showcand_candid``.
    """
    soup = Soup  # local alias so nothing below falls back to a global

    band_ticks = {band: ticks for band, ticks in find_ticks(soup).items() if ticks != []}
    dys = {band: bars for band, bars in find_dys(soup).items() if bars != []}

    frames = []
    for band in bands:
        # Pick bands by name, not by position, so an empty band in the
        # middle cannot shift the others or cause a KeyError.
        if band not in band_ticks or band not in dys:
            continue

        xs, ys = find_scalings(band_ticks[band])
        df_band = scale_data(get_dataset_strs(soup, band), band, xs, ys)

        # Error bars are pixel lengths; the y-axis slope converts them to
        # data units, and abs() removes the sign of the pixel direction.
        dy = pd.DataFrame({band + " dy": abs(dy_to_np(dys[band]) * ys[0])})

        frames.extend([df_band, dy])

    # One concat at the end instead of growing the frame inside the loop.
    df = pd.concat(frames, axis=1) if frames else pd.DataFrame()

    candidate = soup.find('input', attrs={'id': 'showcand_candid'})
    if candidate is None:
        raise ValueError("page has no <input id='showcand_candid'> element")

    df.to_csv("data/" + candidate.attrs["value"] + ".csv")
    return df

# main(soup)