In [1]:
import pandas as pd
from utils import load_raw_data
import json

In [2]:
# Column names of the per-column records collected by the extraction loop
# and written to the traces TSV.
FID, FIELD_ID = 'fid', 'field_id'
TRACE_TYPE = 'trace_type'
IS_XSRC, IS_YSRC = 'is_xsrc', 'is_ysrc'
DATA = 'data'

In [3]:
def get_trace_type(trace):
    """Return the effective type of a trace specification.

    Non-scatter traces keep their declared ``type``. A ``scatter`` trace is
    reported as ``'line'`` when any of the following holds:

    * its ``mode`` is ``'lines'`` or ``'lines+markers'``;
    * it carries a non-empty ``line`` mapping;
    * its ``marker`` has a ``line`` mapping whose ``color`` is not
      ``'transparent'`` (a missing color also counts).

    Args:
        trace: Mapping describing one trace; must support ``.get``.

    Returns:
        The trace type, or ``None`` when ``type`` is missing or falsy.
    """
    kind = trace.get('type')
    if not kind:
        return None
    if kind != 'scatter':
        return kind

    # From here on the trace is a scatter; look for evidence it draws lines.
    if trace.get('mode') in ('lines', 'lines+markers'):
        return 'line'

    line_cfg = trace.get('line')
    if line_cfg and len(line_cfg.keys()) > 0:
        return 'line'

    marker_cfg = trace.get('marker')
    if marker_cfg:
        marker_line = marker_cfg.get('line')
        if marker_line and marker_line.get('color') != 'transparent':
            return 'line'

    return kind

def get_src_uid(src):
    """Return the third colon-separated field of a source reference.

    Args:
        src: Source reference string (presumably ``owner:file:uid`` --
            confirm against the data).

    Returns:
        The text between the second and third colon (or to the end of the
        string when there is no third colon).

    Raises:
        IndexError: If ``src`` has fewer than three colon-separated fields.
    """
    fields = src.split(':')
    return fields[2]

In [5]:
# Build one record per dataset column for every chart in the raw data and
# stream the records, chunk by chunk, into a tab-separated file.
raw_data = load_raw_data()
plots_by_user = 0
users = 0
output_file_name = './traces_data.tsv'
skipped_charts = 0  # charts dropped because their specification could not be processed

for i, chunk in enumerate(raw_data):
    chunk_traces = []

    for chart_num, chart_obj in chunk.iterrows():
        fid = chart_obj.fid
        clean_fid = fid.split(':')[0]

        # Extract columns data from the dataset: take one entry of the parsed
        # table JSON and read its 'cols' mapping.
        table = json.loads(chart_obj.table_data)
        columns = list(table.popitem()[1]['cols'].values())

        # One record per column, keyed by column uid; trace info is filled in below.
        columns_info = {}
        for column in columns:
            uid = column['uid']
            columns_info[uid] = {
                FID: fid,
                FIELD_ID: f'{clean_fid}:{uid}',
                TRACE_TYPE: None,
                IS_XSRC: False,
                IS_YSRC: False,
                DATA: column['data'],
            }

        del columns, table  # drop references to the parsed table early

        # Extract columns outputs: mark which columns feed the x / y axis of
        # a trace and record that trace's type.
        specification = json.loads(chart_obj.chart_data)

        try:
            for trace in specification:
                ttype = get_trace_type(trace)
                for src_key, src_flag in (('xsrc', IS_XSRC), ('ysrc', IS_YSRC)):
                    src = trace.get(src_key)
                    if not src:
                        continue
                    try:
                        column_info = columns_info[get_src_uid(src)]
                    except KeyError:
                        # The trace references a column that is not in the table.
                        continue
                    column_info[src_flag] = True
                    column_info[TRACE_TYPE] = ttype
        except Exception:
            # Malformed specification: skip the whole chart, but keep count so
            # the loss is visible. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still stop the run.
            skipped_charts += 1
            continue
        finally:
            del specification
        chunk_traces.extend(columns_info.values())

    df = pd.DataFrame(chunk_traces, columns=[FID, FIELD_ID, TRACE_TYPE, IS_XSRC, IS_YSRC, DATA])
    # The first chunk truncates the file and writes the header; later chunks
    # append. This keeps the cell idempotent: re-running it no longer appends
    # to the output of a previous run.
    is_first_chunk = i == 0
    df.to_csv(
        output_file_name,
        mode='w' if is_first_chunk else 'a',
        index=False,
        header=is_first_chunk,
        sep='\t',
    )

if skipped_charts:
    print(f'Skipped {skipped_charts} charts whose specification could not be processed')

Loading raw data from ../data/plot_data.tsv


In [43]:
# Reload the exported per-column records from disk and preview the first rows.
df = pd.read_csv('./traces_data.tsv', delimiter='\t')
df.head(n=5)

Unnamed: 0,fid,field_id,trace_type,is_xsrc,is_ysrc,data
0,xiemei:82,xiemei:8168a8,line,False,True,"['.299', '.279', '.259', '.239', '.219', '.199..."
1,xiemei:82,xiemei:30eee6,line,True,False,"['15', '14', '13', '12', '11', '10', '8', '6',..."
2,fldwhanata:0,fldwhanata:659029,scatter,False,True,"[-0.2611915054, -0.4479009152, -1.0383426805, ..."
3,fldwhanata:0,fldwhanata:e1c179,scatter,True,False,"[0.6596783582, -0.286348711, 1.3494841118, 0.8..."
4,mglab035:1,mglab035:75a58b,scatter,False,True,"['Cold', 'Room Temperature', 'Hot']"
