In [2]:
import io, os, re, logging
import requests, tqdm, tarfile, itertools, html, time
import numpy as np
import pandas as pd
import dask.dataframe as dd

import plotly.express as px
from dateparser import parse

pd.set_option('display.max_rows', 20)

# Load all public shot IDs by iterating through pages from database

In [4]:
# Load list of shots
pages_df = pd.read_csv("../data/raw/visualizercoffee_shot_ids_2021-12-18.csv")

pages_df

Unnamed: 0,clock,id
0,1636526780,14899484-a5f8-4ccc-b372-79700c8150ae
1,1636526752,76f936ee-5147-40ac-986e-f0f1be5ac97e
2,1636526738,8853ae7c-5bf8-4194-93dd-462de63f1471
3,1636526681,a4d7a358-fc3f-451c-92f5-756adaa4a7dd
4,1636526665,3ebcfff9-fb6f-47f6-a3df-affe01d55150
...,...,...
73213,1274388447,a68afc41-645d-46c6-bfce-3dc9bf95ed0b
73214,1274388290,a37b6160-3e6a-4860-97cc-6b75a035c0da
73215,1274387176,3849dd2f-1678-488b-8f76-b166f53fe7ed
73216,1274386903,5a5d218e-7d84-4a8c-9621-f3067b0b8fb5


In [5]:
# n_pages = 7321
data_pages = []

def get_page_json(page_num):
    response = requests.get(f'https://visualizer.coffee/api/shots?page={page_num}',)
    obj = response.json()
    return obj["data"]

for page_num in tqdm.tqdm(range(7321, 8631+1)):
    data = get_page_json(page_num)
    data_pages.extend(data)

new_pages_df = pd.DataFrame.from_dict(data_pages).drop_duplicates()

# Multi-worker 
# npartitions = 24
# pages = pd.Series(range(1589, n_pages+1))

# rest_df = dd.from_pandas(pages, npartitions=npartitions).apply(lambda pages: get_page_json(page), meta=list).compute()
pages_df = pd.concat([pages_df, new_pages_df], axis=0).drop_duplicates()
pages_df.to_csv("../data/raw/visualizercoffee_shot_ids_2021-12-18.csv", index=False)

In [7]:
def get_shot_json(shot_id):
    response = requests.get(f'https://visualizer.coffee/api/shots/{shot_id}/download',)
    obj = response.json()
    return obj

In [8]:
def json_to_series(data):
    if "error" in data: return False
    
    shot_df = pd.DataFrame({k:v for k,v in data.items() if k not in ["data", "timeframe"]}, 
                           index=pd.Index([data["id"]], name="id"))
    
    shot_df = pd.DataFrame.from_dict({k:v for k,v in data.items() if k not in ["data", "timeframe"]}, orient="index")[0]

    timeframe = pd.Index(data["timeframe"]+[data["timeframe"][-1]], name="timeframe").astype(float)
    shot_df["timeframe"] = timeframe.tolist()

    series_df = pd.DataFrame(data["data"]).astype(float)
    
    for key, values in series_df.to_dict(orient='list').items():
        shot_df[key] = np.array(values)

    return shot_df

In [12]:
shot_series = []

for shot_id in tqdm.tqdm(new_pages_df["id"]):
    data = get_shot_json(shot_id)
    
    try:
        shot_json = json_to_series(data)
        if not isinstance(shot_json, pd.Series): 
            continue
        shot_series.append(shot_json)
        
    except:
        logging.info(f"error for shot {shot_id}")

        
new_shots_df = pd.concat(shot_series, axis=1).T

100%|███████████████████████████████████| 13109/13109 [2:03:02<00:00,  1.78it/s]


In [22]:
new_pages_df["id"]

Unnamed: 0,clock,id
0,1618494554,65851316-b83e-4676-a072-c34b6b572a0b
1,1618494483,2c2bad25-d6f6-4058-a571-1c5ae925ebe9
2,1618494266,16d5f8e1-a087-45b8-a623-bdec4445e9a8
3,1618493724,3ddb33ba-1754-4bd2-b1b6-e1969d5ac240
4,1618493724,fe266d43-86f4-4e0e-8cda-243ef98eb395
...,...,...
13105,1274388447,a68afc41-645d-46c6-bfce-3dc9bf95ed0b
13106,1274388290,a37b6160-3e6a-4860-97cc-6b75a035c0da
13107,1274387176,3849dd2f-1678-488b-8f76-b166f53fe7ed
13108,1274386903,5a5d218e-7d84-4a8c-9621-f3067b0b8fb5


In [21]:
combined_shots_df = pd.concat([shots_df, new_shots_df], axis=0).drop_duplicates(subset=["id"])

combined_shots_df.shape, combined_shots_df["id"].nunique()

((73202, 36), 73202)

In [19]:
shots_df = pd.read_parquet("../data/raw/visualizercoffee_shot_series.parquet",)
shots_df.to_parquet("../data/raw/visualizercoffee_shot_series_2021-12-18.parquet", index=False)
shots_df.shape

72877

# Import raw downloaded data

In [18]:
shots_df = pd.read_parquet("../data/raw/visualizercoffee_shot_series.parquet",)
shots_df.shape

(72877, 34)

In [35]:
shots_df = combined_shots_df
shots_df.shape

(73202, 36)

# Clean data

## Datetime

In [36]:
shots_df["start_time"] = pd.DatetimeIndex(shots_df["start_time"])
shots_df["start_time"] = shots_df["start_time"].dt.tz_localize(None)

In [37]:
def parse_date(x):
    if not isinstance(x, str) or len(x) < 4:
        return None        
    try: 
        dt = parse(x)
        timestamp = pd.Timestamp(dt)
    except:
        return None
    
    return timestamp
shots_df["roast_date"] = shots_df["roast_date"].apply(parse_date)

  date_obj = stz.localize(date_obj)
  date_obj = tz.localize(date_obj)


In [38]:
shots_df["bean_age"] = shots_df["start_time"] - shots_df["roast_date"]

idx_A = shots_df["bean_age"] < pd.Timedelta(10, unit="minutes")
shots_df["bean_age"][idx_A] = None

idx_B = shots_df["bean_age"] > pd.Timedelta(200, unit="days")
shots_df["bean_age"][idx_B] = None

shots_df["bean_age"] = shots_df["bean_age"] / pd.to_timedelta(1, unit='D')

shots_df.loc[idx_A | idx_B]["roast_date"] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()



### Timeframe series data

In [39]:
# Only needed when timeframe not aligned

def shift_timeframe_step(time_index, values, id=None):
    time_index = np.roll(time_index, shift=-1)
    time_index[-1] = time_index[-2]
    
    if len(time_index) - len(values) == 1:
        time_index = time_index[:-1]
        
    if len(time_index) != len(values):
        time_index = time_index[:len(values)]
    
    return time_index

shots_df["timeframe"] = shots_df[["timeframe", "espresso_flow", "id"]].apply(
    lambda x: shift_timeframe_step(x.timeframe, x.espresso_flow, x.id), axis=1)

In [41]:
shots_df["shot_time"] = shots_df["timeframe"].apply(max)

## Numericals

In [42]:
shots_df["drink_tds"] = shots_df["drink_tds"].str.replace(r"[^0-9.]+", '').replace({"": None})
shots_df["drink_ey"] = shots_df["drink_ey"].str.replace(",", ".").str.replace(r"[^0-9.]+", '').replace({"": None, "20210113174207": None})

  """Entry point for launching an IPython kernel.
  


In [40]:
shots_df["bean_weight"] = shots_df["bean_weight"].str.replace(",", ".").str.extract(r"(\d*\.\d+|\d+)")[0]
shots_df["drink_weight"] = shots_df["drink_weight"].str.replace(",", ".").str.extract(r"(\d*\.\d+|\d+)")[0]

In [43]:
shots_df.loc[shots_df["espresso_enjoyment"] > 100, "espresso_enjoyment"] = None

In [67]:
shots_df["drink_weight"].astype(float, errors='ignore')

0     35.2
1     36.1
2     40.3
3    124.5
4      0.0
     ...  
0        0
0        0
0        0
0        0
0        0
Name: drink_weight, Length: 73202, dtype: object

## Caterogrical fields

In [44]:
shots_df["profile_title"] = shots_df["profile_title"].replace({"": None})

In [60]:
shots_df.astype({
    # "drink_tds": float,
    # "drink_ey": float,
    # "bean_weight": float,
    "drink_weight": float,
    # "espresso_enjoyment": float,
})

ValueError: could not convert string to float: 

In [51]:
shots_df.dtypes.to_dict()

{'id': dtype('O'),
 'profile_title': dtype('O'),
 'user_id': dtype('O'),
 'drink_tds': dtype('O'),
 'drink_ey': dtype('O'),
 'espresso_enjoyment': dtype('O'),
 'bean_weight': dtype('O'),
 'drink_weight': dtype('O'),
 'grinder_model': dtype('O'),
 'grinder_setting': dtype('O'),
 'bean_brand': dtype('O'),
 'bean_type': dtype('O'),
 'roast_date': dtype('<M8[ns]'),
 'espresso_notes': dtype('O'),
 'roast_level': dtype('O'),
 'bean_notes': dtype('O'),
 'start_time': dtype('<M8[ns]'),
 'image_preview': dtype('O'),
 'profile_url': dtype('O'),
 'timeframe': dtype('O'),
 'espresso_flow': dtype('O'),
 'espresso_weight': dtype('O'),
 'espresso_pressure': dtype('O'),
 'espresso_flow_goal': dtype('O'),
 'espresso_resistance': dtype('O'),
 'espresso_flow_weight': dtype('O'),
 'espresso_state_change': dtype('O'),
 'espresso_pressure_goal': dtype('O'),
 'espresso_flow_weight_raw': dtype('O'),
 'espresso_temperature_mix': dtype('O'),
 'espresso_water_dispensed': dtype('O'),
 'espresso_temperature_goal':

# Saved cleaned file

In [56]:
time.localtime().__str__()

'time.struct_time(tm_year=2021, tm_mon=12, tm_mday=18, tm_hour=23, tm_min=53, tm_sec=40, tm_wday=5, tm_yday=352, tm_isdst=0)'

In [45]:
shots_df.to_parquet(f"../data/raw/visualizercoffee_{shots_df.index.size}shots_2021-11-11_cleaned.parquet", index=False)

# Show shot data

In [43]:
shot_df["image_preview"].values

array(['https://visualizer-coffee.s3.eu-central-1.amazonaws.com/screenshots/14899484-a5f8-4ccc-b372-79700c8150ae.png'],
      dtype=object)