In [None]:
import pandas as pd

In [None]:
# Load the three raw extracts; every file is semicolon-delimited.
# Only the orders file has its two timestamp columns parsed at read time.
orders_df = pd.read_csv(
    "../data/orders.csv",
    sep=";",
    parse_dates=["confirmed_date", "concluded_date"],
)
sessions_df = pd.read_csv("../data/sessions.csv", sep=";")
marketing_df = pd.read_csv("../data/marketing.csv", sep=";")

# Orders - Data Types

In [None]:
# Preview the first 10 rows of the orders data.
orders_df.head(10)

In [None]:
# Print pandas' summary of the orders columns and the dtypes inferred on load.
orders_df.info()

# Which types of variables do we have in the orders dataframe?

## Datetime

- confirmed_date
- concluded_date


## Boolean

- has_free_delivery
- scheduled


## Float

- delivery_fee
- total_amount
- subsidy_amount
- customer_long
- customer_lat
- merchant_long
- merchant_lat
- lag_last_order
- review_score


## Int

- nps_score
- merchant_zipcode


## Object

- order_id
- order_shift
- order_origin
- device_platform
- device_type
- device_app_version
- centroid_id (id of the customers' centroid)
- state_label (customer's state, i.e. the Brazilian UF)
- city (customer's city)
- district (customer's neighborhood)
- restaurant_id (id of the restaurant)
- dish_type (type of cuisine)
- group_id (id of the group, if the restaurant belongs to one)
- merchant_centroid_id
- merchant_state
- merchant_city
- merchant_district
- account_id
- review_created_at

In [None]:
# Variable type adjustment: cast each group of columns to its intended dtype,
# then reassemble the groups side by side into a single typed frame.

# Text (object) columns become categoricals.
orders_category_df = orders_df.select_dtypes(include="object").astype("category")
# The two integer-coded columns are picked by name instead of through
# select_dtypes(include="int"). Assigning a DataFrame to a list of columns pairs
# them up by position, so dtype-based discovery could silently swap the two
# columns, or fail outright if the set of integer columns ever differs.
orders_category_df[["merchant_zipcode", "nps_score"]] = orders_df[
    ["merchant_zipcode", "nps_score"]
].astype("category")
orders_float_df = orders_df.select_dtypes(include="float")
orders_date_df = orders_df[["confirmed_date", "concluded_date"]]
# "boolean" is pandas' nullable boolean dtype.
orders_bool_df = orders_df[["scheduled", "has_free_delivery"]].astype("boolean")

# axis=1 joins the groups column-wise on their shared row index. The default
# (axis=0) would stack the groups as extra rows, and ignore_index must stay off
# because along axis=1 it would replace the column names with 0..n-1.
orders_treated_df = pd.concat(
    [orders_category_df, orders_float_df, orders_bool_df, orders_date_df],
    axis=1,
)

# Re-typing the columns must not add or drop rows.
assert len(orders_treated_df) == len(orders_df)

In [None]:
# Inspect the dtypes of the re-typed orders frame.
orders_treated_df.info()

In [None]:
# Persist the column -> dtype table as CSV so that other notebooks can load the
# orders data with the same types.
dtypes_df = (
    orders_treated_df.dtypes
    .rename_axis("features")
    .reset_index(name="dtypes")
)
dtypes_df.to_csv("../data/orders-dtype.csv", index=False)

## Marketing - Data Types

For all the other datasets, we'll manually choose the correct data types.

In [None]:
# Print pandas' summary of the marketing columns and the dtypes inferred on load.
marketing_df.info()

In [None]:
# Hand-picked target dtype for each marketing column, keyed by column name.
# "datetime64[ns, UTC]" is a timezone-aware (UTC) datetime dtype.
dtypes_marketing = {
    "account_id": "category",
    "registration_date": "datetime64[ns, UTC]",
    "first_order_date": "datetime64[ns, UTC]",
    "ifood_status": "category",
    "ifood_status_last_month": "category",
    "total_order_count": "int64",
}

# Preview only: the cast result is inspected and then discarded, so
# marketing_df itself keeps the dtypes it was loaded with.
marketing_df.astype(dtypes_marketing).info()

## Session - Data Types

In [None]:
# Current column -> dtype mapping of the sessions data, as a plain dict.
sessions_df.dtypes.to_dict()

In [None]:
# Hand-picked target dtype for each sessions column, keyed by column name.
# "datetime64[ns, UTC]" is a timezone-aware (UTC) datetime dtype.
session_dtypes = {
    "session_id": "category",
    "user_account_uuid": "category",
    # Typed like session_ended instead of "category".
    # NOTE(review): assumes this column holds timestamps in the same format as
    # session_ended - confirm against the raw file.
    "session_started": "datetime64[ns, UTC]",
    "session_ended": "datetime64[ns, UTC]",
    "install_timestamp": "datetime64[ns, UTC]",
    "sum_view_restaurant_screen": "int64",
    "sum_view_dish_screen": "int64",
    "sum_click_add_item": "int64",
    "sum_view_checkout": "int64",
    "sum_callback_purchase": "int64",
    "first_order_has_voucher": "boolean",
    "media_campaign": "category",
    "load_time": "float64",
    "available_restaurants": "float64",
    "rownumber": "int64",
}

In [None]:
# Cast every sessions column to the dtype declared in session_dtypes; the typed
# frame replaces the raw one under the same name.
sessions_df = sessions_df.astype(session_dtypes)

In [None]:
# Inspect the resulting dtypes of the sessions data.
sessions_df.info()

- delivery_fee: has negative values and some extreme values (5000 for a delivery fee is insane)
- total_amount: has some extreme values, but this variable is in monetary units (UM). How do we convert it to reais or dollars?
- subsidy_amount: is the subsidy recorded correctly? Is this variable supposed to come in as negative values? Or should the total value of the order be total_amount - subsidy_amount?
- longitude and latitude seem ok, as does zipcode
- lag_last_order: some clients took 1429 days to order something on ifood, but on average they took 2 to 3 days
- review averages 4.66 and nps averages 9.11.

From these variables, it seems that we need to take a closer look at `delivery_fee`, `total_amount` and `subsidy_amount` to understand these inconsistencies.