In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# graph config
# Apply seaborn's default theme first.
sns.set()
# Default (width, height) in inches for figures created after this point.
mpl.rcParams["figure.figsize"] = (16, 8)
# Switch to matplotlib's "ggplot" style sheet. It is applied last, so any
# setting it defines overrides the same setting from the seaborn theme.
plt.style.use("ggplot")

In [None]:
# Single root for every input file, so the data location can be changed in one place.
DATA_DIR = "../data"

# --- Orders ------------------------------------------------------------------
# orders-dtype.csv (default comma separator) maps each orders column, listed
# under "features", to the dtype it should be cast to, listed under "dtypes".
orders_dtypes = pd.read_csv(f"{DATA_DIR}/orders-dtype.csv").set_index(["features"])
dtypes_to_read = orders_dtypes["dtypes"].to_dict()

# Load the raw orders and cast them with the external dtype mapping.
orders_df = pd.read_csv(f"{DATA_DIR}/orders.csv", sep=";").astype(dtypes_to_read)

# --- Marketing ---------------------------------------------------------------
dtypes_marketing = {
    "account_id": "category",
    "registration_date": "datetime64[ns, UTC]",
    "first_order_date": "datetime64[ns, UTC]",
    "ifood_status": "category",
    "ifood_status_last_month": "category",
    "total_order_count": "int64",
}
marketing_df = pd.read_csv(f"{DATA_DIR}/marketing.csv", sep=";").astype(
    dtypes_marketing
)

# --- Sessions ----------------------------------------------------------------
session_dtypes = {
    "session_id": "category",
    "user_account_uuid": "category",
    # Cast as a UTC timestamp like the other session time columns. It was
    # previously cast to "category", which looks like a copy-paste slip and
    # prevents any duration arithmetic against session_ended.
    # TODO confirm the raw values parse cleanly as timestamps.
    "session_started": "datetime64[ns, UTC]",
    "session_ended": "datetime64[ns, UTC]",
    "install_timestamp": "datetime64[ns, UTC]",
    "sum_view_restaurant_screen": "int64",
    "sum_view_dish_screen": "int64",
    "sum_click_add_item": "int64",
    "sum_view_checkout": "int64",
    "sum_callback_purchase": "int64",
    "first_order_has_voucher": "boolean",
    "media_campaign": "category",
    "load_time": "float64",
    "available_restaurants": "float64",
    "rownumber": "int64",
}
session_df = pd.read_csv(f"{DATA_DIR}/sessions.csv", sep=";").astype(session_dtypes)

# Orders

In [None]:
# Summary statistics for every orders column that is neither boolean nor
# categorical, transposed so each column becomes one row.
# NOTE(review): `datetime_is_numeric` only exists in pandas 1.1-1.5; it was
# removed in pandas 2.0, where datetimes are always summarised numerically.
summary_percentiles = [0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
orders_summary = orders_df.describe(
    percentiles=summary_percentiles,
    exclude=["boolean", "category"],
    datetime_is_numeric=True,
)
orders_summary.T

- delivery_fee: has negative values and some extreme values (a delivery fee of 5000 is implausible).
- total_amount: has some extreme values, but this variable is expressed in monetary units (UM). How can it be converted to reais or dollars?
- subsidy_amount: is the subsidy recorded correctly? Should this variable be allowed to take negative values? Or should the total value of the order be total_amount - subsidy_amount?
    - the max value of this variable is also very high and falls within the top 1% of the population.
- longitude and latitude seem OK, and so does zipcode.
- The data covers 9 to 10 months of 2020: January to September or October.
- lag_last_order: some clients took 1429 days to order something on iFood, but on average they took 2 to 3 days.
- review averages 4.66 and NPS averages 9.11.

From these variables, it seems we need to take a closer look at `delivery_fee`, `total_amount` and `subsidy_amount` to understand these inconsistencies.



In [None]:
# Keep every orders column that is neither categorical nor boolean.
orders_numeric_types_df = orders_df.select_dtypes(exclude=["category", "boolean"])
columns = orders_numeric_types_df.columns

# Draw one histogram per remaining column, each on its own figure.
for column in columns:
    fig, ax = plt.subplots()
    sns.histplot(data=orders_numeric_types_df, x=column, ax=ax)
    plt.show()

In [None]:
# Empirical cumulative distribution of lag_last_order.
fig, ax = plt.subplots()
sns.ecdfplot(data=orders_numeric_types_df, x="lag_last_order", ax=ax)
plt.show()

In [None]:
# Frequency of each distinct lag_last_order value, most common first.
lag_last_order_counts = orders_numeric_types_df["lag_last_order"].value_counts()
lag_last_order_counts

In [None]:
# Frequency of each distinct delivery_fee value, most common first.
delivery_fee_counts = orders_numeric_types_df["delivery_fee"].value_counts()
delivery_fee_counts

In [None]:
# Number of orders per state_label, most common first.
state_label_frame = orders_df[["state_label"]]
state_label_frame.value_counts()

In [None]:
# Count orders for every (total_amount, state_label) pair: one row per
# distinct total_amount value, one column per state_label.
# NOTE(review): total_amount looks continuous, so this table may have a very
# large number of rows; binning the amounts first may be more readable.
pd.crosstab(
    index=orders_df["total_amount"],
    columns=orders_df["state_label"],
)

In [None]:
# Average total_amount for each (state_label, district) pair, returned as a
# flat frame with the grouping keys as ordinary columns.
amount_by_district = orders_df.groupby(["state_label", "district"])["total_amount"]
amount_by_district.mean().reset_index()

In [None]:
# Summary of the categorical and boolean orders columns, transposed so each
# column becomes one row.
categorical_summary = orders_df.describe(include=["category", "boolean"])
categorical_summary.T

In [None]:
# Number of order rows per account_id, most frequent accounts first,
# flattened so account_id is an ordinary column.
orders_per_account = orders_df.value_counts(subset=["account_id"])
orders_per_account.reset_index()

In [None]:
# Inspect the marketing record(s) of a single account picked for a closer look.
account_of_interest = "d300ac6b-3fd5-4f24-b80e-a2a0066f925c"
marketing_df.loc[marketing_df["account_id"] == account_of_interest]

Can a single order have a different number of items?

In [None]:
# View lag_last_order next to its account, with rows ordered by account_id so
# each account's orders sit together.
account_lags = orders_df.loc[:, ["account_id", "lag_last_order"]]
account_lags.sort_values(by="account_id")

In [None]:
# Which state_label / district values appear for one specific centroid_id?
centroid_mask = orders_df["centroid_id"] == "-23.6-46.73"
orders_df.loc[centroid_mask, ["state_label", "district"]]

In [None]:
# Print a concise overview of orders_df (columns, dtypes, non-null counts).
orders_df.info()

In [None]:
# List the distinct categories of the categorical account_id column.
orders_df["account_id"].cat.categories

In [None]:
# (rows, columns) of the orders frame.
orders_df.shape

- delivery_fee: has negative values and some extreme values (a delivery fee of 5000 is implausible).
- total_amount: has some extreme values, but this variable is expressed in monetary units (UM). How can it be converted to reais or dollars?
- subsidy_amount: is the subsidy recorded correctly? Should this variable be allowed to take negative values? Or should the total value of the order be total_amount - subsidy_amount?
- longitude and latitude seem OK, and so does zipcode.
- lag_last_order: some clients took 1429 days to order something on iFood, but on average they took 2 to 3 days.
- review averages 4.66 and NPS averages 9.11.

From these variables, it seems we need to take a closer look at `delivery_fee`, `total_amount` and `subsidy_amount` to understand these inconsistencies.

