In [4]:

import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns 
import math 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
sys.path.append('../Functions')
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import DBSCAN

In [11]:
data = 'DM2425_ABCDEats_DATASET.csv'
df = pd.read_csv(data)

In [18]:
fixed_columns = ['customer_id', 'customer_region', 'customer_age']

for i in range(7):  #DOW_0 a DOW_6
    selected_columns = fixed_columns + [f'DOW_{i}']
    cluster_df = df[selected_columns]
    print(cluster_df.head())

selected_columns = fixed_columns + [f'DOW_{i}']

cluster_df = df[selected_columns]

  customer_id customer_region  customer_age  DOW_0
0  1b8f824d5e            2360          18.0      1
1  5d272b9dcb            8670          17.0      1
2  f6d1b2ba63            4660          38.0      1
3  180c632ed8            4660           NaN      0
4  4eb37a6705            4660          20.0      0
  customer_id customer_region  customer_age  DOW_1
0  1b8f824d5e            2360          18.0      0
1  5d272b9dcb            8670          17.0      0
2  f6d1b2ba63            4660          38.0      0
3  180c632ed8            4660           NaN      1
4  4eb37a6705            4660          20.0      1
  customer_id customer_region  customer_age  DOW_2
0  1b8f824d5e            2360          18.0      0
1  5d272b9dcb            8670          17.0      0
2  f6d1b2ba63            4660          38.0      0
3  180c632ed8            4660           NaN      0
4  4eb37a6705            4660          20.0      0
  customer_id customer_region  customer_age  DOW_3
0  1b8f824d5e            2360  

In [20]:
# Agregar o total de pedidos por dia da semana
day_totals = df[['DOW_' + str(i) for i in range(7)]].sum()
print("Pedidos totais por dia da semana:")
print(day_totals)

# Analisar a distribuição de regiões
region_counts = df['customer_region'].value_counts()
print("Distribuição por regiões:")
print(region_counts)

# Identificar a idade média e a faixa etária predominante
median_age = df['customer_age'].median()
age_distribution = df['customer_age'].value_counts(bins=5, sort=False)
print(f"Idade mediana: {median_age}")
print("Distribuição de idades:")
print(age_distribution)


Pedidos totais por dia da semana:
DOW_0    17727
DOW_1    18096
DOW_2    18846
DOW_3    19753
DOW_4    21612
DOW_5    20822
DOW_6    22457
dtype: int64
Distribuição por regiões:
customer_region
8670    9761
4660    9550
2360    8829
2440    1483
4140     857
8370     495
2490     445
-        442
8550      26
Name: count, dtype: int64
Idade mediana: 26.0
Distribuição de idades:
(14.934000000000001, 28.0]    20303
(28.0, 41.0]                   9344
(41.0, 54.0]                   1327
(54.0, 67.0]                    150
(67.0, 80.0]                     37
Name: count, dtype: int64


In [21]:
weekend_orders = day_totals.loc['DOW_0'] + day_totals.loc['DOW_6']
total_orders = day_totals.sum()
weekend_percentage = (weekend_orders / total_orders) * 100
print(f"Pedidos no fim de semana: {weekend_percentage:.2f}% do total")

active_regions = region_counts.head(5)
print("Top 5 regiões mais ativas:")
print(active_regions)

most_frequent_ages = df['customer_age'].value_counts().head(5)
print("Idades mais frequentes:")
print(most_frequent_ages)


Pedidos no fim de semana: 28.84% do total
Top 5 regiões mais ativas:
customer_region
8670    9761
4660    9550
2360    8829
2440    1483
4140     857
Name: count, dtype: int64
Idades mais frequentes:
customer_age
23.0    2361
22.0    2318
24.0    2304
25.0    2262
26.0    2059
Name: count, dtype: int64


In [23]:
# Define metric and categorical features
metric_features = ["customer_age", "DOW_0", "DOW_1", "DOW_2", "DOW_3", "DOW_4", "DOW_5", "DOW_6"]
categorical_features = ["customer_region"]

# Check for missing values in the DataFrame (cluster_df is the relevant subset)
missing_values = cluster_df.isna().sum()

# Display missing values by column
print("Missing values per column:")
print(missing_values)

# Handle missing values if necessary
# Example: Fill missing values in the categorical column
if "customer_region" in categorical_features:
    print("Filling missing values in 'customer_region' with 'Unknown'")
    cluster_df["customer_region"].fillna("Unknown", inplace=True)

# Check for missing values again after handling
print("Missing values after handling:")
print(cluster_df.isna().sum())


Missing values per column:
customer_id          0
customer_region      0
customer_age       727
DOW_6                0
dtype: int64
Filling missing values in 'customer_region' with 'Unknown'
Missing values after handling:
customer_id          0
customer_region      0
customer_age       727
DOW_6                0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cluster_df["customer_region"].fillna("Unknown", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df["customer_region"].fillna("Unknown", inplace=True)


In [24]:
# Calculate the median age
median_age = cluster_df['customer_age'].median()

# Fill missing values with the median
cluster_df['customer_age'].fillna(median_age, inplace=True)

print(f"Filled missing values in 'customer_age' with median: {median_age}")

Filled missing values in 'customer_age' with median: 26.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cluster_df['customer_age'].fillna(median_age, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df['customer_age'].fillna(median_age, inplace=True)


In [30]:
scaler = MinMaxScaler()
cluster_df["customer_age"] = scaler.fit_transform(cluster_df[["customer_age"]])

cluster_df = df[["customer_id", "customer_region", "customer_age"] + [f"DOW_{i}" for i in range(7)]]
print(df.columns)


Index(['customer_id', 'customer_region', 'customer_age', 'vendor_count',
       'product_count', 'is_chain', 'first_order', 'last_order', 'last_promo',
       'payment_method', 'CUI_American', 'CUI_Asian', 'CUI_Beverages',
       'CUI_Cafe', 'CUI_Chicken Dishes', 'CUI_Chinese', 'CUI_Desserts',
       'CUI_Healthy', 'CUI_Indian', 'CUI_Italian', 'CUI_Japanese',
       'CUI_Noodle Dishes', 'CUI_OTHER', 'CUI_Street Food / Snacks',
       'CUI_Thai', 'DOW_0', 'DOW_1', 'DOW_2', 'DOW_3', 'DOW_4', 'DOW_5',
       'DOW_6', 'HR_0', 'HR_1', 'HR_2', 'HR_3', 'HR_4', 'HR_5', 'HR_6', 'HR_7',
       'HR_8', 'HR_9', 'HR_10', 'HR_11', 'HR_12', 'HR_13', 'HR_14', 'HR_15',
       'HR_16', 'HR_17', 'HR_18', 'HR_19', 'HR_20', 'HR_21', 'HR_22', 'HR_23'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster_df["customer_age"] = scaler.fit_transform(cluster_df[["customer_age"]])
