# Feature engineering

In [2]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import os

# Data Load

In [3]:
# Data folder.
# Defaults to the original local location so existing runs behave the same;
# set the PROJETO_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
data_dir = os.environ.get('PROJETO_DATA_DIR', r'C:\Users\win\Desktop\PROJETO\dados')

# File path of the cleaned dataset
file_path = os.path.join(data_dir, 'df_clean.csv')

# Load the dataset
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,customer_id,customer_unique_id,customer_city,customer_state,order_id,order_purchase_timestamp,payment_sequential,payment_type,payment_installments,payment_value,review_score,price,freight_value,product_category_name,macro_categoria
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,franca,sp,00e7ee1b050b8499577073aeb2a297a1,2017-05-16 15:05:35,1,credit_card,2,146.87,4,124.99,21.88,moveis_escritorio,Móveis e Decoração
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,sao bernardo do campo,sp,29150127e6685892b6eab3eec79f59c7,2018-01-12 20:48:24,1,credit_card,8,335.48,5,289.0,46.48,utilidades_domesticas,Casa e Construção
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,sao paulo,sp,b2059ed67ce144a36e2aa97d2c9e9ad2,2018-05-19 16:07:45,1,credit_card,7,157.73,5,139.94,17.79,moveis_escritorio,Móveis e Decoração
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,mogi das cruzes,sp,951670f92359f4fe4a63112aa7306eba,2018-03-13 16:06:38,1,credit_card,1,173.3,5,149.94,23.36,moveis_escritorio,Móveis e Decoração
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,campinas,sp,6b7d50bd145f6fc7f33cebabd7e49d0f,2018-07-29 09:51:30,1,credit_card,8,252.25,5,230.0,22.25,casa_conforto,Móveis e Decoração


# Data Transformation

In [4]:
# Main aggregation: collapse `df` into one row per `customer_unique_id`.
#
# Output columns:
#   customer_city / customer_state -> first value seen for the customer
#   mean_payment_value / mean_freight_value / mean_price_value -> row-wise means
#   purchase_count            -> number of distinct order_id values
#   avg_payment_installments  -> mean installments, rounded to a whole number
#   avg_review_score          -> mean review score, rounded to 1 decimal
#   avg_payment_sequential    -> mean payment_sequential, rounded to 2 decimals
df_customer = (
    df.groupby('customer_unique_id')
    .agg(
        customer_city=('customer_city', 'first'),
        customer_state=('customer_state', 'first'),
        mean_payment_value=('payment_value', 'mean'),
        mean_freight_value=('freight_value', 'mean'),
        mean_price_value=('price', 'mean'),
        # 'nunique' instead of 'count': 'count' tallies rows, so an order that
        # appears on several rows would be counted as several purchases.
        # NOTE(review): assumes one order can span multiple rows - if df_clean
        # is strictly one row per order both aggregations give the same number.
        purchase_count=('order_id', 'nunique'),
        avg_payment_installments=('payment_installments', 'mean'),
        avg_review_score=('review_score', 'mean'),
        # Previously aggregated with 'sum', which contradicted the "avg_" name
        # and made the 2-decimal rounding below meaningless.
        avg_payment_sequential=('payment_sequential', 'mean'),
    )
    .reset_index()
    # Round numerical averages for cleaner analysis (only the listed columns
    # are touched; the other columns keep their full precision).
    .round({
        'avg_payment_installments': 0,
        'avg_review_score': 1,
        'avg_payment_sequential': 2,
    })
)

df_customer.head()

Unnamed: 0,customer_unique_id,customer_city,customer_state,mean_payment_value,mean_freight_value,mean_price_value,purchase_count,avg_payment_installments,avg_review_score,avg_payment_sequential
0,0000366f3b9a7992bf8c76cfdf3221e2,cajamar,sp,141.9,12.0,129.9,1,8.0,5.0,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,osasco,sp,27.19,8.29,18.9,1,1.0,4.0,1
2,0000f46a3911fa3c0805444483337064,sao jose,sc,86.22,17.22,69.0,1,8.0,3.0,1
3,0000f6ccb0745a6a4b88665a16c9f078,belem,pa,43.62,17.63,25.99,1,4.0,4.0,1
4,0004aac84e0df4da2b147fca70cf8255,sorocaba,sp,196.89,16.89,180.0,1,6.0,5.0,1


# Definition of the customer segmentation type

In [10]:
# Segmentation base table: one row per unique customer holding the total
# amount paid plus one 0/1 indicator column per macro category.

# Monetary feature - sum of payment_value over all of the customer's rows.
# NOTE(review): if an order is repeated on several rows of `df`, its
# payment_value is added once per row - confirm against how df_clean is built.
spend_per_customer = df.groupby('customer_unique_id').agg(
    total_payment_value=('payment_value', 'sum')
).reset_index()

# Category features - one dummy column per macro_categoria value ("cat_" prefix).
df_macro = pd.get_dummies(
    df[['customer_unique_id', 'macro_categoria']],
    columns=['macro_categoria'],
    prefix='cat',
)

# max() per customer flags a category as soon as it shows up on any row.
df_macro_encoded = df_macro.groupby('customer_unique_id').max().reset_index()

# Every encoded column except the join key is an indicator column.
indicator_cols = df_macro_encoded.columns.difference(['customer_unique_id'])

df_auxiliar = (
    spend_per_customer
    .merge(df_macro_encoded, on='customer_unique_id', how='left')
    # Customers without a match in the encoded table get 0 everywhere.
    .fillna(0)
    # Convert boolean to integer (0 and 1)
    .astype({col: int for col in indicator_cols})
)

df_auxiliar.head()

Unnamed: 0,customer_unique_id,total_payment_value,cat_Alimentos e Bebidas,cat_Automotivo,cat_Casa e Construção,cat_Eletrodomésticos e Eletrônicos,cat_Indústria e Comércio,cat_Infantil,cat_Lazer e Entretenimento,cat_Moda e Acessórios,cat_Móveis e Decoração,cat_Outros,cat_Papelaria e Escritório,cat_Pet Shop,cat_Saúde e Beleza
0,0000366f3b9a7992bf8c76cfdf3221e2,141.9,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0000b849f77a49e4a4ce2b2a4ca5be3f,27.19,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0000f46a3911fa3c0805444483337064,86.22,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0000f6ccb0745a6a4b88665a16c9f078,43.62,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0004aac84e0df4da2b147fca70cf8255,196.89,0,0,0,1,0,0,0,0,0,0,0,0,0


# Defined through:
* **total payment value**
* **dominant macro category**

# If monetary - Log transformation as shown in the EDA section

In [7]:
# Log-transform monetary column
monetary_cols = ['total_payment_value']

# The transform overwrites the column in place, so running this cell twice on
# the same frame used to apply log1p twice. The columns already transformed
# are recorded on the frame itself (DataFrame.attrs): a freshly rebuilt
# df_auxiliar starts with no marker and is transformed exactly once.
log_done = set(df_auxiliar.attrs.get('log1p_columns', ()))

for col in monetary_cols:
    # Columns missing from the frame are skipped, as before.
    if col in df_auxiliar.columns and col not in log_done:
        df_auxiliar[col] = np.log1p(df_auxiliar[col])
        log_done.add(col)

df_auxiliar.attrs['log1p_columns'] = sorted(log_done)
df_auxiliar.head()

Unnamed: 0,customer_unique_id,total_payment_value,cat_Alimentos e Bebidas,cat_Automotivo,cat_Casa e Construção,cat_Eletrodomésticos e Eletrônicos,cat_Indústria e Comércio,cat_Infantil,cat_Lazer e Entretenimento,cat_Moda e Acessórios,cat_Móveis e Decoração,cat_Outros,cat_Papelaria e Escritório,cat_Pet Shop,cat_Saúde e Beleza
0,0000366f3b9a7992bf8c76cfdf3221e2,4.962145,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0000b849f77a49e4a4ce2b2a4ca5be3f,3.338967,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0000f46a3911fa3c0805444483337064,4.468434,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0000f6ccb0745a6a4b88665a16c9f078,3.798182,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0004aac84e0df4da2b147fca70cf8255,5.287711,0,0,0,1,0,0,0,0,0,0,0,0,0


# Standard scale

In [8]:
# Standard Scale
# 1. Pick every numeric column of df_auxiliar; the 0/1 category indicators are
#    integer columns, so they are standardized together with the monetary one.
data_to_scale = df_auxiliar.select_dtypes(include=['number'])
cols_to_scale = data_to_scale.columns

# 2. Fit the scaler on those columns and transform them in one step.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_to_scale)

# 3. Wrap the scaled array back into a DataFrame with the original column names.
df_scaled = pd.DataFrame(scaled_data, columns=cols_to_scale)

# 4. Build the final frame: the customer identifier first, scaled features after.
df_final = df_scaled.copy()
df_final.insert(
    0,
    'customer_unique_id',
    df_auxiliar['customer_unique_id'].reset_index(drop=True),
)

# Display the result.
df_final.head()

Unnamed: 0,customer_unique_id,total_payment_value,cat_Alimentos e Bebidas,cat_Automotivo,cat_Casa e Construção,cat_Eletrodomésticos e Eletrônicos,cat_Indústria e Comércio,cat_Infantil,cat_Lazer e Entretenimento,cat_Moda e Acessórios,cat_Móveis e Decoração,cat_Outros,cat_Papelaria e Escritório,cat_Pet Shop,cat_Saúde e Beleza
0,0000366f3b9a7992bf8c76cfdf3221e2,0.159775,-0.1015,-0.205104,-0.366791,-0.466317,-0.066013,-0.274984,-0.329069,-0.320372,2.087344,-0.243453,-0.157021,-0.134238,-0.3752
1,0000b849f77a49e4a4ce2b2a4ca5be3f,-1.623352,-0.1015,-0.205104,-0.366791,-0.466317,-0.066013,-0.274984,-0.329069,-0.320372,-0.479078,-0.243453,-0.157021,-0.134238,2.665244
2,0000f46a3911fa3c0805444483337064,-0.382587,-0.1015,-0.205104,-0.366791,-0.466317,-0.066013,-0.274984,-0.329069,-0.320372,-0.479078,-0.243453,6.368579,-0.134238,-0.3752
3,0000f6ccb0745a6a4b88665a16c9f078,-1.118885,-0.1015,-0.205104,-0.366791,2.144463,-0.066013,-0.274984,-0.329069,-0.320372,-0.479078,-0.243453,-0.157021,-0.134238,-0.3752
4,0004aac84e0df4da2b147fca70cf8255,0.517423,-0.1015,-0.205104,-0.366791,2.144463,-0.066013,-0.274984,-0.329069,-0.320372,-0.479078,-0.243453,-0.157021,-0.134238,-0.3752


# Save df

In [9]:
# Path to the folder where the file will be saved.
# Defaults to the original local folder; set the PROJETO_DATA_DIR environment
# variable to write somewhere else without editing this cell.
folder_path = os.environ.get('PROJETO_DATA_DIR', r"C:\Users\win\Desktop\PROJETO\dados")

# Create the folder when it is missing so the export does not fail on a
# fresh machine (no effect when the folder already exists).
os.makedirs(folder_path, exist_ok=True)

# File name
file_name = "df_final.csv"

# Join to create the full path
full_path = os.path.join(folder_path, file_name)

# Save the DataFrame to CSV format (no index column; UTF-8 with BOM)
df_final.to_csv(full_path, index=False, encoding='utf-8-sig')

print(f"✅ Dataset saved successfully at: {full_path}")

✅ Dataset saved successfully at: C:\Users\win\Desktop\PROJETO\dados\df_final.csv
