# Feature Engineering

In [None]:
import pandas as pd
import numpy as np

# Load the source table from a path relative to the working directory.
df_full = pd.read_csv('df_full.csv')

# Parse the purchase timestamp so later date arithmetic operates on datetimes.
# The per-customer recency table (last_purchase) is built once, in the
# "# Churn" section further down, right before it is first used.
df_full['order_purchase_timestamp'] = pd.to_datetime(df_full['order_purchase_timestamp'])

# Order frequency per customer: number of distinct order_id values.
order_count = (
    df_full
    .groupby('customer_unique_id')['order_id']
    .nunique()
    .rename('order_count')
    .reset_index()
)

# Total payment per customer: sum of payment_value over all of the customer's rows.
# NOTE(review): if df_full holds more than one row per payment (e.g. one row
# per order item), this sum counts the same payment repeatedly -- confirm the
# grain of df_full.
total_payment = (
    df_full
    .groupby('customer_unique_id')['payment_value']
    .sum()
    .reset_index(name='total_payment')
)

# Average review score per customer.
avg_review = (
    df_full
    .groupby('customer_unique_id')['review_score']
    .mean()
    .reset_index(name='avg_review_score')
)

# Average delivery time per customer, in whole days from approval to delivery.
# Unparseable dates are coerced to NaT, which makes delivery_days missing for
# those rows instead of raising.
for date_col in ('order_delivered_customer_date', 'order_approved_at'):
    df_full[date_col] = pd.to_datetime(df_full[date_col], errors='coerce')

approval_to_delivery = df_full['order_delivered_customer_date'] - df_full['order_approved_at']
df_full['delivery_days'] = approval_to_delivery.dt.days

avg_delivery = (
    df_full
    .groupby('customer_unique_id')['delivery_days']
    .mean()
    .rename('avg_delivery_days')
    .reset_index()
)

# Churn
# A customer is labelled churned (1) when their most recent purchase lies more
# than CHURN_THRESHOLD_DAYS whole days before the latest purchase timestamp in
# the entire dataset; otherwise the label is 0.
CHURN_THRESHOLD_DAYS = 180

df_full['order_purchase_timestamp'] = pd.to_datetime(df_full['order_purchase_timestamp'])

# Reference date for recency: the newest purchase seen across all customers.
snapshot_date = df_full['order_purchase_timestamp'].max()

last_purchase = (
    df_full
    .groupby('customer_unique_id')['order_purchase_timestamp']
    .max()
    .reset_index()
)
last_purchase['days_since_last_purchase'] = (
    snapshot_date - last_purchase['order_purchase_timestamp']
).dt.days
last_purchase['churn'] = (
    last_purchase['days_since_last_purchase'] > CHURN_THRESHOLD_DAYS
).astype(int)

# Combine all per-customer features into a single table keyed by
# customer_unique_id, starting from order_count and left-joining the rest.
# NOTE(review): churn is computed directly from days_since_last_purchase, so a
# model trained on this table should not receive days_since_last_purchase as
# an input feature.
churn_columns = last_purchase[['customer_unique_id', 'days_since_last_purchase', 'churn']]

features = order_count
for feature_table in (total_payment, avg_review, avg_delivery, churn_columns):
    features = features.merge(feature_table, on='customer_unique_id', how='left')
                      
# Clean the data: drop customers with any missing feature value, then discard
# anomalies whose average delivery time is negative.
features = (
    features
    .dropna()
    .loc[lambda frame: frame['avg_delivery_days'] >= 0]
)

# Persist the model-ready dataset (without the index column), then print a
# confirmation message followed by a preview of the first rows.
output_path = '../data/processed/churn_features.csv'
features.to_csv(output_path, index=False)
print('Dataset buat model disimpan', features.head(), sep='\n')

Dataset buat model disimpan
                 customer_unique_id  order_count  total_payment  \
0  0000366f3b9a7992bf8c76cfdf3221e2            1         141.90   
1  0000b849f77a49e4a4ce2b2a4ca5be3f            1          27.19   
2  0000f46a3911fa3c0805444483337064            1          86.22   
3  0000f6ccb0745a6a4b88665a16c9f078            1          43.62   
4  0004aac84e0df4da2b147fca70cf8255            1         196.89   

   avg_review_score  avg_delivery_days  days_since_last_purchase  churn  
0               5.0                6.0                       160      0  
1               4.0                2.0                       163      0  
2               3.0               25.0                       585      1  
3               4.0               20.0                       369      1  
4               5.0               13.0                       336      1  
