# feature engineering

In [6]:
import pandas as pd
import numpy as np
from pathlib import Path


In [7]:
# Пути к исходным данным
data_path = Path('data/raw')
orders = pd.read_csv(data_path / 'olist_orders_dataset.csv', parse_dates=['order_purchase_timestamp'])
order_items = pd.read_csv(data_path / 'olist_order_items_dataset.csv')
customers = pd.read_csv(data_path / 'olist_customers_dataset.csv')


In [8]:
# Объединение заказов с товарами
orders_full = orders.merge(order_items, on='order_id')
orders_full = orders_full.merge(customers[['customer_id', 'customer_unique_id']], on='customer_id')

# Добавляем дату анализа (последняя дата + 1)
analysis_date = orders_full['order_purchase_timestamp'].max() + pd.Timedelta(days=1)


In [9]:
# Агрегация по customer_unique_id
rfm = orders_full.groupby('customer_unique_id').agg({
    'order_purchase_timestamp': [
        lambda x: (analysis_date - x.max()).days,  # Recency
        'count',                                   # Frequency
        lambda x: x.sort_values().diff().dt.days.mean()  # avg_days_between_orders
    ],
    'price': 'sum'                                 # Monetary
})

rfm.columns = ['recency', 'frequency', 'avg_days_between_orders', 'monetary']
rfm = rfm.reset_index()


In [10]:
rfm

Unnamed: 0,customer_unique_id,recency,frequency,avg_days_between_orders,monetary
0,0000366f3b9a7992bf8c76cfdf3221e2,116,1,,129.90
1,0000b849f77a49e4a4ce2b2a4ca5be3f,119,1,,18.90
2,0000f46a3911fa3c0805444483337064,542,1,,69.00
3,0000f6ccb0745a6a4b88665a16c9f078,326,1,,25.99
4,0004aac84e0df4da2b147fca70cf8255,293,1,,180.00
...,...,...,...,...,...
95415,fffcf5a5ff07b0908bd4e2dbc735a684,452,2,0.0,1570.00
95416,fffea47cd6d3cc0a88bd621562a9d061,267,1,,64.89
95417,ffff371b4d645b6ecea244b27531430a,573,1,,89.90
95418,ffff5962728ec6157033ef9805bacc48,124,1,,115.00


In [11]:
# Средний чек
rfm['avg_order_value'] = rfm['monetary'] / rfm['frequency']


In [12]:
# LTV будем считать как monetary за весь период
rfm['ltv'] = rfm['monetary']

# Churn: если пользователь не покупал за последние 90 дней, считаем ушедшим
rfm['churn'] = (rfm['recency'] > 90).astype(int)


In [14]:
# Сохраняем
rfm.to_csv("data/processed/features.csv", index=False)
print("Features saved to 'data/processed/features.csv'")


Features saved to 'data/processed/features.csv'
