In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import joblib

# 1. upload 

df = pd.read_csv('C:/Users/mehme/OneDrive/Desktop/w1991607_MehmetSezer_E.C/ecom/store/data/2019-Nov.csv', nrows=1_000_000, low_memory=False)
df.dropna(inplace=True)
df['event_time'] = pd.to_datetime(df['event_time'])

# 2. just sale
purchase_df = df[df['event_type'] == 'purchase']

# 3. RFM
snapshot_date = purchase_df['event_time'].max() + pd.Timedelta(days=1)

rfm = purchase_df.groupby('user_id').agg({
    'event_time': lambda x: (snapshot_date - x.max()).days,  # Recency
    'user_id': 'count',  # Frequency
    'price': 'sum'       # Monetary
}).rename(columns={
    'event_time': 'recency',
    'user_id': 'frequency',
    'price': 'monetary'
})

# 4. scale
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm)

# 5. KMeans model segment
kmeans = KMeans(n_clusters=4, random_state=42)
rfm['segment'] = kmeans.fit_predict(rfm_scaled)

# 6. saving the models
joblib.dump(kmeans, 'models/rfm_kmeans.pkl')
joblib.dump(scaler, 'models/rfm_scaler.pkl')

# 7. output
print(rfm.groupby('segment').mean())
print("RFM segmentation model successfully trained and saved.")


         recency  frequency     monetary
segment                                 
0            1.0   1.075013   244.607586
1            1.0   5.153846  2690.118907
2            1.0  12.736842  9316.373158
3            1.0   1.717402  1100.914937
RFM segmentation model successfully trained and saved.
