In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
import joblib 

In [2]:
df = pd.read_csv("dataset/train.csv")
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session,session_value
0,2025-06-19 10:23:07+00:00,ADD_CART,PROD_011223,CAT_00054,USER_097562,SESSION_158779,90.29
1,2025-06-07 21:34:45+00:00,ADD_CART,PROD_005519,CAT_00144,USER_006535,SESSION_029987,16.39
2,2025-06-21 21:29:09+00:00,ADD_CART,PROD_000577,CAT_00273,USER_047199,SESSION_022134,64.27
3,2025-06-09 09:10:20+00:00,ADD_CART,PROD_019235,CAT_00442,USER_082028,SESSION_161308,41.67
4,2025-06-19 11:13:58+00:00,ADD_CART,PROD_001702,CAT_00025,USER_096574,SESSION_182859,86.11


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     141219 non-null  object 
 1   event_type     141219 non-null  object 
 2   product_id     141219 non-null  object 
 3   category_id    141219 non-null  object 
 4   user_id        141219 non-null  object 
 5   user_session   141219 non-null  object 
 6   session_value  141219 non-null  float64
dtypes: float64(1), object(6)
memory usage: 7.5+ MB


In [4]:
print(df['event_type'].nunique())
print(df["product_id"].nunique())
print(df["category_id"].nunique())
print(df["user_id"].nunique())
print(df["user_session"].nunique())

4
26470
448
51821
70736


In [5]:
df["event_time"] = pd.to_datetime(df["event_time"], utc=True)

df["hour"] = df["event_time"].dt.hour
df["day"] = df["event_time"].dt.day
df["weekday"] = df["event_time"].dt.weekday  # 0 = Pazartesi, 6 = Pazar
df["is_weekend"] = (df["weekday"] >= 5).astype(int) # Hafta sonu kontrolÃ¼

In [6]:
# --- 1) ÃœrÃ¼n ve kategori Ã§eÅŸitliliÄŸi + diÄŸer Ã¶zetler ---

# --- 1.1) Session bazlÄ± Ã¶zet ---
agg_features = df.groupby("user_session").agg(
    user_id=("user_id", "first"),          
    n_events=("event_type", "count"),      
    n_products=("product_id", "nunique"),  
    n_categories=("category_id", "nunique"),
    avg_hour=("hour", "mean"),             
    is_weekend=("is_weekend", "max"),      
    session_value=("session_value", "first"),
).reset_index()

# --- 1.2) YoÄŸunluk & tekrar oranlarÄ± ---
agg_features["product_repeat_rate"] = agg_features["n_events"] / (agg_features["n_products"] + 1)
agg_features["category_repeat_rate"] = agg_features["n_events"] / (agg_features["n_categories"] + 1)

# --- 1.3) Event frekanslarÄ± ---
event_counts = df.pivot_table(
    index="user_session",
    columns="event_type",
    values="event_time",
    aggfunc="count",
    fill_value=0
).reset_index()

agg_features = agg_features.merge(event_counts, on="user_session", how="left")

# --- 1.4) Event oranlarÄ± ---
agg_features["conversion_rate"] = agg_features["BUY"] / (agg_features["n_events"] + 1)
agg_features["add_cart_ratio"] = agg_features["ADD_CART"] / (agg_features["n_events"] + 1)
agg_features["remove_vs_add"] = agg_features["REMOVE_CART"] / (agg_features["ADD_CART"] + 1)
agg_features["view_to_buy_ratio"] = agg_features["BUY"] / (agg_features["VIEW"] + 1)
agg_features["buy_ratio"] = agg_features["BUY"] / (agg_features["VIEW"] + 1)

# --- 1.5) BUY pozisyonu ---

# 1.5.1) Event sÄ±rasÄ± hesapla
df["event_index"] = df.groupby("user_session").cumcount() + 1  # her sessionâ€™da event sÄ±rasÄ±

# 1.5.2) Session bazÄ±nda ilk BUY pozisyonunu bul
buy_pos = df[df["event_type"] == "BUY"].groupby("user_session").agg(
    buy_position=("event_index", "min")
).reset_index()

# 1.5.3) Test setine merge et
agg_features = agg_features.merge(buy_pos, on="user_session", how="left")

# 1.5.4) NormalleÅŸtirilmiÅŸ pozisyon
# NaN olanlar = BUY yok, 1.0 ile dolduruyoruz (en sona kadar hiÃ§ buy gelmedi)
agg_features["buy_position_norm"] = agg_features["buy_position"] / (agg_features["n_events"] + 1)
agg_features["buy_position_norm"] = agg_features["buy_position_norm"].fillna(1.0)

# 1.5.5) Binary flag ekle (BUY var mÄ± yok mu)
agg_features["has_buy"] = agg_features["buy_position"].notna().astype(int)

In [7]:
# --- 2) event_type frekanslarÄ±  ---

agg_features = agg_features.merge(event_counts, on="user_session", how="left")

drop_cols = [c for c in agg_features.columns if '_x' in c or '_y' in c]
agg_features = agg_features.drop(columns=drop_cols)

In [8]:
# --- 3) BirleÅŸtirme ---

train_session = agg_features.merge(event_counts, on="user_session", how="left", sort=False)

In [9]:
# --- 4) event_time sÃ¼tununu datetime'a Ã§evir ---

train_session["hour"] = df["event_time"].dt.hour
train_session["day"] = df["event_time"].dt.day
train_session["weekday"] = df["event_time"].dt.weekday  # 0 = Pazartesi, 6 = Pazar
train_session["is_weekend"] = (df["weekday"] >= 5).astype(int) # Hafta sonu kontrolÃ¼

In [10]:
train_session.head()

Unnamed: 0,user_session,user_id,n_events,n_products,n_categories,avg_hour,is_weekend,session_value,product_repeat_rate,category_repeat_rate,...,buy_position,buy_position_norm,has_buy,ADD_CART,BUY,REMOVE_CART,VIEW,hour,day,weekday
0,SESSION_000000,USER_096031,28,24,20,11.5,0,355.8,1.12,1.333333,...,,1.0,0,20,0,8,0,10,19,3
1,SESSION_000001,USER_023172,6,5,5,4.166667,1,96.6,1.0,1.0,...,6.0,0.857143,1,2,1,2,1,21,7,5
2,SESSION_000004,USER_065806,1,1,1,10.0,1,30.92,0.5,0.5,...,,1.0,0,0,0,0,1,21,21,5
3,SESSION_000005,USER_026492,1,1,1,13.0,0,40.09,0.5,0.5,...,,1.0,0,0,0,0,1,9,9,0
4,SESSION_000012,USER_045859,1,1,1,8.0,0,23.06,0.5,0.5,...,,1.0,0,0,0,0,1,11,19,3


In [11]:
train_session.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70736 entries, 0 to 70735
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_session          70736 non-null  object 
 1   user_id               70736 non-null  object 
 2   n_events              70736 non-null  int64  
 3   n_products            70736 non-null  int64  
 4   n_categories          70736 non-null  int64  
 5   avg_hour              70736 non-null  float64
 6   is_weekend            70736 non-null  int32  
 7   session_value         70736 non-null  float64
 8   product_repeat_rate   70736 non-null  float64
 9   category_repeat_rate  70736 non-null  float64
 10  conversion_rate       70736 non-null  float64
 11  add_cart_ratio        70736 non-null  float64
 12  remove_vs_add         70736 non-null  float64
 13  view_to_buy_ratio     70736 non-null  float64
 14  buy_ratio             70736 non-null  float64
 15  buy_position       

In [12]:
train_session.shape

(70736, 25)

In [13]:
train_session.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70736 entries, 0 to 70735
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_session          70736 non-null  object 
 1   user_id               70736 non-null  object 
 2   n_events              70736 non-null  int64  
 3   n_products            70736 non-null  int64  
 4   n_categories          70736 non-null  int64  
 5   avg_hour              70736 non-null  float64
 6   is_weekend            70736 non-null  int32  
 7   session_value         70736 non-null  float64
 8   product_repeat_rate   70736 non-null  float64
 9   category_repeat_rate  70736 non-null  float64
 10  conversion_rate       70736 non-null  float64
 11  add_cart_ratio        70736 non-null  float64
 12  remove_vs_add         70736 non-null  float64
 13  view_to_buy_ratio     70736 non-null  float64
 14  buy_ratio             70736 non-null  float64
 15  buy_position       

In [20]:
train_session = train_session.drop(columns=['buy_position'])

In [21]:
# Ã–zellikler (hedefi ve kimlik kolonlarÄ±nÄ± Ã§Ä±kartÄ±yoruz)
# TÃ¼m sayÄ±sal kolonlarÄ± seÃ§
feature_cols = train_session.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns.tolist()

# Hedef ve ID kolonlarÄ±nÄ± Ã§Ä±kar, ve NaN olanlarÄ± Ã§Ä±kar
feature_cols = [c for c in feature_cols if c not in ['session_value', 'user_id', 'buy_position']]
feature_cols

['n_events',
 'n_products',
 'n_categories',
 'avg_hour',
 'is_weekend',
 'product_repeat_rate',
 'category_repeat_rate',
 'conversion_rate',
 'add_cart_ratio',
 'remove_vs_add',
 'view_to_buy_ratio',
 'buy_ratio',
 'buy_position_norm',
 'has_buy',
 'ADD_CART',
 'BUY',
 'REMOVE_CART',
 'VIEW',
 'hour',
 'day',
 'weekday']

In [22]:
from sklearn.model_selection import train_test_split

# BÃ¼tÃ¼n session ID'lerini al
sessions = train_session['user_session'].unique()

train_sess, val_sess = train_test_split(sessions, test_size=0.1, random_state=42)

train_data = train_session[train_session['user_session'].isin(train_sess)]
val_data   = train_session[train_session['user_session'].isin(val_sess)]

X_train = train_data[feature_cols]
y_train = train_data['session_value']

X_val   = val_data[feature_cols]
y_val   = val_data['session_value']

In [23]:
X = train_session[feature_cols].values
y = train_session['session_value'].values
groups = train_session['user_session'].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "scaler.save")

['scaler.save']

In [17]:
# --- KFold ayarÄ± ---

gkf = GroupKFold(n_splits=5)

fold = 1
val_scores = []

for train_idx, val_idx in gkf.split(X_scaled, y, groups=groups):
    print(f"\nðŸ”¹ Fold {fold} baÅŸlÄ±yor...")

    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model_ann = Sequential([
        Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
        BatchNormalization(),
        Dropout(0.4),

        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(1, activation='linear')  
    ])

    model_ann.compile(
        optimizer=Adam(),
        loss='mse',
        metrics=['mse']
    )

    history = model_ann.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=64,
    )

    val_loss, val_mse = model_ann.evaluate(X_val, y_val, verbose=0)
    print(f"Fold {fold} MSE: {val_mse:.4f}")
    val_scores.append(val_mse)

    fold += 1


ðŸ”¹ Fold 1 baÅŸlÄ±yor...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 MSE: 450.9627

ðŸ”¹ Fold 2 baÅŸlÄ±yor...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 2

In [18]:
avg_mse = np.mean(val_scores)
print(f"\nâœ… Ortalama MSE: {avg_mse:.4f}")


âœ… Ortalama MSE: 434.7378


In [19]:
model_ann.save("final_model.h5")