In [1]:
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv("dataset/train.csv")
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session,session_value
0,2025-06-19 10:23:07+00:00,ADD_CART,PROD_011223,CAT_00054,USER_097562,SESSION_158779,90.29
1,2025-06-07 21:34:45+00:00,ADD_CART,PROD_005519,CAT_00144,USER_006535,SESSION_029987,16.39
2,2025-06-21 21:29:09+00:00,ADD_CART,PROD_000577,CAT_00273,USER_047199,SESSION_022134,64.27
3,2025-06-09 09:10:20+00:00,ADD_CART,PROD_019235,CAT_00442,USER_082028,SESSION_161308,41.67
4,2025-06-19 11:13:58+00:00,ADD_CART,PROD_001702,CAT_00025,USER_096574,SESSION_182859,86.11


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     141219 non-null  object 
 1   event_type     141219 non-null  object 
 2   product_id     141219 non-null  object 
 3   category_id    141219 non-null  object 
 4   user_id        141219 non-null  object 
 5   user_session   141219 non-null  object 
 6   session_value  141219 non-null  float64
dtypes: float64(1), object(6)
memory usage: 7.5+ MB


In [4]:
print(df['event_type'].nunique())
print(df["product_id"].nunique())
print(df["category_id"].nunique())
print(df["user_id"].nunique())
print(df["user_session"].nunique())

4
26470
448
51821
70736


In [5]:
test_df = pd.read_csv("dataset/test.csv")
test_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session
0,2025-06-28 10:09:58+00:00,ADD_CART,PROD_015000,CAT_00019,USER_109759,SESSION_164059
1,2025-06-25 11:57:50+00:00,ADD_CART,PROD_023887,CAT_00010,USER_010614,SESSION_109583
2,2025-06-30 14:34:20+00:00,ADD_CART,PROD_022673,CAT_00090,USER_041338,SESSION_171382
3,2025-06-30 22:12:18+00:00,ADD_CART,PROD_004664,CAT_00280,USER_015376,SESSION_137110
4,2025-06-26 16:55:18+00:00,ADD_CART,PROD_027815,CAT_00027,USER_054449,SESSION_146503


In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62951 entries, 0 to 62950
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   event_time    62951 non-null  object
 1   event_type    62951 non-null  object
 2   product_id    62951 non-null  object
 3   category_id   62951 non-null  object
 4   user_id       62951 non-null  object
 5   user_session  62951 non-null  object
dtypes: object(6)
memory usage: 2.9+ MB


In [7]:
print(test_df['event_type'].nunique())
print(test_df["product_id"].nunique())
print(test_df["category_id"].nunique())
print(test_df["user_id"].nunique())
print(test_df["user_session"].nunique())

4
17450
433
22665
30789


In [8]:
df["event_time"] = pd.to_datetime(df["event_time"], utc=True)
test_df["event_time"] = pd.to_datetime(test_df["event_time"], utc=True)

df["hour"] = df["event_time"].dt.hour
df["day"] = df["event_time"].dt.day
df["weekday"] = df["event_time"].dt.weekday  # 0 = Pazartesi, 6 = Pazar
df["is_weekend"] = (df["weekday"] >= 5).astype(int) # Hafta sonu kontrolü

test_df["hour"] = test_df["event_time"].dt.hour
test_df["day"] = test_df["event_time"].dt.day
test_df["weekday"] = test_df["event_time"].dt.weekday  # 0 = Pazartesi, 6 = Pazar
test_df["is_weekend"] = (test_df["weekday"] >= 5).astype(int) # Hafta sonu kontrolü

In [9]:
# --- ) event_type frekansları (her session içinde kaç kez geçmiş) ---
event_counts = df.pivot_table(
    index="user_session", # her oturum (session) için satır oluşturur.
    columns="event_type", # her farklı event_type (VIEW, ADD_CART, vs.) ayrı sütun olur.
    values="event_time", # sayım yapılacak değer (zaman damgası, yani her satır bir event).
    aggfunc="count",  # her session için event sayısını sayar.
    fill_value=0 # olmayan event tipleri 0 olarak doldurulur.
).reset_index() # tabloyu normal DataFrame formatına döndürür.

event_counts_test = test_df.pivot_table(
    index="user_session", # her oturum (session) için satır oluşturur.
    columns="event_type", # her farklı event_type (VIEW, ADD_CART, vs.) ayrı sütun olur.
    values="event_time", # sayım yapılacak değer (zaman damgası, yani her satır bir event).
    aggfunc="count",  # her session için event sayısını sayar.
    fill_value=0 # olmayan event tipleri 0 olarak doldurulur.
).reset_index() # tabloyu normal DataFrame formatına döndürür.

In [10]:
# --- 2) Ürün ve kategori çeşitliliği + diğer özetler ---
agg_features = df.groupby("user_session").agg(
    user_id=("user_id", "first"),          # session'daki kullanıcı
    n_events=("event_type", "count"),      # toplam event sayısı
    n_products=("product_id", "nunique"),  # benzersiz ürün sayısı
    n_categories=("category_id", "nunique"),  # benzersiz kategori sayısı
    avg_hour=("hour", "mean"),             # ortalama saat
    is_weekend=("is_weekend", "max"),      # hafta sonu mu
    session_value=("session_value", "first")  # hedef değişken
).reset_index()

agg_features_test = test_df.groupby("user_session").agg(
    user_id=("user_id", "first"),          # session'daki kullanıcı
    n_events=("event_type", "count"),      # toplam event sayısı
    n_products=("product_id", "nunique"),  # benzersiz ürün sayısı
    n_categories=("category_id", "nunique"),  # benzersiz kategori sayısı
    avg_hour=("hour", "mean"),             # ortalama işlem saati
    is_weekend=("is_weekend", "max")       # hafta sonu mu
).reset_index()

In [11]:
# --- 3) Birleştirme ---
train_session = agg_features.merge(event_counts, on="user_session", how="left", sort=False)
test_session = agg_features_test.merge(event_counts_test, on="user_session", how="left", sort=False)

In [12]:
# --- 4) event_time sütununu datetime'a çevir ---
train_session["hour"] = df["event_time"].dt.hour
train_session["day"] = df["event_time"].dt.day
train_session["weekday"] = df["event_time"].dt.weekday  # 0 = Pazartesi, 6 = Pazar
train_session["is_weekend"] = (df["weekday"] >= 5).astype(int) # Hafta sonu kontrolü

test_session["hour"] = test_df["event_time"].dt.hour
test_session["day"] = test_df["event_time"].dt.day
test_session["weekday"] = test_df["event_time"].dt.weekday  # 0 = Pazartesi, 6 = Pazar
test_session["is_weekend"] = (test_df["event_time"].dt.weekday >= 5).astype(int) # Hafta sonu kontrolü

In [13]:
# Train için
session_time = df.groupby("user_session")["event_time"].agg(lambda x: (x.max() - x.min()).total_seconds() / 3600).reset_index().rename(columns={"event_time": "session_duration"})
train_session = train_session.merge(session_time, on="user_session", how="left")

# Test için
session_time_test = test_df.groupby("user_session")["event_time"].agg(lambda x: (x.max() - x.min()).total_seconds() / 3600).reset_index().rename(columns={"event_time": "session_duration"})
test_session = test_session.merge(session_time_test, on="user_session", how="left")

In [14]:
train_session.head()

Unnamed: 0,user_session,user_id,n_events,n_products,n_categories,avg_hour,is_weekend,session_value,ADD_CART,BUY,REMOVE_CART,VIEW,hour,day,weekday,session_duration
0,SESSION_000000,USER_096031,28,24,20,11.5,0,355.8,20,0,8,0,10,19,3,437.046389
1,SESSION_000001,USER_023172,6,5,5,4.166667,1,96.6,2,1,2,1,21,7,5,2.171667
2,SESSION_000004,USER_065806,1,1,1,10.0,1,30.92,0,0,0,1,21,21,5,0.0
3,SESSION_000005,USER_026492,1,1,1,13.0,0,40.09,0,0,0,1,9,9,0,0.0
4,SESSION_000012,USER_045859,1,1,1,8.0,0,23.06,0,0,0,1,11,19,3,0.0


In [15]:
test_session.head()

Unnamed: 0,user_session,user_id,n_events,n_products,n_categories,avg_hour,is_weekend,ADD_CART,BUY,REMOVE_CART,VIEW,hour,day,weekday,session_duration
0,SESSION_000000,USER_027389,28,27,19,12.035714,1,23,0,5,0,10,28,5,201.055278
1,SESSION_000013,USER_096685,2,2,2,19.0,0,0,0,0,2,11,25,2,0.062778
2,SESSION_000022,USER_004363,1,1,1,15.0,0,1,0,0,0,14,30,0,0.0
3,SESSION_000024,USER_041338,1,1,1,15.0,0,0,0,0,1,22,30,0,0.0
4,SESSION_000025,USER_052581,3,3,2,12.333333,0,1,0,2,0,16,26,3,0.501389


In [16]:
test_session.shape

(30789, 15)

In [17]:
from sklearn.model_selection import train_test_split

# Özellikler (hedefi ve kimlik kolonlarını çıkartıyoruz)
feature_cols = ['n_events', 'n_products', 'n_categories', 'avg_hour', 'hour', 'day', 'weekday', 'is_weekend', 'session_duration',
                'ADD_CART', 'BUY', 'REMOVE_CART', 'VIEW']

X = train_session[feature_cols]
y = train_session['session_value']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [18]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

model = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=25,
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)

print(f"Validation MSE: {mse:.4f}")
print(f"Validation R^2: {r2_score(y_val, y_pred):.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 739
[LightGBM] [Info] Number of data points in the train set: 63662, number of used features: 13
[LightGBM] [Info] Start training from score 42.202568
Validation MSE: 331.0521
Validation R^2: 0.8372


In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)

In [20]:
model_ann = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(1, activation='linear')  
])

model_ann.compile(
    optimizer=Adam(),
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae']
)

In [21]:
history = model_ann.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=256,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [25]:
y_pred = model_ann.predict(X_val_scaled)

from sklearn.metrics import r2_score
import numpy as np

rmse = np.sqrt(np.mean((y_val - y_pred.flatten())**2))
r2   = r2_score(y_val, y_pred)

print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation R^2: {r2:.4f}")


Validation RMSE: 16.6259
Validation R^2: 0.8641


In [26]:
X_test = test_session[feature_cols]

test_pred = model.predict(X_test)

submission_df = pd.DataFrame({
    'user_session': test_session['user_session'],
    'session_value': test_pred
})

In [28]:
X_test_scaled = scaler.transform(X_test)

test_pred_ann = model_ann.predict(X_test_scaled)

submission_df_ann = pd.DataFrame({
    'user_session': test_session['user_session'],
    'session_value': test_pred_ann.flatten()
})



In [29]:
original_order = test_df[["user_session"]].drop_duplicates()
submission_df = original_order.merge(submission_df, on="user_session", how="left")

submission_df.to_csv("sample_submission.csv", index=False)
print("sample_submission.csv dosyası kaydedildi ✅")

sample_submission.csv dosyası kaydedildi ✅


In [30]:
original_order = test_df[["user_session"]].drop_duplicates()
submission_df = original_order.merge(submission_df_ann, on="user_session", how="left")

submission_df.to_csv("sample_submission_ann.csv", index=False)
print("sample_submission_ann.csv dosyası kaydedildi ✅")

sample_submission_ann.csv dosyası kaydedildi ✅
