In [1]:
import pandas as pd 
import numpy as np 

In [2]:
import pandas as pd
import numpy as np

def read_csv_32bit(filename):
    """
    Read a CSV and downcast its 64-bit numeric columns to 32-bit.

    - float64 columns are cast to float32 (precision is reduced to roughly
      7 significant digits).
    - int64 columns are cast to int32 only when every value fits in the int32
      range; columns with larger values are left as int64 so they are never
      silently wrapped around.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with downcast dtypes. A one-line memory summary is
        printed as a side effect.
    """
    # 1. Read the file (pandas defaults to 64-bit numeric dtypes)
    df = pd.read_csv(filename)

    # 2. Every float64 column is downcast
    target_dtypes = {
        col: "float32" for col in df.select_dtypes(include=["float64"]).columns
    }

    # 3. Integer columns are downcast only when the values actually fit in int32
    int32_bounds = np.iinfo(np.int32)
    for col in df.select_dtypes(include=["int64"]).columns:
        values = df[col]
        fits_int32 = values.empty or (
            values.min() >= int32_bounds.min and values.max() <= int32_bounds.max
        )
        if fits_int32:
            target_dtypes[col] = "int32"

    if target_dtypes:
        df = df.astype(target_dtypes)

    print(f"Loaded {filename} | Memory: {df.memory_usage().sum() / 1024**2:.2f} MB")
    return df
# --- Load every input table through read_csv_32bit (prints one memory line per file) ---
# NOTE(review): the dataset directory is spelled "aftertrainig"; presumably this matches
# the attached Kaggle dataset slug — confirm before "fixing" the spelling.
print("⏳ Loading dataframes with optimized memory...")
train_truth = read_csv_32bit("/kaggle/input/aftertrainig/train_truth.csv")
# One row per (user_id, product_id) candidate; the feature blocks below are merged onto it.
main_training_data = read_csv_32bit("/kaggle/input/aftertrainig/main_training_data.csv")
user_product_features = read_csv_32bit("/kaggle/input/aftertrainig/user_product_features.csv")
product_features = read_csv_32bit("/kaggle/input/aftertrainig/product_features.csv")
user_features = read_csv_32bit("/kaggle/input/aftertrainig/user_features.csv")
# df_od / df_opp / products_enriched are joined into df_od_opp in the next cell.
df_od = read_csv_32bit("/kaggle/input/aftertrainig/orders_eda.csv")
df_opp = read_csv_32bit("/kaggle/input/aftertrainig/order_products_prior_eda.csv")
df_opt = read_csv_32bit("/kaggle/input/aftertrainig/order_products_train_eda.csv")
products_enriched= read_csv_32bit("/kaggle/input/aftertrainig/products_enriched.csv")
print("✅ All files loaded and optimized!")

⏳ Loading dataframes with optimized memory...
Loaded /kaggle/input/aftertrainig/train_truth.csv | Memory: 15.85 MB
Loaded /kaggle/input/aftertrainig/main_training_data.csv | Memory: 1015.32 MB
Loaded /kaggle/input/aftertrainig/user_product_features.csv | Memory: 456.89 MB
Loaded /kaggle/input/aftertrainig/product_features.csv | Memory: 1.14 MB
Loaded /kaggle/input/aftertrainig/user_features.csv | Memory: 4.72 MB
Loaded /kaggle/input/aftertrainig/orders_eda.csv | Memory: 104.40 MB
Loaded /kaggle/input/aftertrainig/order_products_prior_eda.csv | Memory: 494.91 MB
Loaded /kaggle/input/aftertrainig/order_products_train_eda.csv | Memory: 21.13 MB
Loaded /kaggle/input/aftertrainig/products_enriched.csv | Memory: 1.71 MB
✅ All files loaded and optimized!


In [3]:
# Attach purchased products to every order row, then the per-product columns
# from products_enriched. Both joins are left joins, so orders without a match
# are kept with NaN product fields.
df_od_opp = pd.merge(
    pd.merge(df_od, df_opp, how="left", on="order_id"),
    products_enriched,
    how="left",
    on="product_id",
)

# Restrict to rows whose order belongs to the "prior" evaluation set.
df_prior = df_od_opp.loc[df_od_opp["eval_set"].eq("prior")]


In [4]:
# Share of missing values per key column in the prior rows (0.0 means none missing).
df_prior[["product_id", "aisle_id", "department_id"]].isna().mean()


product_id       0.0
aisle_id         0.0
department_id    0.0
dtype: float64

FEATURE BLOCK 1: RECENCY-BASED USER–PRODUCT FEATURES

What problem this solves

Without recency features, buying a product 2 years ago carries the same weight as buying it yesterday.

Reality:

recent behavior matters more

Compute last order number per user

what is “now” for each user (relative to history)

In [5]:
# Highest prior order_number per user: the reference point for "now".
user_last_order = (
    df_prior.groupby("user_id")["order_number"]
    .max()
    .rename("user_last_order")
    .reset_index()
)


Compute last time user bought each product

In [6]:
# Highest prior order_number in which each user bought each product.
up_last_order = (
    df_prior.groupby(["user_id", "product_id"])["order_number"]
    .max()
    .rename("up_last_order")
    .reset_index()
)


Merge & compute recency

In [7]:
# Put each pair's last purchase next to the user's last order.
up_recency = up_last_order.merge(user_last_order, how="left", on="user_id")

# Orders elapsed since the product was last bought (0 = bought in the latest prior order).
up_recency["up_recency"] = up_recency["user_last_order"].sub(
    up_recency["up_last_order"]
)


Exponential decay  

In [8]:
import numpy as np

alpha = 0.5  # decay strength (tunable later)

# score = exp(-alpha * recency): 1.0 for the latest order, shrinking toward 0 with age.
decay_exponent = up_recency["up_recency"].mul(-alpha)
up_recency["up_recency_score"] = np.exp(decay_exponent)


In [9]:
# Keep only the join keys and the two recency features for merging.
recency_columns = ["user_id", "product_id", "up_recency", "up_recency_score"]
up_recency_features = up_recency.loc[:, recency_columns]


In [10]:
# Left join keeps every training row; pairs without a recency row get NaN.
main_training_data = pd.merge(
    main_training_data,
    up_recency_features,
    how="left",
    on=["user_id", "product_id"],
)


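Some (user, product) pairs may not appear in the prior-order history (rare); after the left merge their recency columns are NaN and need a fill value.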

In [11]:
# Pairs with no recency row are treated as the stalest case seen in the data.
max_recency = main_training_data["up_recency"].max()

main_training_data["up_recency"] = (
    main_training_data["up_recency"].fillna(max_recency)
)

# The score lives in (0, 1], so it must be filled with the decay value that
# corresponds to max_recency, not with the raw recency count itself.
main_training_data["up_recency_score"] = (
    main_training_data["up_recency_score"].fillna(np.exp(-alpha * max_recency))
)


In [12]:
# Inspect column names and dtypes after the recency merge and fill.
main_training_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13307953 entries, 0 to 13307952
Data columns (total 22 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   user_id                       int32  
 1   product_id                    float32
 2   user_total_orders             int32  
 3   user_total_products           int32  
 4   user_avg_basket_size          float32
 5   user_reorder_ratio            float32
 6   user_avg_days_between_orders  float32
 7   prod_total_purchases          int32  
 8   prod_unique_users             int32  
 9   prod_reorder_rate             float32
 10  prod_avg_cart_position        float32
 11  prod_order_count              int32  
 12  up_times_bought               int32  
 13  up_first_order_number         int32  
 14  up_last_order_number          int32  
 15  up_order_rate                 float32
 16  up_reorder_ratio              float32
 17  up_avg_cart_position          float32
 18  up_orders_since_last

BLOCK 2 — LAST-N ORDERS FEATURES (VERY STRONG)

People repeat recent habits

“Has the user bought this product recently?”

up_last_3_orders

up_last_5_orders

Binary but powerful.

In [13]:
# Each user's last prior order number (same definition as in the recency block;
# recomputed so this cell does not depend on earlier cell state).
user_last_order = (
    df_prior
    .groupby("user_id")["order_number"]
    .max()
    .reset_index(name="user_last_order")
)

df_tmp = df_prior.merge(user_last_order, on="user_id", how="left")

# 0 = the user's latest prior order, 1 = the one before it, ...
df_tmp["orders_ago"] = (
    df_tmp["user_last_order"] - df_tmp["order_number"]
)

# Flags for the last N orders. Because orders_ago starts at 0, the last N
# orders are orders_ago in 0..N-1, i.e. strictly less than N. (Using <= N
# would cover N+1 orders.)
df_tmp["in_last_3"] = (df_tmp["orders_ago"] < 3).astype(int)
df_tmp["in_last_5"] = (df_tmp["orders_ago"] < 5).astype(int)

# A pair is flagged 1 if the product appears in at least one of those orders.
up_last_n = (
    df_tmp
    .groupby(["user_id", "product_id"])
    .agg(
        up_last_3_orders=("in_last_3", "max"),
        up_last_5_orders=("in_last_5", "max"),
    )
    .reset_index()
)


BLOCK 3 — NORMALIZED RATIO FEATURES (CRITICAL)

Relative importance matters.

We answer:

“Is this product special for this user?”

up_user_order_ratio

up_product_popularity_ratio

In [14]:
# Orders per user, taken as the highest prior order_number.
user_total_orders = (
    df_prior.groupby("user_id")["order_number"]
    .max()
    .rename("user_total_orders")
    .reset_index()
)

# Number of prior rows per product.
prod_total_purchases = (
    df_prior.groupby("product_id")
    .size()
    .rename("prod_total_purchases")
    .reset_index()
)

# Number of prior rows per (user, product) pair.
up_times = (
    df_prior.groupby(["user_id", "product_id"])
    .size()
    .rename("up_times_bought")
    .reset_index()
)

# Bring both denominators next to the pair counts.
up_ratios = up_times.merge(user_total_orders, how="left", on="user_id")
up_ratios = up_ratios.merge(prod_total_purchases, how="left", on="product_id")

# Share of the user's orders that contained this product.
up_ratios["up_user_order_ratio"] = up_ratios["up_times_bought"].div(
    up_ratios["user_total_orders"]
)

# Share of the product's total purchases that came from this user.
up_ratios["up_product_popularity_ratio"] = up_ratios["up_times_bought"].div(
    up_ratios["prod_total_purchases"]
)

# Only the keys and the two ratios are kept for merging.
ratio_columns = [
    "user_id",
    "product_id",
    "up_user_order_ratio",
    "up_product_popularity_ratio",
]
up_ratios = up_ratios[ratio_columns]


BLOCK 4 — USER–AISLE / DEPARTMENT PREFERENCES
Even if the user hasn’t bought this product recently,
they might love its aisle or department.

user_aisle_reorder_ratio

user_department_reorder_ratio

In [15]:
# NOTE(review): neither frame built here is merged into main_training_data in
# the cells visible in this notebook — confirm whether that is intentional.

# Per (user, aisle): distinct orders touching the aisle and mean reordered flag.
user_aisle = (
    df_prior.groupby(["user_id", "aisle_id"])
    .agg(
        user_aisle_orders=pd.NamedAgg(column="order_id", aggfunc="nunique"),
        user_aisle_reorder_ratio=pd.NamedAgg(column="reordered", aggfunc="mean"),
    )
    .reset_index()
)

# Per (user, department): the same two statistics at department level.
user_department = (
    df_prior.groupby(["user_id", "department_id"])
    .agg(
        user_department_orders=pd.NamedAgg(column="order_id", aggfunc="nunique"),
        user_department_reorder_ratio=pd.NamedAgg(column="reordered", aggfunc="mean"),
    )
    .reset_index()
)



BLOCK 5 — CONSISTENCY / GAP FEATURES

Habitual products have regular gaps.

up_avg_order_gap

In [16]:
pair_keys = ["user_id", "product_id"]

# Order each pair's purchases chronologically by order_number.
df_sorted = df_prior.sort_values(pair_keys + ["order_number"])

# Previous order_number in which the same user bought the same product
# (NaN for the pair's first purchase).
df_sorted["prev_order"] = df_sorted.groupby(pair_keys)["order_number"].shift(1)

# Orders elapsed between consecutive purchases of the pair.
df_sorted["order_gap"] = df_sorted["order_number"].sub(df_sorted["prev_order"])

# Mean gap per pair; the mean skips NaN, so pairs bought only once stay NaN.
up_gap = (
    df_sorted.groupby(pair_keys)["order_gap"]
    .mean()
    .rename("up_avg_order_gap")
    .reset_index()
)


In [17]:
# Left-join every user-product feature frame onto the training rows, in order.
# NOTE(review): up_recency_features was already merged in an earlier cell, so
# merging it again yields up_recency_x/_y and up_recency_score_x/_y columns;
# a later cell keeps the _y versions and drops the rest.
feature_join_keys = ["user_id", "product_id"]
for feature_frame in (up_last_n, up_ratios, up_recency_features, up_gap):
    main_training_data = main_training_data.merge(
        feature_frame, how="left", on=feature_join_keys
    )


In [18]:
# Replace every remaining NaN in the frame with 0.
# NOTE(review): for the re-merged recency column 0 reads as "bought in the latest
# order", and for up_avg_order_gap it reads as "no gap" — confirm 0 is the intended fill.
main_training_data = main_training_data.fillna(0)


In [19]:
# Inspect column names and dtypes after merging the feature blocks and filling NaNs.
main_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13307953 entries, 0 to 13307952
Data columns (total 29 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   user_id                       int32  
 1   product_id                    float32
 2   user_total_orders             int32  
 3   user_total_products           int32  
 4   user_avg_basket_size          float32
 5   user_reorder_ratio            float32
 6   user_avg_days_between_orders  float32
 7   prod_total_purchases          int32  
 8   prod_unique_users             int32  
 9   prod_reorder_rate             float32
 10  prod_avg_cart_position        float32
 11  prod_order_count              int32  
 12  up_times_bought               int32  
 13  up_first_order_number         int32  
 14  up_last_order_number          int32  
 15  up_order_rate                 float32
 16  up_reorder_ratio              float32
 17  up_avg_cart_position          float32
 18  up_orders_since_last

In [20]:
import numpy as np

def optimize_dtypes(df):
    """
    Downcast 64-bit numeric columns of ``df`` to 32-bit, in place.

    - float64 columns become float32.
    - int64 columns become int32 only when all of their values fit in the
      int32 range; otherwise the column is left as int64 so large values are
      never silently wrapped around.
    - Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. Its columns are reassigned on the object passed in.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    int32_bounds = np.iinfo(np.int32)

    for col in df.columns:
        col_type = df[col].dtype

        # Float64 -> Float32
        if col_type == "float64":
            df[col] = df[col].astype("float32")

        # Int64 -> Int32, guarded by an explicit range check
        elif col_type == "int64":
            values = df[col]
            fits_int32 = values.empty or (
                values.min() >= int32_bounds.min
                and values.max() <= int32_bounds.max
            )
            if fits_int32:
                df[col] = values.astype("int32")

    return df


In [21]:
# Downcast any float64/int64 columns left by the merges and fillna back to 32-bit.
main_training_data = optimize_dtypes(main_training_data)


In [22]:
# Re-inspect dtypes after downcasting; memory_usage="deep" asks pandas for an exact memory figure.
main_training_data.info(memory_usage="deep")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13307953 entries, 0 to 13307952
Data columns (total 29 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   user_id                       int32  
 1   product_id                    float32
 2   user_total_orders             int32  
 3   user_total_products           int32  
 4   user_avg_basket_size          float32
 5   user_reorder_ratio            float32
 6   user_avg_days_between_orders  float32
 7   prod_total_purchases          int32  
 8   prod_unique_users             int32  
 9   prod_reorder_rate             float32
 10  prod_avg_cart_position        float32
 11  prod_order_count              int32  
 12  up_times_bought               int32  
 13  up_first_order_number         int32  
 14  up_last_order_number          int32  
 15  up_order_rate                 float32
 16  up_reorder_ratio              float32
 17  up_avg_cart_position          float32
 18  up_orders_since_last

In [23]:
# The recency features were merged twice, leaving _x/_y suffixed duplicates.
# Promote the _y versions to the canonical names. The presence check makes the
# cell safe to re-run after the suffixed columns have already been dropped.
for feature in ("up_recency", "up_recency_score"):
    duplicate = f"{feature}_y"
    if duplicate in main_training_data.columns:
        main_training_data[feature] = main_training_data[duplicate]

# Remove whichever suffixed leftovers are still present.
main_training_data = main_training_data.drop(
    columns=[
        "up_recency_x",
        "up_recency_y",
        "up_recency_score_x",
        "up_recency_score_y",
    ],
    errors="ignore"
)


In [24]:
# Downcast again after the column clean-up; changes nothing if no float64/int64 columns remain.
main_training_data = optimize_dtypes(main_training_data)


In [25]:
# Final safety fill: replace any NaN still present with 0 before saving.
main_training_data = main_training_data.fillna(0)


In [26]:
# Final downcast pass so the saved frame holds no float64/int64 columns that fit in 32 bits.
main_training_data = optimize_dtypes(main_training_data)


In [27]:
# Persist the enriched training table to the Kaggle working directory.
save_path = "/kaggle/working/main_training_data_optimized.csv"
main_training_data.to_csv(save_path, index=False)
print("Saved to:", save_path)

# Report the in-memory footprint of the saved frame in GiB.
memory_gb = main_training_data.memory_usage().sum() / 1024**3
print("Memory (GB):", memory_gb)


Saved to: /kaggle/working/main_training_data_optimized.csv
Memory (GB): 1.3385518044233322
