In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
import networkx as nx

In [None]:
# -------- 1) Paths --------
# Root folder holding the six CSV exports.
DATA_DIR = Path("/content/")  # <- change this to your actual data directory

# Logical name -> CSV path. Keys are "<entity>_<split>" and file names are
# "<stem>_<split>ing.csv" (e.g. "customer_train" -> customer_nodes_training.csv).
# The split is the outer loop so the three training entries come first.
files = {
    f"{entity}_{split}": DATA_DIR / f"{stem}_{split}ing.csv"
    for split in ("train", "test")
    for entity, stem in (
        ("customer", "customer_nodes"),
        ("product", "product_nodes"),
        ("events", "event_table"),
    )
}

In [None]:
# -------- 2) Load CSVs --------
# Read the three training tables in one pass: customer nodes, product nodes,
# and the event table, in that order.
df_customers, df_products, df_events = (
    pd.read_csv(files[table_key])
    for table_key in ("customer_train", "product_train", "events_train")
)

# Quick sanity check on the (rows, columns) of each table.
print("Shapes (customers, products, events):", df_customers.shape, df_products.shape, df_events.shape)

Shapes (customers, products, events): (443343, 30) (44165, 44) (1369133, 3)


In [None]:
# -------- 3) Clean column names --------
def clean_col_names(df):
    """Normalise the column labels of ``df`` in place and return the same frame.

    Each label has surrounding whitespace stripped, is lower-cased, and has
    every space replaced by an underscore. The labels must be strings, since
    the ``.str`` accessor is used.
    """
    stripped = df.columns.str.strip()
    lowered = stripped.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    return df

# Apply the same column-name normalisation to all three tables.
df_customers, df_products, df_events = (
    clean_col_names(df_customers),
    clean_col_names(df_products),
    clean_col_names(df_events),
)

In [None]:
# -------- 4) Identify label column --------
# 'isreturned' wins outright when present; otherwise fall back to the first
# event column (in column order) whose normalised name is a known alias.
if 'isreturned' in df_events.columns:
    label_col = 'isreturned'
else:
    possible_label_names = ["return", "is_return", "label", "returned"]
    label_col = next(
        (name for name in df_events.columns if name.strip().lower() in possible_label_names),
        None,
    )

if label_col is None:
    raise ValueError("No label column found; check event table column names.")

print(f"Label column: {label_col}")
print("Label distribution (train):")
# Absolute counts first, then proportions.
for as_fraction in (False, True):
    print(df_events[label_col].value_counts(normalize=as_fraction))

Label column: isreturned
Label distribution (train):
isreturned
1    757227
0    611906
Name: count, dtype: int64
isreturned
1    0.55307
0    0.44693
Name: proportion, dtype: float64


In [None]:
# -------- 5) Suggest ID columns --------
def suggest_id_columns(df, label):
    """Print the columns of ``df`` and list those that look like ID columns.

    A column is a customer candidate when its lower-cased name contains
    "cust" or "customer", and a product candidate when it contains "prod",
    "product" or "variant". This is plain substring matching, so feature
    columns whose names contain those fragments are returned as well.

    Parameters
    ----------
    df : DataFrame whose (string) column names are inspected.
    label : human-readable table name used only in the printed header.

    Returns
    -------
    (cust_candidates, prod_candidates) : two lists of column names, each in
    the frame's column order.
    """
    customer_keywords = ("cust", "customer")
    product_keywords = ("prod", "product", "variant")

    def _mentions(column, keywords):
        lowered = column.lower()
        return any(keyword in lowered for keyword in keywords)

    print(f"\n🔎 Inspecting columns in {label} DataFrame:")
    cols = df.columns.tolist()
    print("Columns:", cols)

    cust_candidates = [name for name in cols if _mentions(name, customer_keywords)]
    prod_candidates = [name for name in cols if _mentions(name, product_keywords)]
    print("Suggested customer ID columns:", cust_candidates)
    print("Suggested product ID columns:", prod_candidates)

    return cust_candidates, prod_candidates

# Inspect each table; the call order determines the order of the printed reports.
cust_event_candidates, prod_event_candidates = suggest_id_columns(df_events, "Events")
cust_node_candidates, _ = suggest_id_columns(df_customers, "Customers")
_, prod_node_candidates = suggest_id_columns(df_products, "Products")

# Take the first candidate from each list (None when the list is empty).
# This relies on the real ID column appearing before any look-alike feature column.
cust_id_col_event = next(iter(cust_event_candidates), None)
prod_id_col_event = next(iter(prod_event_candidates), None)
cust_id_col_node = next(iter(cust_node_candidates), None)
prod_id_col_node = next(iter(prod_node_candidates), None)

chosen_id_cols = (cust_id_col_event, prod_id_col_event, cust_id_col_node, prod_id_col_node)
if any(not chosen for chosen in chosen_id_cols):
    raise ValueError("❌ Could not identify customer or product ID columns for merging. Please check the printed suggestions above.")


🔎 Inspecting columns in Events DataFrame:
Columns: ['hash(variantid)', 'hash(customerid)', 'isreturned']
Suggested customer ID columns: ['hash(customerid)']
Suggested product ID columns: ['hash(variantid)']

🔎 Inspecting columns in Customers DataFrame:
Columns: ['hash(customerid)', 'yearofbirth', 'ismale', 'shippingcountry', 'premier', 'salespercustomer', 'returnspercustomer', 'customerreturnrate', 'customerid_level_return_code_a', 'customerid_level_return_code_b', 'customerid_level_return_code_c', 'customerid_level_return_code_d', 'customerid_level_return_code_e', 'customerid_level_return_code_d.1', 'customerid_level_return_code_f', 'customerid_level_return_code_g', 'customerid_level_return_code_h', 'customerid_level_return_code_i', 'customerid_level_return_code_j', 'customerid_level_return_code_k', 'customerid_level_return_code_l', 'country_a', 'country_b', 'country_c', 'country_d', 'country_e', 'country_f', 'country_g', 'country_h', 'country_i']
Suggested customer ID columns: ['has

In [None]:
# -------- 6) Standardize ID column names --------
# Give the node tables canonical key names. The event table keeps its original
# key names, which are used as the left-hand join keys later on.
if cust_id_col_node != "customer_id":
    df_customers = df_customers.rename(columns={cust_id_col_node: "customer_id"})
cust_id_col_node = "customer_id"

if prod_id_col_node != "product_id":
    df_products = df_products.rename(columns={prod_id_col_node: "product_id"})
prod_id_col_node = "product_id"

In [None]:
# -------- 7) Merge datasets --------
# Left-join customer and then product attributes onto every event, so events
# without a matching node row are kept with NaN attributes.
df = (
    df_events
    .merge(
        df_customers,
        how="left",
        left_on=cust_id_col_event,
        right_on=cust_id_col_node,
        suffixes=("", "_cust"),
    )
    .merge(
        df_products,
        how="left",
        left_on=prod_id_col_event,
        right_on=prod_id_col_node,
        suffixes=("", "_prod"),
    )
)

print("Merged dataset shape:", df.shape)

Merged dataset shape: (1369133, 77)


In [None]:
# -------- 8) Missingness report --------
def missing_report(df_in):
    """Return the fraction of missing values per column, largest first.

    Columns with no missing values are omitted, so the result is an empty
    Series when ``df_in`` has no NaNs.
    """
    null_fraction = df_in.isna().mean()
    ranked = null_fraction.sort_values(ascending=False)
    return ranked.loc[ranked > 0]

# Compute the report once and reuse it. Each call scans the whole merged frame,
# and the original cell ran it three times.
missing_fractions = missing_report(df)

print("Missingness top columns:")
print(missing_fractions.head(20))

# Columns missing in more than this fraction of rows are dropped outright.
high_missing_thresh = 0.80
cols_to_drop = missing_fractions[missing_fractions > high_missing_thresh].index.tolist()
# Format the threshold into the message so the log stays correct if it is changed.
print(f"Dropping cols with >{high_missing_thresh:.0%} missing:", cols_to_drop)
df = df.drop(columns=cols_to_drop)

Missingness top columns:
producttype_g                    0.930414
producttype_h                    0.930414
producttype_a                    0.930414
producttype_b                    0.930414
producttype_c                    0.930414
producttype_j                    0.930414
producttype_i                    0.930414
producttype_k                    0.930414
brand_k                          0.930414
brand_j                          0.930414
brand_i                          0.930414
brand_g                          0.930414
brand_f                          0.930414
variantid_level_return_code_j    0.930414
variantid_level_return_code_k    0.930414
variantid_level_return_code_l    0.930414
brand_a                          0.930414
brand_b                          0.930414
brand_e                          0.930414
brand_c                          0.930414
dtype: float64
Dropping cols with >80% missing: ['producttype_g', 'producttype_h', 'producttype_a', 'producttype_b', 'producttype_c', '

In [None]:
# -------- 9) Feature types --------
# ID columns are join keys, not features, and must be excluded from BOTH lists.
# The hashed IDs are numeric in this data, so removing them only from the
# categorical list (as before) left them in `numeric_cols`, where they were
# scaled and used as ordinary features.
ids_to_remove_from_cat = [cust_id_col_event, prod_id_col_event, 'customer_id', 'product_id']
non_feature_cols = set(ids_to_remove_from_cat) | {label_col}

numeric_cols = [
    c for c in df.select_dtypes(include=[np.number]).columns
    if c not in non_feature_cols
]

# The label is excluded here too, so a string/categorical label can never be
# listed as an input feature.
categorical_cols = [
    c for c in df.select_dtypes(include=["object", "category"]).columns
    if c not in non_feature_cols
]

print("Numeric:", len(numeric_cols), "Categorical:", len(categorical_cols))

Numeric: 31 Categorical: 1


In [None]:
# -------- 10) Preprocessing pipelines --------
# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Categorical features: treat missing values as their own "missing" category,
# then one-hot encode. Categories unseen at fit time encode as all zeros.
# `sparse_output` requires scikit-learn >= 1.2.
categorical_steps = [
    ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
cat_pipeline = Pipeline(categorical_steps)

# Columns in neither list are dropped; sparse_threshold=0 forces a dense result.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_cols),
        ("cat", cat_pipeline, categorical_cols),
    ],
    remainder="drop",
    sparse_threshold=0,
)

y = df[label_col].astype(int)
X = df.drop(columns=[label_col])

# NOTE(review): this fit sees every row of X, including rows that are later
# held out for validation.
print("Fitting preprocessor...")
X_trans = preprocessor.fit(X).transform(X)
print("Transformed X shape:", X_trans.shape)

joblib.dump(preprocessor, "preprocessor_train.joblib")

Fitting preprocessor...
Transformed X shape: (1369133, 41)


['preprocessor_train.joblib']

In [None]:
# -------- 11) Train/validation split --------
# Split the RAW rows first, then fit the preprocessor on the training rows only.
# Fitting the imputers/scaler/encoder on all rows before splitting lets
# validation rows influence the learned statistics (data leakage), which makes
# validation scores optimistic.
# Same test_size / stratify / random_state as before, so the row partition is
# unchanged.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

preprocessor.fit(X_train_raw)
X_train = preprocessor.transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# Re-save so the artifact on disk is the transformer that produced X_train and X_val.
joblib.dump(preprocessor, "preprocessor_train.joblib")

print("Train/val sizes:", X_train.shape, X_val.shape, y_train.shape, y_val.shape)

Train/val sizes: (1232219, 41) (136914, 41) (1232219,) (136914,)


In [None]:
# -------- 12) Build bipartite graph --------
# Node keys are strings: "c_<customer id>" and "p_<product id>".
# Rows are iterated with itertuples() rather than iterrows(). iterrows() packs
# each row into a single Series, so an integer ID sitting next to float columns
# is upcast to float. That changes its string form (and can lose precision for
# large hashes), so node keys would no longer match the keys built from the
# all-integer event table. itertuples() keeps each column's dtype and is much
# faster.
G = nx.Graph()


def _add_entity_nodes(graph, nodes_df, id_col, prefix, partition):
    """Add one node per row of ``nodes_df`` that has a non-null ``id_col``.

    Every other column with a non-null value becomes a node attribute.
    Returns the list of feature column names used.
    """
    feat_cols = [c for c in nodes_df.columns if c != id_col]
    with_id = nodes_df[nodes_df[id_col].notna()]
    ordered = with_id[[id_col] + feat_cols]
    for node_key, *values in ordered.itertuples(index=False, name=None):
        attrs = {k: v for k, v in zip(feat_cols, values) if pd.notna(v)}
        graph.add_node(f"{prefix}_{node_key}", bipartite=partition, **attrs)
    return feat_cols


cust_feat_cols = _add_entity_nodes(G, df_customers, "customer_id", "c", 0)
prod_feat_cols = _add_entity_nodes(G, df_products, "product_id", "p", 1)

# One edge per (customer, product) pair whose endpoints both exist as nodes.
# nx.Graph stores a single edge per pair, so repeated events for the same pair
# collapse into one edge carrying the label of the last such event.
edge_cols = [cust_id_col_event, prod_id_col_event, label_col]
for c_id_event, p_id_event, raw_label in df_events[edge_cols].itertuples(index=False, name=None):
    label = int(raw_label)

    c_node_id = f"c_{c_id_event}"
    p_node_id = f"p_{p_id_event}"

    if c_node_id in G and p_node_id in G:
        G.add_edge(c_node_id, p_node_id, label=label)

print("Graph nodes/edges:", G.number_of_nodes(), G.number_of_edges())

Graph nodes/edges: 487508 50275


In [None]:
# -------- 13) Save processed datasets --------
# Persist the train/validation feature matrices and labels together as one
# 4-tuple, in the order (X_train, X_val, y_train, y_val).
joblib.dump((X_train, X_val, y_train, y_val), "tabular_train_val.joblib")
# Persist the merged events frame `df` as it stands at this point in the notebook.
# This is the cell's last expression, so its return value (the list of written
# file names) is what the notebook displays.
joblib.dump(df, "merged_events_train.joblib")

['merged_events_train.joblib']

## Summary:

### Q&A

*   **What are the key steps of the analysis performed in the notebook?**
    The analysis involved loading three datasets (customer, product, and event data), cleaning column names, identifying and standardizing ID columns, merging the datasets, handling missing values (dropping columns with >80% missing data, imputing remaining numerical and categorical features), identifying feature types, applying preprocessing pipelines (StandardScaler for numerical, OneHotEncoder for categorical), splitting the data into training and validation sets (stratified 90/10 split), and building a bipartite graph connecting customers and products based on events.
*   **What are the potential errors or limitations of the current analysis?**
    Potential issues include limitations of the chosen imputation strategies (median for numerical, constant for categorical), potential impact of One-Hot Encoding on feature dimensionality if applied to high-cardinality features, the `handle_unknown='ignore'` setting in OneHotEncoder which treats unknown categories as all zeros, the simplicity of the random train/validation split which might not capture temporal or group dependencies, the exclusion of customers/products from the graph if they are only in the event table but not the node files, and the exclusion of event-specific features as edge attributes in the graph.
*   **What improvements are suggested for the analysis?**
    Suggested improvements include exploring more sophisticated imputation methods (e.g., KNN, model-based), handling outliers, engineering new features, considering alternative categorical encoding methods, enriching the bipartite graph with more node and edge attributes, computing graph-based features or embeddings, exploring various modeling approaches (traditional ML, graph-enhanced models, GNNs), using more appropriate train/validation split strategies (temporal or group-based), and evaluating models using a wider range of metrics relevant to the business problem (Precision, Recall, F1-Score, AUC-ROC, cost-sensitive metrics).

### Data Analysis Key Findings

*   The analysis successfully loaded, cleaned, and merged customer, product, and event data into a single dataframe with a shape of (1369133, 77) before dropping columns.
*   The target variable `isreturned` in the training event data is slightly imbalanced, with approximately 55.3% of events being returns (1) and 44.7% being non-returns (0).
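*   Columns with more than 80% missing values were dropped from the merged dataframe during preprocessing. The columns listed in the missingness report are all product-side columns (`producttype_*`, `brand_*`, `variantid_level_return_code_*`), each about 93% missing, which suggests that most events did not match a row in the product node table during the left join.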
*   After imputation and feature transformation using StandardScaler and OneHotEncoder, the preprocessed feature matrix (`X_trans`) has a shape of (1369133, 41).
*   The data was split into a training set of shape (1232219, 41) and a validation set of shape (136914, 41), maintaining the original label distribution through stratification.
*   A bipartite graph was constructed representing customer-product interactions, containing 487508 nodes and 50275 edges.

### Insights

*   The current analysis provides a solid foundation of data preparation and basic feature engineering. The next crucial step is to train and evaluate predictive models using the prepared tabular data and explore methods to leverage the constructed bipartite graph for improved prediction performance.
*   Given the potential limitations identified, future work should focus on implementing more advanced data preprocessing techniques, exploring sophisticated modeling approaches (including those that utilize the graph structure), and carefully selecting evaluation metrics that align with the business objectives of return prediction.
