In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans, AffinityPropagation
from sklearn.mixture import GaussianMixture

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input mount. os.walk yields nothing for a
# directory that does not exist, so this prints nothing outside Kaggle.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the raw test and training splits; the paths are relative to the notebook's
# working directory.
test_df = pd.read_csv(r'..//data//test.csv')
train_df = pd.read_csv(r'..//data//train.csv')
# Disabled: append the extra training file to the training split.
# train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
# train_df = pd.concat([train_df, train_extra_df], ignore_index=True)


# Name of the target column that the models below are trained to predict.
target = 'price'

def prepare_data(df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
    """
    Prepares the dataset for training or testing by renaming columns, handling missing values,
    converting categorical and numerical features, and creating new features.

    The input frame is never modified; a new frame is returned.

    Args:
        df (pd.DataFrame): The input dataframe (train or test). Columns are renamed purely by
            position, so it must have exactly 10 columns (test) or 11 columns (train, with
            the target last).
        is_train (bool): Indicates if the dataframe is training data (default is True).

    Returns:
        pd.DataFrame: The processed dataframe, without the ``id`` column and with the extra
        columns ``size_int``, ``weight_capacity_int`` and ``weight_capacity_size``.

    Raises:
        ValueError: If the number of columns does not match the expected layout.
    """

    # Define the column names
    columns = [
        'id', 'brand', 'material', 'size', 'compartments',
        'laptop_compartment', 'is_waterproof', 'style', 'color',
        'weight_capacity'
    ]

    if is_train:
        columns.append('price')

    # set_axis returns a relabelled frame, so the caller's column names stay untouched
    # (assigning to df.columns would rename the caller's frame in place).
    df = df.set_axis(columns, axis=1).drop(columns='id')

    # Ordinal encoding of size; missing or unrecognised sizes become 0
    size_mapping = {"Small": 1, "Medium": 2, "Large": 3}
    df["size_int"] = df["size"].map(size_mapping).fillna(0).astype(int)

    # Handle weight capacity: missing values are treated as 0 before deriving features
    df['weight_capacity'] = df['weight_capacity'].fillna(0)
    df['weight_capacity_int'] = df['weight_capacity'].astype(int)  # truncates toward zero
    df['weight_capacity_size'] = df['weight_capacity'] * df['size_int']

    # Convert categorical columns (compartments is numeric in the raw data but treated as a category)
    df['compartments'] = df['compartments'].astype('category')
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    df = df.astype({col: 'category' for col in cat_cols})

    # Convert the two flag columns to integer codes. Casting to category explicitly keeps this
    # working when a flag column was parsed as bool rather than text. Codes follow the sorted
    # category order (e.g. 'No' -> 0, 'Yes' -> 1) and missing values are encoded as -1.
    for flag_col in ('laptop_compartment', 'is_waterproof'):
        df[flag_col] = df[flag_col].astype('category').cat.codes.astype(int)

    return df

# Run both splits through the shared preparation routine; only the training
# split is flagged as carrying the target column.
train_df = prepare_data(df=train_df, is_train=True)
test_df = prepare_data(df=test_df, is_train=False)

In [5]:
# Columns that are standardised by the preprocessor.
numeric_cols = [
    'weight_capacity',
    'weight_capacity_int',
    'weight_capacity_size',
    'size_int',
    'is_waterproof',
]
# Columns that are one-hot encoded by the preprocessor.
cat_cols = ['brand', 'material', 'compartments', 'color']

# Feature columns selected into X; the listed order is kept exactly.
model_features = [
    'weight_capacity',
    'weight_capacity_int',
    'weight_capacity_size',
    'size_int',
    'color',
    'compartments',
    'brand',
    'material',
    'is_waterproof',
]

# Per-type transformers, each wrapped in a single-step pipeline.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(drop=None, handle_unknown='ignore'))]
)

# Route each column group to its transformer; any column not listed is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='drop',
)

# Feature matrix (an independent copy) and target taken from the prepared training frame.
X = train_df.loc[:, model_features].copy()
y = train_df.loc[:, target]

In [6]:
### MiniBatchKMeans
# The timer starts before the pipeline is built, so construction is included in the timing.
model_start_time = time.time()

# Preprocess the features, then cluster them into 5 groups.
kmeans_pipeline = Pipeline(
    [
        ('preprocessing', preprocessor),
        ('clusterer', MiniBatchKMeans(n_clusters=5, random_state=42)),
    ]
)
kmeans_pipeline.fit(X)

# Labels assigned to the fitted rows, stored on the training frame as a new column.
kmeans_labels = kmeans_pipeline.named_steps['clusterer'].labels_
train_df['cluster'] = kmeans_labels

print(f"MiniBatchKMeans Training & Prediction time: {time.time() - model_start_time:.2f} seconds")

# Number of rows assigned to each cluster.
print("MiniBatchKMeans cluster counts:")
cluster_counts = pd.Series(kmeans_labels).value_counts().reset_index()
display(cluster_counts)

MiniBatchKMeans Training & Prediction time: 0.83 seconds
MiniBatchKMeans cluster counts:


Unnamed: 0,index,count
0,4,70538
1,2,68946
2,1,55846
3,3,54823
4,0,49847


In [7]:
def cv_clusters(cluster_labels_dict, X, y, kf, verbose=True):
    """
    Cross-validate a "one LightGBM regressor per cluster" setup for several cluster counts.

    For every fold from ``kf`` and every entry of ``cluster_labels_dict``, the training fold
    is partitioned with MiniBatchKMeans (fitted on the training fold only), a separate
    LightGBM model is trained on each partition, and each validation row is scored by the
    model of the cluster it is assigned to. A cluster count of 1 skips clustering and trains
    a single model on the whole training fold.

    Relies on the notebook-level names ``model_features`` (columns passed to LightGBM) and
    ``preprocessor`` (transformer applied before clustering).

    Args:
        cluster_labels_dict (dict): Maps a method name to the number of clusters to use.
        X (pd.DataFrame): Feature frame. It is not modified; per-fold copies are used.
        y (pd.Series): Target values, positionally aligned with the rows of ``X``.
        kf: Splitter whose ``split(X)`` yields positional (train, validation) index arrays.
        verbose (bool): Currently unused; kept so existing calls keep working.

    Returns:
        dict: One entry per method name. Every entry holds a copy of the same score table:
        one row per fold and one column per method, containing the validation RMSE.
    """
    # LightGBM parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'verbose': -1,
        'force_row_wise': True
    }

    # Fold RMSE scores accumulated per method name
    model_scores = {name: [] for name in cluster_labels_dict}

    # K-fold cross validation. Split on the X argument, not on a notebook global,
    # so the folds always match the data that was passed in.
    for train_index, valid_index in kf.split(X):

        # Explicit copies: a 'cluster' column is written to both frames below and must
        # never reach the caller's frame.
        X_train = X.iloc[train_index].copy()
        X_valid = X.iloc[valid_index].copy()
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Loop over each clustering configuration
        for method_name, n_clusters in cluster_labels_dict.items():

            if n_clusters == 1:
                X_train['cluster'] = 0
                X_valid['cluster'] = 0
            else:
                kmeans_pipeline = Pipeline(
                    steps=[
                        ('preprocessing', preprocessor),
                        ('clusterer', MiniBatchKMeans(n_clusters=n_clusters, random_state=42))
                    ]
                )

                # Fit on the training fold only; validation rows are merely assigned.
                kmeans_pipeline.fit(X_train)
                X_train['cluster'] = kmeans_pipeline.named_steps['clusterer'].labels_
                X_valid['cluster'] = kmeans_pipeline.predict(X_valid)

            # Positional masks and a plain array for the predictions: this does not depend
            # on X and y sharing index labels, nor on the name of y's index.
            train_clusters = X_train['cluster'].to_numpy()
            valid_clusters = X_valid['cluster'].to_numpy()
            y_pred_valid = np.zeros(len(y_valid), dtype=float)

            # Train a separate model for each cluster
            for c in range(n_clusters):
                train_mask = train_clusters == c
                valid_mask = valid_clusters == c

                # No validation rows in this cluster: nothing to score, so skip training.
                if not valid_mask.any():
                    continue

                train_data = lgb.Dataset(
                    X_train.loc[train_mask, model_features],
                    label=y_train.iloc[train_mask],
                )

                fit_model = lgb.train(
                    params,
                    train_data,
                    num_boost_round=100,
                    valid_sets=[train_data],
                )

                y_pred_valid[valid_mask] = fit_model.predict(
                    X_valid.loc[valid_mask, model_features],
                    num_iteration=fit_model.best_iteration,
                )

            # RMSE of this configuration over all validation rows of the fold
            rmse = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
            model_scores[method_name].append(rmse)

    # Build the score table once, after all folds, and expose it under every method name.
    # Looking up any single method (e.g. the last one) returns the full table.
    df_scores = pd.DataFrame(model_scores)
    all_results = {name: df_scores.copy() for name in cluster_labels_dict}

    return all_results

In [8]:
# Configurations to compare: method name -> number of clusters.
cluster_labels_dict = dict(
    [
        ("baseline", 1),
        ("kmeans_2", 2),
        ("kmeans_3", 3),
        ("kmeans_4", 4),
        ("kmeans_5", 5),
    ]
)

# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(5, shuffle=True, random_state=42)
cluster_cv_results = cv_clusters(
    cluster_labels_dict=cluster_labels_dict, X=X, y=y, kf=kf, verbose=True
)

# Pull out the score table stored under the "kmeans_5" key.
result_kmeans = cluster_cv_results["kmeans_5"]
print("KMeans per-fold RMSE:")
# display(result_kmeans.round(3))

# Column-wise mean and standard deviation of the score table.
rmse_mean = result_kmeans.mean()
rmse_std = result_kmeans.std()
summary_kmeans = pd.DataFrame({"Mean RMSE": rmse_mean, "Std RMSE": rmse_std})
summary_kmeans.round(3)

KMeans per-fold RMSE:


Unnamed: 0,baseline,kmeans_2,kmeans_3,kmeans_4,kmeans_5
0,38.913,38.927,38.933,38.954,38.968
1,39.04,39.057,39.064,39.064,39.078
2,39.016,39.022,39.038,39.052,39.08
3,39.062,39.07,39.082,39.086,39.097
4,39.003,39.01,39.024,39.029,39.046


Unnamed: 0,Mean RMSE,Std RMSE
baseline,39.007,0.057
kmeans_2,39.017,0.056
kmeans_3,39.028,0.058
kmeans_4,39.037,0.051
kmeans_5,39.054,0.051


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')