In [None]:
# Step 1: Install kaggle
!pip install kaggle

# Step 2: Upload your kaggle.json file
from google.colab import files
files.upload()  # This will prompt you to upload the kaggle.json file

# Step 3: Create directory and set permissions
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 4: Download the competition data
!kaggle competitions download -c Medical-Equipments-Cost-Prediction-Challenge

# Step 5: Unzip the files
!unzip Medical-Equipments-Cost-Prediction-Challenge.zip



Saving kaggle.json to kaggle.json
Downloading Medical-Equipments-Cost-Prediction-Challenge.zip to /content
  0% 0.00/291k [00:00<?, ?B/s]
100% 291k/291k [00:00<00:00, 710MB/s]
Archive:  Medical-Equipments-Cost-Prediction-Challenge.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the competition files.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Keep the test identifiers as a plain column for the submission file,
# then promote Hospital_Id to the row index of both frames.
test_ids = test_df['Hospital_Id']
train_df = train_df.set_index('Hospital_Id')
test_df = test_df.set_index('Hospital_Id')

# Separate the target from the predictors; copies keep the loaded frames untouched.
y_train = train_df["Transport_Cost"].copy()
X_train = train_df.drop("Transport_Cost", axis=1).copy()
X_test = test_df.copy()

# The model is trained on log(cost). Costs below 1 are raised to 1 first,
# so the log is always defined and the transformed target is never negative.
y_train_transformed = np.log(y_train.clip(lower=1))

# Stack train on top of test so the same feature engineering is applied to both;
# the first len(X_train) rows of combined_df are the training rows.
combined_df = pd.concat([X_train, X_test])

# --- Date features -----------------------------------------------------------
# Both date columns are parsed as month/day/two-digit-year strings.
for date_col in ('Order_Placed_Date', 'Delivery_Date'):
    combined_df[date_col] = pd.to_datetime(combined_df[date_col], format='%m/%d/%y')

# Whole days between order and delivery.
# Deliberately NOT cast with .astype(int): if either date is missing the
# difference is NaT and .dt.days yields NaN, which an integer cast rejects with
# an exception. Left as-is, a NaN here is filled by the median imputer of the
# numeric pipeline; when no date is missing the result is the same integer column.
combined_df['Delivery_Days'] = (combined_df['Delivery_Date'] - combined_df['Order_Placed_Date']).dt.days

# NOTE: despite its name, Order_Week holds the day of the week (0 = Monday ... 6 = Sunday).
combined_df['Order_Week'] = combined_df['Order_Placed_Date'].dt.dayofweek
combined_df['Order_Month'] = combined_df['Order_Placed_Date'].dt.month

# --- Size features -----------------------------------------------------------
# Equipment_Volume is height x width (no third dimension is used here),
# so it is a size proxy rather than a true volume.
combined_df['Equipment_Volume'] = combined_df['Equipment_Height'] * combined_df['Equipment_Width']
# The small epsilon keeps the division finite when the size proxy is zero.
combined_df['Equipment_Density'] = combined_df['Equipment_Weight'] / (combined_df['Equipment_Volume'] + 1e-6)

# Drop the raw dates (replaced by the features above) and the name/location columns.
# errors='ignore' tolerates columns that are absent from the data.
combined_df = combined_df.drop(
    columns=['Order_Placed_Date', 'Delivery_Date', 'Supplier_Name', 'Delivery_Location'],
    errors='ignore',
)


# Encode the Yes/No flags as 1/0. Any value that is not a key of binary_map
# (missing values included) ends up as 0.
binary_map = {'Yes': 1, 'No': 0}
binary_cols_to_map = ['CrossBorder_Shipping', 'Installation_Service', 'Rural_Hospital']
for col in binary_cols_to_map:
    if col not in combined_df.columns:
        continue  # tolerate a flag column that is absent from the data
    encoded_flag = combined_df[col].map(binary_map)
    combined_df[col] = encoded_flag.fillna(0)

# Columns treated as binary when the numeric column list is built later.
# NOTE(review): the two extra names are not passed through binary_map above —
# confirm that is intended.
binary_cols = [*binary_cols_to_map, 'Urgent_Shipping', 'Fragile_Equipment']

# Grouping low-frequency categories is a good preprocessing step in general,
# but it is not required here as no category frequency is below 1%.
# The grouping code is left disabled for reference.
combined_df_copy = combined_df.copy()
categorical_cols_to_group = ['Equipment_Type', 'Transport_Method', 'Hospital_Info']
# for col in categorical_cols_to_group:
#     if col in combined_df.columns:
#         train_counts = combined_df.iloc[:len(X_train)][col].value_counts(normalize=True)
#         low_freq_cats = train_counts[train_counts < 0.01].index
#         combined_df[col] = np.where(combined_df[col].isin(low_freq_cats), 'Other', combined_df[col])


# Split the engineered frame back into its train and test parts:
# the training rows come first, the test rows follow.
n_train_rows = len(X_train)
X_train_clean = combined_df.iloc[:n_train_rows]
X_test_clean = combined_df.iloc[n_train_rows:]

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column roles: numeric columns are every number-typed column that is not
# listed in binary_cols; categorical columns are the object-typed ones.
binary_lookup = set(binary_cols)
numeric_cols = [
    name
    for name in X_train_clean.select_dtypes(include=np.number).columns
    if name not in binary_lookup
]
categorical_cols = list(X_train_clean.select_dtypes(include=['object']).columns)

# Numeric branch: fill gaps with the column median, then standardise.
# Scaling is mandatory for Lasso.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    # Disabled optional step:
    # ('log_transform', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: missing entries become their own 'Missing' category,
# then one-hot encode. Categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    # Disabled alternative:
    # ('imputer', SimpleImputer(strategy='most_frequent', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Columns in neither list are forwarded unchanged (remainder='passthrough'),
# i.e. they are neither imputed nor scaled.
column_branches = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder='passthrough')

# Fit the preprocessor on the training rows and transform them in one step.
X_train_processed = preprocessor.fit_transform(X_train_clean)

# Report whether any NaN survived preprocessing of the training data.
train_nan_mask = np.isnan(X_train_processed)
if not train_nan_mask.any():
    print("No missing values found in the processed training data.")
else:
    print("Missing values found in the processed training data")
    print()
    total_missing = np.sum(train_nan_mask)
    print("Total missing values:", total_missing)

# The test rows are only transformed: the statistics learned from the
# training rows are reused, nothing is re-fitted.
X_test_processed = preprocessor.transform(X_test_clean)

# Same NaN report for the test data.
test_nan_mask = np.isnan(X_test_processed)
if not test_nan_mask.any():
    print("No missing values found in the processed test data")
else:
    print("Missing values found in the processed test data\n")
    total_missing = np.sum(test_nan_mask)
    print("Total missing values:", total_missing)

No missing values found in the processed training data.
No missing values found in the processed test data


In [None]:
# Hyperparameter tuning (tuning alpha)

# Candidate regularisation strengths: 30 points between 0.0001 and 0.1,
# spaced logarithmically. Smaller alpha = less regularization.
alpha_candidates = np.logspace(-4, -1, 30)
param_grid = {'regressor__alpha': alpha_candidates}

# Preprocessing and model are tuned as one pipeline, so the preprocessor is
# fitted as part of each fit the search performs.
lasso_base = Lasso(max_iter=5000, random_state=42)
tuning_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', lasso_base),
    ]
)

# 10-fold cross-validation, shuffled with a fixed seed for repeatable splits.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search over the alpha grid, scored by (negated) RMSE, on all cores.
grid_search = GridSearchCV(
    estimator=tuning_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_clean, y_train_transformed)

# The scorer is negated RMSE, so flip the sign to report a positive error.
best_alpha = grid_search.best_params_['regressor__alpha']
best_score = -grid_search.best_score_

print("Optimized Lasso Tuning Results")
print(f"Best Cross validation score for Lasso (on log-target): {best_score:.5f}")
print()
print(f"Best Alpha found: {best_alpha:.6f}")

Fitting 10 folds for each of 30 candidates, totalling 300 fits
Optimized Lasso Tuning Results
Best Cross validation score for Lasso (on log-target): 2.11064

Best Alpha found: 0.001743


In [None]:
# Final model training and prediction

print(f"Training final Lasso model with Alpha={best_alpha:.6f} on full training set...")

# Lasso with the tuned alpha, wrapped with the same preprocessor used during tuning.
lasso_top_model = Lasso(alpha=best_alpha, max_iter=10000, random_state=42)
final_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', lasso_top_model),
    ]
)

# Fit on the complete training set (log-transformed target).
final_pipeline.fit(X_train_clean, y_train_transformed)

# Predictions come out on the log scale ...
predictions_log = final_pipeline.predict(X_test_clean)

# ... so undo the log with e^x. exp() is strictly positive, so the zero floor
# below never changes a value; it is kept only as an explicit non-negativity guard.
predictions_original_scale = np.maximum(np.exp(predictions_log), 0)

# Build the submission: one row per test hospital, identifier plus predicted cost.
submission = pd.DataFrame({'Hospital_Id': test_ids}).assign(
    Transport_Cost=predictions_original_scale
)

submission.to_csv("submission_final_lasso.csv", index=False)
print()
print("Submission file 'submission_final_lasso.csv' successfully created.")

# This Lasso model uses cross-validated, fine-tuned regularization chosen to minimize the final score.

Training final Lasso model with Alpha=0.001743 on full training set...

Submission file 'submission_final_lasso.csv' successfully created.
