In [1]:
# Install lightgbm if not already installed
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Import the libraries

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
print("added")

added


### Loading of the train data

In [3]:
# Load the dataset
print("Loading data...")
try:
    train_orders = pd.read_csv('Train/orders.csv', low_memory=False)
    train_customers = pd.read_csv('Train/train_customers.csv')
    train_locations = pd.read_csv('Train/train_locations.csv')
    vendors = pd.read_csv('Train/vendors.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure all CSV files are in the same directory as the script.")
    exit()

print("Data loaded successfully.")

Loading data...
Data loaded successfully.
Data loaded successfully.


In [4]:
print("Loading data...")

try:
    # --- Load all source files ---
    train_orders = pd.read_csv('Train/orders.csv')
    train_customers = pd.read_csv('Train/train_customers.csv')
    train_locations = pd.read_csv('Train/train_locations.csv')
    vendors = pd.read_csv('Train/vendors.csv')

except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please ensure all CSV files are in the correct 'Train/' subdirectory.")
    exit()

print("Preparing and merging data...")

# --- Rename columns BEFORE merging to avoid confusion ('_x', '_y') ---
vendors.rename(columns={
    'latitude': 'vendor_lat',
    'longitude': 'vendor_lon',
    'status': 'vendor_status',
    'rating': 'vendor_rating'
}, inplace=True)

train_locations.rename(columns={
    'latitude': 'customer_lat',
    'longitude': 'customer_lon'
}, inplace=True)

# --- Merge all training data sources ---
# Start with orders and add details about the customer, vendor, and location
train_merged = train_orders.merge(train_customers, on='customer_id', how='left')
train_merged = train_merged.merge(vendors, left_on='vendor_id', right_on='id', how='left')
train_merged = train_merged.merge(
    train_locations,
    on=['customer_id'],  # Only merge on customer_id
    how='left'
)

# Debug: print columns to check for missing/misnamed columns
print("\nColumns in train_merged:")
print(train_merged.columns.tolist())

# --- Define the specific columns required for training a model ---
# These features are known at the time of prediction and avoid data leakage
required_columns = [
    # --- IDs (for context, not as model features) ---
    'customer_id',
    'vendor_id',
    # 'LOCATION_NUMBER',  # Remove if not present

    # --- Customer Features ---
    'gender',
    'dob',                         # To calculate customer age
    'status',                      # Customer account status
    'created_at_x',                # To calculate customer tenure (from customers table)

    # --- Vendor Features ---
    'vendor_category_en',
    'delivery_charge',
    'serving_distance',
    'is_open',
    'prepration_time',             # Vendor's average preparation time
    'commission',
    'discount_percentage',
    'vendor_status',               # Vendor's account status
    'rank',
    # 'vendor_rating',               # Vendor's overall historical rating (removed)
    'vendor_tag_name',             # Descriptive tags like 'Healthy', 'Pizza'

    # --- Location & Interaction Features ---
    'is_favorite',                 # If the customer has favorited this vendor
    'LOCATION_TYPE',               # e.g., 'Home', 'Work'
    'customer_lat',
    'customer_lon',
    'vendor_lat',
    'vendor_lon',
]

# --- Create the final training dataframe with only the required columns ---
# Keep all rows, even those with missing values
final_training_df = train_merged[required_columns].reset_index(drop=True)

print("\n--- Training Data Ready ---")
print(f"Final training data has {final_training_df.shape[0]} rows and {final_training_df.shape[1]} columns.")
print("Columns:", final_training_df.columns.tolist())
print("\nSample of the final training data:")
print(final_training_df.head())

# Save the final DataFrame to CSV
final_training_df.to_csv('Train/train_merged.csv', index=False)
print("\nMerged training data saved to Train/train_merged.csv")


Loading data...


  train_orders = pd.read_csv('Train/orders.csv')


Preparing and merging data...

Columns in train_merged:
['order_id', 'customer_id', 'item_count', 'grand_total', 'payment_mode', 'promo_code', 'vendor_discount_amount', 'promo_code_discount_percentage', 'is_favorite', 'is_rated', 'vendor_rating_x', 'driver_rating', 'deliverydistance', 'preparationtime', 'delivery_time', 'order_accepted_time', 'driver_accepted_time', 'ready_for_pickup_time', 'picked_up_time', 'delivered_time', 'delivery_date', 'vendor_id', 'created_at_x', 'LOCATION_NUMBER', 'LOCATION_TYPE', 'CID X LOC_NUM X VENDOR', 'gender', 'dob', 'status', 'verified_x', 'language_x', 'created_at_y', 'updated_at_x', 'id', 'authentication_id', 'vendor_lat', 'vendor_lon', 'vendor_category_en', 'vendor_category_id', 'delivery_charge', 'serving_distance', 'is_open', 'OpeningTime', 'OpeningTime2', 'prepration_time', 'commission', 'is_haked_delivering', 'discount_percentage', 'vendor_status', 'verified_y', 'rank', 'language_y', 'vendor_rating_y', 'sunday_from_time1', 'sunday_to_time1', 'sun

In [5]:
def feature_engineer(df):
    """Creates new, predictive features from existing columns."""
    df = df.copy()
    
    if 'dob' in df.columns:
        df['customer_age'] = 2025 - pd.to_numeric(df['dob'], errors='coerce')
        df['customer_age'].fillna(df['customer_age'].median(), inplace=True)
    
    if 'created_at_x' in df.columns:
        try:
            df['customer_tenure_days'] = (datetime(2025, 7, 28) - pd.to_datetime(df['created_at_x'], errors='coerce')).dt.days
            df['customer_tenure_days'].fillna(0, inplace=True)
        except:
            df['customer_tenure_days'] = 0
    
    if 'customer_lat' in df.columns and 'vendor_lat' in df.columns:
        df['distance'] = np.sqrt((df['customer_lat'] - df['vendor_lat'])**2 + (df['customer_lon'] - df['vendor_lon'])**2)
        df['distance'].fillna(df['distance'].median(), inplace=True)
    
    if 'vendor_tag_name' in df.columns:
        df['vendor_tag_count'] = df['vendor_tag_name'].fillna('').astype(str).str.count(',') + 1
        df['vendor_tag_count'].fillna(0, inplace=True)
    
    return df

def prepare_test_set(data_path='Test/'):
    """Loads and prepares the test data by creating all possible recommendations."""
    print("\nPreparing test set...")
    try:
        test_locations = pd.read_csv(f'{data_path}test_locations.csv')
        customers = pd.read_csv('Train/train_customers.csv')
        vendors = pd.read_csv('Train/vendors.csv')
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Creating mock test set from training data...")
        # Create a mock test set from existing data
        customers = pd.read_csv('Train/train_customers.csv')
        vendors = pd.read_csv('Train/vendors.csv')
        locations = pd.read_csv('Train/train_locations.csv')
        
        # Sample some customers and locations for testing
        test_customers = customers.sample(n=min(100, len(customers)), random_state=42)
        test_locations = locations[locations['customer_id'].isin(test_customers['customer_id'])].copy()
        
        test_df = pd.merge(test_locations, test_customers, on='customer_id', how='left')
        test_df['key'] = 1
        vendors['key'] = 1
        test_df = pd.merge(test_df, vendors, on='key').drop('key', axis=1)
        
        test_df.rename(columns={
            'latitude_x': 'customer_lat', 'longitude_x': 'customer_lon', 
            'latitude_y': 'vendor_lat', 'longitude_y': 'vendor_lon', 
            'status_y': 'vendor_status'
        }, inplace=True)
        
        print(f"✅ Mock test set created with {len(test_df)} potential recommendations.")
        return test_df
    
    test_df = pd.merge(test_locations, customers, on='customer_id', how='left')
    test_df['key'] = 1
    vendors['key'] = 1
    test_df = pd.merge(test_df, vendors, on='key').drop('key', axis=1)
    
    test_df.rename(columns={
        'latitude_x': 'customer_lat', 'longitude_x': 'customer_lon', 'latitude_y': 'vendor_lat', 
        'longitude_y': 'vendor_lon', 'status_y': 'vendor_status', 'vendor_rating': 'overall_vendor_rating',
        'created_at_x': 'customer_created_at'
    }, inplace=True)
    
    print(f"✅ Test set created with {len(test_df)} potential recommendations.")
    return test_df

print("Feature engineering and test set functions defined.")

Feature engineering and test set functions defined.


In [6]:
print("--- Creating Proper Training Dataset with Positive & Negative Examples ---")

# Step 1: Create ALL possible customer-vendor combinations
print("Creating all possible customer-vendor combinations...")
all_customers = train_customers['customer_id'].unique()
all_vendors = vendors['id'].unique()

print(f"Found {len(all_customers)} unique customers and {len(all_vendors)} unique vendors")
print(f"Total possible combinations: {len(all_customers) * len(all_vendors):,}")

# For computational efficiency, let's sample a subset of combinations
# Sample customers and vendors to create a manageable training set
sample_customers = np.random.choice(all_customers, size=min(1000, len(all_customers)), replace=False)
sample_vendors = np.random.choice(all_vendors, size=min(200, len(all_vendors)), replace=False)

print(f"Sampling {len(sample_customers)} customers and {len(sample_vendors)} vendors")
print(f"Creating {len(sample_customers) * len(sample_vendors):,} combinations for training")

# Create all combinations (Cartesian product)
customer_vendor_combinations = []
for customer in sample_customers:
    for vendor in sample_vendors:
        customer_vendor_combinations.append({
            'customer_id': customer,
            'vendor_id': vendor
        })

all_combinations_df = pd.DataFrame(customer_vendor_combinations)
print(f"Created {len(all_combinations_df):,} customer-vendor combinations")

# Step 2: Mark which combinations are actual orders (positive examples)
print("Identifying positive examples from actual orders...")
actual_orders = set(zip(train_orders['customer_id'], train_orders['vendor_id']))
print(f"Found {len(actual_orders):,} actual order combinations")

# Create target column: 1 for actual orders, 0 for non-orders
all_combinations_df['target'] = all_combinations_df.apply(
    lambda row: 1 if (row['customer_id'], row['vendor_id']) in actual_orders else 0, 
    axis=1
)

positive_examples = all_combinations_df[all_combinations_df['target'] == 1]
negative_examples = all_combinations_df[all_combinations_df['target'] == 0]

print(f"✅ Positive examples (actual orders): {len(positive_examples):,}")
print(f"✅ Negative examples (non-orders): {len(negative_examples):,}")
print(f"📊 Positive ratio: {len(positive_examples) / len(all_combinations_df):.3f}")

# Step 3: Add features by merging with other datasets
print("Adding features by merging with customer, vendor, and location data...")

# Merge with customers
train_full = all_combinations_df.merge(train_customers, on='customer_id', how='left')

# Merge with vendors (rename columns first to avoid conflicts)
vendors_renamed = vendors.copy()
vendors_renamed.rename(columns={
    'latitude': 'vendor_lat',
    'longitude': 'vendor_lon',
    'status': 'vendor_status',
    'rating': 'vendor_rating'
}, inplace=True)

train_full = train_full.merge(vendors_renamed, left_on='vendor_id', right_on='id', how='left')

# Merge with locations
train_full = train_full.merge(train_locations, on='customer_id', how='left')

# Apply feature engineering
train_full = feature_engineer(train_full)

print(f"✅ Training dataset ready: {len(train_full):,} rows with {len(train_full.columns)} features")
print(f"📊 Final positive/negative ratio: {train_full['target'].mean():.3f}")

# Create a smaller test set for prediction
print("Creating test set from sampled data...")
test_df = train_full.sample(n=min(5000, len(train_full)), random_state=42).copy()
print(f"✅ Test set created with {len(test_df):,} rows")

--- Creating Proper Training Dataset with Positive & Negative Examples ---
Creating all possible customer-vendor combinations...
Found 34523 unique customers and 100 unique vendors
Total possible combinations: 3,452,300
Sampling 1000 customers and 100 vendors
Creating 100,000 combinations for training
Created 100,000 customer-vendor combinations
Identifying positive examples from actual orders...
Found 71,484 actual order combinations
✅ Positive examples (actual orders): 2,111
✅ Negative examples (non-orders): 97,889
📊 Positive ratio: 0.021
Adding features by merging with customer, vendor, and location data...
✅ Positive examples (actual orders): 2,111
✅ Negative examples (non-orders): 97,889
📊 Positive ratio: 0.021
Adding features by merging with customer, vendor, and location data...
✅ Training dataset ready: 170,700 rows with 77 features
📊 Final positive/negative ratio: 0.032
Creating test set from sampled data...
✅ Test set created with 5,000 rows
✅ Training dataset ready: 170,700 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_age'].fillna(df['customer_age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_tenure_days'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [7]:
categorical_cols = [col for col in train_full.columns if train_full[col].dtype == 'object']

for col in categorical_cols:
    if col in test_df.columns:
        le = LabelEncoder()
        # Fit on all possible values from both train and test sets
        combined_series = pd.concat([train_full[col].astype(str), test_df[col].astype(str)])
        le.fit(combined_series)
        train_full[col] = le.transform(train_full[col].astype(str))
        test_df[col] = le.transform(test_df[col].astype(str))

print("✅ Categorical features encoded.")

✅ Categorical features encoded.


In [8]:
print("\n--- Training the Model ---")
features_to_drop = [
    'CID X LOC_NUM X VENDOR', 'customer_id', 'vendor_id', 'id', 'target', 'dob',
    'customer_created_at', 'customer_lat', 'customer_lon', 'vendor_lat', 'vendor_lon'
]
features = [col for col in train_full.columns if col not in features_to_drop and col in test_df.columns]

X_train = train_full[features]
y_train = train_full['target']
X_test = test_df[features]

print(f"Training LightGBM model with {len(features)} features...")
lgbm_params = {
    'objective': 'binary', 'metric': 'auc', 'n_estimators': 1000,
    'learning_rate': 0.05, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'bagging_freq': 1, 'verbose': -1, 'n_jobs': -1, 'seed': 42,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

print("✅ Model training complete.")


--- Training the Model ---
Training LightGBM model with 68 features...


✅ Model training complete.


In [10]:
print("\n--- Generating Submission File (Fixed Approach) ---")

# Since we don't have actual test_in.csv, we'll simulate the proper approach
print("Simulating proper test data approach...")

# In a real competition, you would load test_in.csv which contains:
# - customer_id
# - LOCATION_NUMBER  
# - vendor_id
# And you predict whether this specific combination will result in an order

# For demonstration, let's create a realistic test set
print("Creating realistic test combinations...")

# Sample some customers and create test combinations
test_customers = np.random.choice(all_customers, size=min(100, len(all_customers)), replace=False)
test_combinations = []

for customer in test_customers:
    # For each customer, create several location-vendor combinations to predict
    num_combinations = np.random.randint(1, 6)  # 1-5 combinations per customer
    customer_vendors = np.random.choice(all_vendors, size=num_combinations, replace=False)
    
    for i, vendor in enumerate(customer_vendors):
        test_combinations.append({
            'customer_id': customer,
            'LOCATION_NUMBER': i + 1,  # Location numbers 1, 2, 3...
            'vendor_id': vendor
        })

test_input_df = pd.DataFrame(test_combinations)
print(f"Created {len(test_input_df)} test combinations to predict")

# Now prepare the test set with the same features as training
print("Preparing test set with features...")

# Merge with customer data
test_prepared = test_input_df.merge(train_customers, on='customer_id', how='left')

# Merge with vendor data
test_prepared = test_prepared.merge(vendors_renamed, left_on='vendor_id', right_on='id', how='left')

# Merge with location data
test_prepared = test_prepared.merge(train_locations, on='customer_id', how='left')

# Apply same feature engineering
test_prepared = feature_engineer(test_prepared)

print(f"Test set prepared: {len(test_prepared)} rows")

# Apply the same categorical encoding as used in training
print("Encoding categorical features in test data...")

# Create label encoders for test data using the same approach as training
categorical_cols = [col for col in test_prepared.columns if test_prepared[col].dtype == 'object']

for col in categorical_cols:
    le = LabelEncoder()
    test_prepared[col] = le.fit_transform(test_prepared[col].astype(str).fillna('missing'))

print("Test data encoding complete.")

# Extract only the features that were used in training
print("Extracting training features...")
print(f"Training was done with these {len(features)} features: {features[:10]}...")

# Make sure we only use the exact same features as training
test_features = test_prepared[features]

print("Making predictions...")
test_predictions = model.predict_proba(test_features)[:, 1]
test_prepared['prediction_prob'] = test_predictions

# Create submission in the correct format
print("Creating submission file...")
test_prepared['CID X LOC_NUM X VENDOR'] = (
    test_prepared['customer_id'].astype(str) + ' X ' + 
    test_prepared['LOCATION_NUMBER'].astype(str) + ' X ' + 
    test_prepared['vendor_id'].astype(str)
)

# The target is the prediction probability (or binary prediction)
# For binary: test_prepared['target'] = (test_predictions > 0.5).astype(int)
# For probability: test_prepared['target'] = test_predictions
test_prepared['target'] = test_predictions

# Create final submission
submission_file = test_prepared[['CID X LOC_NUM X VENDOR', 'target']].copy()

# Sort by prediction probability (highest first)
submission_file = submission_file.sort_values('target', ascending=False)

submission_file.to_csv('submission.csv', index=False)
print(f"✅ Submission.csv created successfully with {len(submission_file)} predictions!")
print("\nSample submissions (highest probability first):")
print(submission_file.head(10))

print(f"\n📊 Prediction Statistics:")
print(f"• Mean prediction: {test_predictions.mean():.4f}")
print(f"• Min prediction: {test_predictions.min():.4f}")
print(f"• Max prediction: {test_predictions.max():.4f}")
print(f"• Predictions > 0.5: {(test_predictions > 0.5).sum()}")
print(f"• Predictions > 0.1: {(test_predictions > 0.1).sum()}")


--- Generating Submission File (Fixed Approach) ---
Simulating proper test data approach...
Creating realistic test combinations...
Created 316 test combinations to predict
Preparing test set with features...
Test set prepared: 512 rows
Encoding categorical features in test data...
Test data encoding complete.
Extracting training features...
Training was done with these 68 features: ['gender', 'status', 'verified_x', 'language_x', 'created_at_x', 'updated_at_x', 'authentication_id', 'vendor_category_en', 'vendor_category_id', 'delivery_charge']...
Making predictions...
Creating submission file...
✅ Submission.csv created successfully with 512 predictions!

Sample submissions (highest probability first):
    CID X LOC_NUM X VENDOR    target
211           24 X 2 X 845  0.272779
324           53 X 3 X 573  0.207621
460           84 X 5 X 113  0.191333
495           73 X 4 X 113  0.125236
173           63 X 2 X 386  0.125055
216           58 X 3 X 105  0.111306
252           83 X 3 X 207 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_age'].fillna(df['customer_age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_tenure_days'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [None]:
print("="*80)
print("🎉 PREDICTIVE RESTAURANT RECOMMENDER MODEL - PROPERLY FIXED! 🎉")
print("="*80)
print("\n📊 SUMMARY OF FIXES:")
print("✅ Fixed Target Leakage:")
print(f"   • Created {len(train_full):,} training examples")
print(f"   • Positive examples: {(train_full['target'] == 1).sum():,}")
print(f"   • Negative examples: {(train_full['target'] == 0).sum():,}")
print(f"   • Model now learns to distinguish good vs bad recommendations")

print("\n✅ Fixed Submission Generation:")
print(f"   • No longer creates all possible combinations")
print(f"   • Predicts only specific customer-vendor pairs")
print(f"   • Generated {len(submission_file)} realistic predictions")

print("\n🔧 FOR REAL COMPETITION DATA:")
print("When you have actual test_in.csv file, replace the simulation with:")
print("""
# Load actual test data
test_in = pd.read_csv('test_in.csv')  # Contains customer_id, LOCATION_NUMBER, vendor_id

# Prepare features (same as training)
test_prepared = test_in.merge(train_customers, on='customer_id', how='left')
test_prepared = test_prepared.merge(vendors_renamed, left_on='vendor_id', right_on='id', how='left')
test_prepared = test_prepared.merge(train_locations, on='customer_id', how='left')
test_prepared = feature_engineer(test_prepared)

# Make predictions
predictions = model.predict_proba(test_prepared[features])[:, 1]

# Create submission
test_prepared['CID X LOC_NUM X VENDOR'] = (
    test_prepared['customer_id'].astype(str) + ' X ' + 
    test_prepared['LOCATION_NUMBER'].astype(str) + ' X ' + 
    test_prepared['vendor_id'].astype(str)
)
test_prepared['target'] = predictions

submission = test_prepared[['CID X LOC_NUM X VENDOR', 'target']]
submission.to_csv('submission.csv', index=False)
""")

print("\n📁 FILES CREATED:")
print("• Train/train_merged.csv - Original merged data")
print("• submission.csv - Fixed predictions with proper probabilities")

print("\n🎯 KEY IMPROVEMENTS:")
print("• Model now predicts probabilities (0-1) instead of always 1")
print("• Training includes both successful and unsuccessful recommendations")
print("• Submission format matches competition requirements")
print("• Scalable approach for large datasets")

print("="*80)

🎉 PREDICTIVE RESTAURANT RECOMMENDER MODEL COMPLETE! 🎉

📊 SUMMARY:
• Loaded and merged training data from 4 CSV files
• Created 412,400 training samples (positive + negative)
• Engineered 17 features for the model
• Trained LightGBM classifier with AUC optimization
• Generated 889 restaurant recommendations

📁 FILES CREATED:
• Train/train_merged.csv - Complete merged training dataset
• submission.csv - Final restaurant recommendations

✅ The model is ready to predict restaurant preferences!
