In [1]:
# Install lightgbm if not already installed
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Import the libraries

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
print("added")

added


### Loading of the train data

In [3]:
# Load the dataset
print("Loading data...")
try:
    train_orders = pd.read_csv('Train/orders.csv', low_memory=False)
    train_customers = pd.read_csv('Train/train_customers.csv')
    train_locations = pd.read_csv('Train/train_locations.csv')
    vendors = pd.read_csv('Train/vendors.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure all CSV files are in the same directory as the script.")
    exit()

print("Data loaded successfully.")

Loading data...
Data loaded successfully.
Data loaded successfully.


In [4]:
print("Loading data...")

try:
    # --- Load all source files ---
    train_orders = pd.read_csv('Train/orders.csv')
    train_customers = pd.read_csv('Train/train_customers.csv')
    train_locations = pd.read_csv('Train/train_locations.csv')
    vendors = pd.read_csv('Train/vendors.csv')

except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please ensure all CSV files are in the correct 'Train/' subdirectory.")
    exit()

print("Preparing and merging data...")

# --- Rename columns BEFORE merging to avoid confusion ('_x', '_y') ---
vendors.rename(columns={
    'latitude': 'vendor_lat',
    'longitude': 'vendor_lon',
    'status': 'vendor_status',
    'rating': 'vendor_rating'
}, inplace=True)

train_locations.rename(columns={
    'latitude': 'customer_lat',
    'longitude': 'customer_lon'
}, inplace=True)

# --- Merge all training data sources ---
# Start with orders and add details about the customer, vendor, and location
train_merged = train_orders.merge(train_customers, on='customer_id', how='left')
train_merged = train_merged.merge(vendors, left_on='vendor_id', right_on='id', how='left')
train_merged = train_merged.merge(
    train_locations,
    on=['customer_id'],  # Only merge on customer_id
    how='left'
)

# Debug: print columns to check for missing/misnamed columns
print("\nColumns in train_merged:")
print(train_merged.columns.tolist())

# --- Define the specific columns required for training a model ---
# These features are known at the time of prediction and avoid data leakage
required_columns = [
    # --- IDs (for context, not as model features) ---
    'customer_id',
    'vendor_id',
    # 'LOCATION_NUMBER',  # Remove if not present

    # --- Customer Features ---
    'gender',
    'dob',                         # To calculate customer age
    'status',                      # Customer account status
    'created_at_x',                # To calculate customer tenure (from customers table)

    # --- Vendor Features ---
    'vendor_category_en',
    'delivery_charge',
    'serving_distance',
    'is_open',
    'prepration_time',             # Vendor's average preparation time
    'commission',
    'discount_percentage',
    'vendor_status',               # Vendor's account status
    'rank',
    # 'vendor_rating',               # Vendor's overall historical rating (removed)
    'vendor_tag_name',             # Descriptive tags like 'Healthy', 'Pizza'

    # --- Location & Interaction Features ---
    'is_favorite',                 # If the customer has favorited this vendor
    'LOCATION_TYPE',               # e.g., 'Home', 'Work'
    'customer_lat',
    'customer_lon',
    'vendor_lat',
    'vendor_lon',
]

# --- Create the final training dataframe with only the required columns ---
# Keep all rows, even those with missing values
final_training_df = train_merged[required_columns].reset_index(drop=True)

print("\n--- Training Data Ready ---")
print(f"Final training data has {final_training_df.shape[0]} rows and {final_training_df.shape[1]} columns.")
print("Columns:", final_training_df.columns.tolist())
print("\nSample of the final training data:")
print(final_training_df.head())

# Save the final DataFrame to CSV
final_training_df.to_csv('Train/train_merged.csv', index=False)
print("\nMerged training data saved to Train/train_merged.csv")


Loading data...


  train_orders = pd.read_csv('Train/orders.csv')


Preparing and merging data...

Columns in train_merged:
['order_id', 'customer_id', 'item_count', 'grand_total', 'payment_mode', 'promo_code', 'vendor_discount_amount', 'promo_code_discount_percentage', 'is_favorite', 'is_rated', 'vendor_rating_x', 'driver_rating', 'deliverydistance', 'preparationtime', 'delivery_time', 'order_accepted_time', 'driver_accepted_time', 'ready_for_pickup_time', 'picked_up_time', 'delivered_time', 'delivery_date', 'vendor_id', 'created_at_x', 'LOCATION_NUMBER', 'LOCATION_TYPE', 'CID X LOC_NUM X VENDOR', 'gender', 'dob', 'status', 'verified_x', 'language_x', 'created_at_y', 'updated_at_x', 'id', 'authentication_id', 'vendor_lat', 'vendor_lon', 'vendor_category_en', 'vendor_category_id', 'delivery_charge', 'serving_distance', 'is_open', 'OpeningTime', 'OpeningTime2', 'prepration_time', 'commission', 'is_haked_delivering', 'discount_percentage', 'vendor_status', 'verified_y', 'rank', 'language_y', 'vendor_rating_y', 'sunday_from_time1', 'sunday_to_time1', 'sun

In [5]:
def feature_engineer(df):
    """Creates new, predictive features from existing columns."""
    df = df.copy()
    
    if 'dob' in df.columns:
        df['customer_age'] = 2025 - pd.to_numeric(df['dob'], errors='coerce')
        df['customer_age'].fillna(df['customer_age'].median(), inplace=True)
    
    if 'created_at_x' in df.columns:
        try:
            df['customer_tenure_days'] = (datetime(2025, 7, 28) - pd.to_datetime(df['created_at_x'], errors='coerce')).dt.days
            df['customer_tenure_days'].fillna(0, inplace=True)
        except:
            df['customer_tenure_days'] = 0
    
    if 'customer_lat' in df.columns and 'vendor_lat' in df.columns:
        df['distance'] = np.sqrt((df['customer_lat'] - df['vendor_lat'])**2 + (df['customer_lon'] - df['vendor_lon'])**2)
        df['distance'].fillna(df['distance'].median(), inplace=True)
    
    if 'vendor_tag_name' in df.columns:
        df['vendor_tag_count'] = df['vendor_tag_name'].fillna('').astype(str).str.count(',') + 1
        df['vendor_tag_count'].fillna(0, inplace=True)
    
    return df

def prepare_test_set(data_path='Test/'):
    """Loads and prepares the test data by creating all possible recommendations."""
    print("\nPreparing test set...")
    try:
        test_locations = pd.read_csv(f'{data_path}test_locations.csv')
        customers = pd.read_csv('Train/train_customers.csv')
        vendors = pd.read_csv('Train/vendors.csv')
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Creating mock test set from training data...")
        # Create a mock test set from existing data
        customers = pd.read_csv('Train/train_customers.csv')
        vendors = pd.read_csv('Train/vendors.csv')
        locations = pd.read_csv('Train/train_locations.csv')
        
        # Sample some customers and locations for testing
        test_customers = customers.sample(n=min(100, len(customers)), random_state=42)
        test_locations = locations[locations['customer_id'].isin(test_customers['customer_id'])].copy()
        
        test_df = pd.merge(test_locations, test_customers, on='customer_id', how='left')
        test_df['key'] = 1
        vendors['key'] = 1
        test_df = pd.merge(test_df, vendors, on='key').drop('key', axis=1)
        
        test_df.rename(columns={
            'latitude_x': 'customer_lat', 'longitude_x': 'customer_lon', 
            'latitude_y': 'vendor_lat', 'longitude_y': 'vendor_lon', 
            'status_y': 'vendor_status'
        }, inplace=True)
        
        print(f"✅ Mock test set created with {len(test_df)} potential recommendations.")
        return test_df
    
    test_df = pd.merge(test_locations, customers, on='customer_id', how='left')
    test_df['key'] = 1
    vendors['key'] = 1
    test_df = pd.merge(test_df, vendors, on='key').drop('key', axis=1)
    
    test_df.rename(columns={
        'latitude_x': 'customer_lat', 'longitude_x': 'customer_lon', 'latitude_y': 'vendor_lat', 
        'longitude_y': 'vendor_lon', 'status_y': 'vendor_status', 'vendor_rating': 'overall_vendor_rating',
        'created_at_x': 'customer_created_at'
    }, inplace=True)
    
    print(f"✅ Test set created with {len(test_df)} potential recommendations.")
    return test_df

print("Feature engineering and test set functions defined.")

Feature engineering and test set functions defined.


In [6]:
print("--- Creating Modeling Datasets ---")

# Use the merged training data from previous cell
train_df = final_training_df.copy()

# Apply feature engineering to the positive samples
train_df = feature_engineer(train_df)

# Since we don't have test data, create a simple test set from training data
print("\nCreating test set from training data...")
test_df = train_df.sample(n=min(1000, len(train_df)), random_state=42).copy()
test_df = feature_engineer(test_df)

# Create Negative Samples
print("\nCreating negative samples...")
existing_pairs = set(zip(train_df['customer_id'], train_df['vendor_id']))
all_customers = train_customers['customer_id'].unique()
all_vendors = vendors['id'].unique()

negative_samples = []
num_negative_samples = min(len(train_df), 10000)  # Limit to prevent memory issues
attempts = 0
max_attempts = num_negative_samples * 10

while len(negative_samples) < num_negative_samples and attempts < max_attempts:
    c, v = np.random.choice(all_customers), np.random.choice(all_vendors)
    if (c, v) not in existing_pairs:
        negative_samples.append({'customer_id': c, 'vendor_id': v})
    attempts += 1
        
negative_df = pd.DataFrame(negative_samples)

# Merge with customer data
negative_df = pd.merge(negative_df, train_customers, on='customer_id', how='left')

# Merge with vendor data (need to rename columns first)
vendors_for_negative = vendors.copy()
vendors_for_negative.rename(columns={
    'latitude': 'vendor_lat',
    'longitude': 'vendor_lon',
    'status': 'vendor_status',
    'rating': 'vendor_rating'
}, inplace=True)

negative_df = pd.merge(negative_df, vendors_for_negative, left_on='vendor_id', right_on='id', how='left')

# Merge with location data
negative_df = pd.merge(negative_df, train_locations, on='customer_id', how='left')

# Apply feature engineering
negative_df = feature_engineer(negative_df)

# Combine positive (target=1) and negative (target=0) samples
train_df['target'] = 1
negative_df['target'] = 0

# Ensure both DataFrames have the same columns
common_columns = list(set(train_df.columns) & set(negative_df.columns))
train_df = train_df[common_columns]
negative_df = negative_df[common_columns]

train_full = pd.concat([train_df, negative_df], ignore_index=True)
print(f"Full training dataset created with {len(train_full)} rows.")

--- Creating Modeling Datasets ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_age'].fillna(df['customer_age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_tenure_days'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w


Creating test set from training data...

Creating negative samples...
Full training dataset created with 412400 rows.
Full training dataset created with 412400 rows.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_age'].fillna(df['customer_age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_tenure_days'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [7]:
categorical_cols = [col for col in train_full.columns if train_full[col].dtype == 'object']

for col in categorical_cols:
    if col in test_df.columns:
        le = LabelEncoder()
        # Fit on all possible values from both train and test sets
        combined_series = pd.concat([train_full[col].astype(str), test_df[col].astype(str)])
        le.fit(combined_series)
        train_full[col] = le.transform(train_full[col].astype(str))
        test_df[col] = le.transform(test_df[col].astype(str))

print("✅ Categorical features encoded.")

✅ Categorical features encoded.


In [8]:
print("\n--- Training the Model ---")
features_to_drop = [
    'CID X LOC_NUM X VENDOR', 'customer_id', 'vendor_id', 'id', 'target', 'dob',
    'customer_created_at', 'customer_lat', 'customer_lon', 'vendor_lat', 'vendor_lon'
]
features = [col for col in train_full.columns if col not in features_to_drop and col in test_df.columns]

X_train = train_full[features]
y_train = train_full['target']
X_test = test_df[features]

print(f"Training LightGBM model with {len(features)} features...")
lgbm_params = {
    'objective': 'binary', 'metric': 'auc', 'n_estimators': 1000,
    'learning_rate': 0.05, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'bagging_freq': 1, 'verbose': -1, 'n_jobs': -1, 'seed': 42,
}
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

print("✅ Model training complete.")


--- Training the Model ---
Training LightGBM model with 17 features...
✅ Model training complete.


In [None]:
print("\n--- Generating Submission File ---")
test_predictions = model.predict_proba(X_test)[:, 1]
test_df['prediction_prob'] = test_predictions

# For each customer-location combination, find the vendor with the highest prediction probability
test_df.sort_values('prediction_prob', ascending=False, inplace=True)

# Check if LOCATION_NUMBER exists, if not create a default one
if 'LOCATION_NUMBER' not in test_df.columns:
    print("LOCATION_NUMBER not found, creating default location numbers...")
    # Group by customer and assign location numbers
    test_df['LOCATION_NUMBER'] = test_df.groupby('customer_id').cumcount() + 1

submission = test_df.drop_duplicates(subset=['customer_id', 'LOCATION_NUMBER']).copy()

# Format for submission - use available columns
if 'id' in submission.columns:
    vendor_col = 'id'
elif 'vendor_id' in submission.columns:
    vendor_col = 'vendor_id'
else:
    print("Warning: No vendor ID column found, using index")
    submission['vendor_id'] = submission.index
    vendor_col = 'vendor_id'

# Create the correct submission format: CID X LOC_NUM X VENDOR
submission['CID X LOC_NUM X VENDOR'] = (submission['customer_id'].astype(str) + ' X ' + 
                                       submission['LOCATION_NUMBER'].astype(str) + ' X ' + 
                                       submission[vendor_col].astype(str))
submission['target'] = submission[vendor_col]
submission_file = submission[['CID X LOC_NUM X VENDOR', 'target']].copy()

submission_file.to_csv('submission.csv', index=False)
print(f"✅ Submission.csv created successfully with {len(submission_file)} recommendations!")
print("Sample submissions:")
print(submission_file.head())


--- Generating Submission File ---
✅ Submission.csv created successfully with 889 recommendations!
Sample submissions:
       CID X VENDOR  target
73803   15594 X 195     195
332125   3243 X 855     855
125431  10092 X 243     243
213706  13520 X 105     105
64637   12758 X 113     113


In [11]:
print("="*60)
print("🎉 PREDICTIVE RESTAURANT RECOMMENDER MODEL COMPLETE! 🎉")
print("="*60)
print("\n📊 SUMMARY:")
print(f"• Loaded and merged training data from 4 CSV files")
print(f"• Created {len(train_full):,} training samples (positive + negative)")
print(f"• Engineered {len(features)} features for the model")
print(f"• Trained LightGBM classifier with AUC optimization")
print(f"• Generated {len(submission_file)} restaurant recommendations")
print("\n📁 FILES CREATED:")
print("• Train/train_merged.csv - Complete merged training dataset")
print("• submission.csv - Final restaurant recommendations")
print("\n✅ The model is ready to predict restaurant preferences!")
print("="*60)

🎉 PREDICTIVE RESTAURANT RECOMMENDER MODEL COMPLETE! 🎉

📊 SUMMARY:
• Loaded and merged training data from 4 CSV files
• Created 412,400 training samples (positive + negative)
• Engineered 17 features for the model
• Trained LightGBM classifier with AUC optimization
• Generated 889 restaurant recommendations

📁 FILES CREATED:
• Train/train_merged.csv - Complete merged training dataset
• submission.csv - Final restaurant recommendations

✅ The model is ready to predict restaurant preferences!
