<a href="https://colab.research.google.com/github/NiT-04/Predictive-Restaurant-Recommendation/blob/main/PredictiveRestaurantRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')


In [3]:
train_customers = pd.read_csv("/content/Restaurant_dataset/train_customers.csv")
train_locations = pd.read_csv("/content/Restaurant_dataset/train_locations.csv")
train_orders = pd.read_csv("/content/Restaurant_dataset/orders.csv")
vendors = pd.read_csv("/content/Restaurant_dataset/vendors.csv")
test_customers = pd.read_csv("/content/Restaurant_dataset/test_customers.csv")
test_locations = pd.read_csv("/content/Restaurant_dataset/test_locations.csv")

In [6]:
train_customers.head()

Unnamed: 0,customer_id,gender,dob,status,verified,language,created_at,updated_at,age
0,TCHWPBT,Male,,1,1,EN,2/7/2023 19:16,2/7/2023 19:16,32
1,ZGFSYCZ,Male,,1,1,EN,2/9/2023 12:04,2/9/2023 12:04,32
2,S2ALZFL,Male,,0,1,EN,3/14/2023 18:31,3/14/2023 18:31,32
3,952DBJQ,Male,,1,1,EN,3/15/2023 19:47,3/15/2023 19:47,32
4,1IX6FXS,Male,,1,1,EN,3/15/2023 19:57,3/15/2023 19:57,32


In [8]:
train_customers.shape

(34674, 9)

In [32]:
train_customers[train_customers['dob'].notnull()].head()

Unnamed: 0,customer_id,gender,dob,status,verified,language,created_at,updated_at,age
6,EZTXK46,7,1957.0,1,1,0,3/16/2023 21:46,3/21/2025 21:01,68
14,9VJZUWB,2,1970.0,1,1,0,4/30/2023 0:57,11/23/2024 18:34,55
19,EIA3K80,7,1985.0,1,1,0,5/3/2023 19:32,5/3/2023 19:32,40
21,XG4SHUG,2,1900.0,1,1,0,5/4/2023 6:03,5/4/2023 6:03,125
22,U3SMK7L,7,2002.0,1,1,0,5/4/2023 15:01,2/22/2025 16:38,23


In [33]:
train_customers['age'] = 2025 - pd.to_numeric(train_customers['dob'], errors='coerce')
train_customers['age'] = train_customers['age'].fillna(train_customers['age'].median()).astype(int)

In [4]:
vendors.head()

Unnamed: 0,id,authentication_id,latitude,longitude,vendor_category_en,vendor_category_id,delivery_charge,serving_distance,is_open,OpeningTime,...,open_close_flags,vendor_tag,vendor_tag_name,one_click_vendor,country_id,city_id,created_at,updated_at,device_type,display_orders
0,4,118597,-0.588596,0.754434,Restaurants,2,0.0,6,1,11:00AM-11:30PM,...,1,2458912212241623,"Arabic,Breakfast,Burgers,Desserts,Free Deliver...",Y,1,1,1/30/2023 14:42,4/7/2025 15:12,3,1
1,13,118608,-0.471654,0.74447,Restaurants,2,0.7,5,1,08:30AM-10:30PM,...,1,44151342715241628,"Breakfast,Cakes,Crepes,Italian,Pasta,Pizzas,Sa...",Y,1,1,5/3/2023 12:32,4/5/2025 20:46,3,1
2,20,118616,-0.407527,0.643681,Restaurants,2,0.0,8,1,08:00AM-10:45PM,...,1,489110,"Breakfast,Desserts,Free Delivery,Indian",Y,1,1,5/4/2023 22:28,4/7/2025 16:35,3,1
3,23,118619,-0.585385,0.753811,Restaurants,2,0.0,5,1,10:59AM-10:30PM,...,1,583024,"Burgers,Desserts,Fries,Salads",Y,1,1,5/6/2023 19:20,4/2/2025 0:56,3,1
4,28,118624,0.480602,0.55285,Restaurants,2,0.7,15,1,11:00AM-11:45PM,...,1,5,Burgers,Y,1,1,5/17/2023 22:12,4/5/2025 15:57,3,1


In [9]:
for col in ['gender', 'language', 'status', 'verified']:
    train_customers[col] = train_customers[col].fillna('unknown')
    train_customers[col] = LabelEncoder().fit_transform(train_customers[col])


In [10]:
train_locations['location_key'] = train_locations['customer_id'] + '_' + train_locations['location_number'].astype(str)


In [11]:
vendor_feats = train_orders.groupby('vendor_id').agg({
    'vendor_rating': 'mean',
    'driver_rating': 'mean',
    'grand_total': ['mean', 'count'],
    'deliverydistance': 'mean'
}).reset_index()

vendor_feats.columns = ['vendor_id', 'vendor_rating', 'driver_rating', 'avg_grand_total', 'order_count', 'avg_distance']


In [12]:
positive_samples = train_orders[['customer_id', 'vendor_id', 'LOCATION_NUMBER']].drop_duplicates()
positive_samples['target'] = 1


In [13]:
all_vendors = vendors['id'].unique()
neg_samples = []

for _, row in train_locations.iterrows():
    cid = row['customer_id']
    loc_num = row['location_number']
    used_vendors = positive_samples[(positive_samples['customer_id'] == cid) &
                                    (positive_samples['LOCATION_NUMBER'] == loc_num)]['vendor_id'].values

    negative = np.setdiff1d(all_vendors, used_vendors)
    neg_sample = np.random.choice(negative, size=min(5, len(negative)), replace=False)

    for v in neg_sample:
        neg_samples.append([cid, loc_num, v, 0])

neg_df = pd.DataFrame(neg_samples, columns=['customer_id', 'LOCATION_NUMBER', 'vendor_id', 'target'])


In [35]:
neg_df

Unnamed: 0,customer_id,location_number,vendor_id,target
0,02SFNJH,0,110,0
1,02SFNJH,0,4,0
2,02SFNJH,0,398,0
3,02SFNJH,0,159,0
4,02SFNJH,0,216,0
...,...,...,...,...
297510,HWELAU8,1,203,0
297511,HWELAU8,1,160,0
297512,HWELAU8,1,90,0
297513,HWELAU8,1,419,0


In [14]:
positive_samples.rename(columns={'LOCATION_NUMBER': 'location_number'}, inplace=True)
neg_df.rename(columns={'LOCATION_NUMBER': 'location_number'}, inplace=True)

data = pd.concat([positive_samples, neg_df], axis=0)

data = data.merge(train_customers, on='customer_id', how='left')
data = data.merge(train_locations, on=['customer_id', 'location_number'], how='left')
data = data.merge(vendor_feats, on='vendor_id', how='left')
data = data.merge(vendors.rename(columns={'id': 'vendor_id'}), on='vendor_id', how='left')


In [36]:
data

Unnamed: 0,customer_id,vendor_id,location_number,target,gender,dob,status_x,verified_x,language_x,created_at_x,...,open_close_flags,vendor_tag,vendor_tag_name,one_click_vendor,country_id,city_id,created_at_y,updated_at_y,device_type,display_orders
0,KL09J9N,84,0,1,7.0,,1.0,1.0,0.0,9/23/2023 0:22,...,1,5304823,"Burgers,Fries,Kids meal,Shawarma",Y,1,1,9/16/2023 19:37,4/7/2025 21:08,3,1
1,H5LGGFX,78,0,1,7.0,,1.0,1.0,0.0,3/21/2024 17:24,...,1,153442827248,"Pizzas,Italian,Breakfast,Soups,Pasta,Salads,De...",Y,1,1,8/26/2023 21:47,3/31/2025 22:16,3,1
2,CYLZB6T,4,0,1,7.0,,1.0,1.0,0.0,5/24/2023 16:23,...,1,2458912212241623,"Arabic,Breakfast,Burgers,Desserts,Free Deliver...",Y,1,1,1/30/2023 14:42,4/7/2025 15:12,3,1
3,4YKUKYN,157,0,1,10.0,,1.0,1.0,0.0,8/10/2024 19:43,...,1,31810336721,"Biryani,Desserts,Indian,Rice,Thali,Vegetarian",Y,1,1,1/19/2024 14:01,4/7/2025 20:03,3,1
4,WDNU30K,160,0,1,10.0,,1.0,1.0,0.0,6/20/2024 21:42,...,1,154816,"American,Burgers,Kids meal,Sandwiches",Y,1,1,1/28/2024 20:37,4/3/2025 22:36,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378736,HWELAU8,203,1,0,7.0,,1.0,1.0,0.0,10/7/2023 21:28,...,1,1162930,"American,Sandwiches,Hot Dogs,Fries",Y,1,1,3/16/2024 19:11,4/5/2025 17:48,3,1
378737,HWELAU8,160,1,0,7.0,,1.0,1.0,0.0,10/7/2023 21:28,...,1,154816,"American,Burgers,Kids meal,Sandwiches",Y,1,1,1/28/2024 20:37,4/3/2025 22:36,3,1
378738,HWELAU8,90,1,0,7.0,,1.0,1.0,0.0,10/7/2023 21:28,...,1,1585227,"American,Burgers,Desserts,Mojitos ,Pasta",Y,1,1,10/4/2023 19:41,4/5/2025 15:46,3,1
378739,HWELAU8,419,1,0,7.0,,1.0,1.0,0.0,10/7/2023 21:28,...,1,27116,"Arabic,Kushari,Sandwiches",Y,1,1,8/27/2024 15:53,4/7/2025 18:40,3,1


In [15]:
feature_cols = ['age', 'gender','driver_rating', 'avg_grand_total','order_count', 'avg_distance', 'latitude_x', 'longitude_x','latitude_y', 'longitude_y']

X = data[feature_cols]
y = data['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier()
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])


y_pred = model.predict_proba(X_val)[:, 1]
print("AUC Score:", roc_auc_score(y_val, y_pred))

[LightGBM] [Info] Number of positive: 64212, number of negative: 238780
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1184
[LightGBM] [Info] Number of data points in the train set: 302992, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211926 -> initscore=-1.313353
[LightGBM] [Info] Start training from score -1.313353
AUC Score: 0.8965019121209354


In [37]:
test_customers['age'] = 2025 - pd.to_numeric(test_customers['dob'], errors='coerce')
test_customers['age'] = test_customers['age'].fillna(test_customers['age'].median()).astype(int)

In [38]:
test_customers['gender'] = test_customers['gender'].fillna('unknown')
test_customers['gender'] = LabelEncoder().fit_transform(test_customers['gender'])

In [39]:
test_df = []

for _, row in test_locations.iterrows():
    cid = row['customer_id']
    loc_num = row['location_number']
    for v in all_vendors:
        test_df.append([cid, loc_num, v])

test_df = pd.DataFrame(test_df, columns=['customer_id', 'location_number', 'vendor_id'])


test_df = test_df.merge(test_customers, on='customer_id', how='left')
test_df = test_df.merge(test_locations, on=['customer_id', 'location_number'], how='left')
test_df = test_df.merge(vendor_feats, on='vendor_id', how='left')
test_df = test_df.merge(vendors.rename(columns={'id': 'vendor_id'}), on='vendor_id', how='left')

In [34]:
test_df

Unnamed: 0,customer_id,location_number,vendor_id,gender,dob,status_x,verified_x,language_x,created_at_x,updated_at_x,...,vendor_tag_name,one_click_vendor,country_id,city_id,created_at_y,updated_at_y,device_type,display_orders,target,CID X LOC_NUM X VENDOR
0,Z59FTQD,0,4,6.0,,1.0,1.0,,2/9/2025 21:54,2/9/2025 21:54,...,"Arabic,Breakfast,Burgers,Desserts,Free Deliver...",Y,1,1,1/30/2023 14:42,4/7/2025 15:12,3,1,0.001255,Z59FTQD X 0 X 4
1,Z59FTQD,0,13,6.0,,1.0,1.0,,2/9/2025 21:54,2/9/2025 21:54,...,"Breakfast,Cakes,Crepes,Italian,Pasta,Pizzas,Sa...",Y,1,1,5/3/2023 12:32,4/5/2025 20:46,3,1,0.000939,Z59FTQD X 0 X 13
2,Z59FTQD,0,20,6.0,,1.0,1.0,,2/9/2025 21:54,2/9/2025 21:54,...,"Breakfast,Desserts,Free Delivery,Indian",Y,1,1,5/4/2023 22:28,4/7/2025 16:35,3,1,0.001356,Z59FTQD X 0 X 20
3,Z59FTQD,0,23,6.0,,1.0,1.0,,2/9/2025 21:54,2/9/2025 21:54,...,"Burgers,Desserts,Fries,Salads",Y,1,1,5/6/2023 19:20,4/2/2025 0:56,3,1,0.000926,Z59FTQD X 0 X 23
4,Z59FTQD,0,28,6.0,,1.0,1.0,,2/9/2025 21:54,2/9/2025 21:54,...,Burgers,Y,1,1,5/17/2023 22:12,4/5/2025 15:57,3,1,0.002316,Z59FTQD X 0 X 28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1673595,3O8LSR3,0,849,3.0,,1.0,1.0,EN,10/13/2023 1:44,10/13/2023 1:44,...,"American,Breakfast,Burgers,Cafe,Desserts,Free ...",Y,1,1,12/21/2024 12:47,4/7/2025 20:01,3,1,0.000911,3O8LSR3 X 0 X 849
1673596,3O8LSR3,0,855,3.0,,1.0,1.0,EN,10/13/2023 1:44,10/13/2023 1:44,...,"American,Burgers,Desserts,Free Delivery,Fries,...",Y,1,1,12/21/2024 13:02,4/7/2025 1:50,3,1,0.147209,3O8LSR3 X 0 X 855
1673597,3O8LSR3,0,856,3.0,,1.0,1.0,EN,10/13/2023 1:44,10/13/2023 1:44,...,"American,Breakfast,Burgers,Cafe,Desserts,Free ...",Y,1,1,12/21/2024 13:04,4/6/2025 23:53,3,1,0.072578,3O8LSR3 X 0 X 856
1673598,3O8LSR3,0,858,3.0,,1.0,1.0,EN,10/13/2023 1:44,10/13/2023 1:44,...,"American,Breakfast,Burgers,Cafe,Desserts,Free ...",Y,1,1,12/21/2024 13:12,4/7/2025 14:26,3,1,0.168891,3O8LSR3 X 0 X 858


In [20]:

feature_cols = [col for col in ['age', 'gender','vendor_rating', 'driver_rating', 'avg_grand_total','order_count', 'avg_distance','latitude_x', 'longitude_x', 'latitude_y', 'longitude_y'] if col in test_df.columns]


X_test = test_df[feature_cols].copy()
X_test = X_test.fillna(0)


for col in X_test.select_dtypes(include='object').columns:
    X_test[col] = LabelEncoder().fit_transform(X_test[col].astype(str))


test_df['target'] = model.predict_proba(X_test)[:, 1]


In [21]:
test_df['CID X LOC_NUM X VENDOR'] = test_df['customer_id'] + ' X ' + test_df['location_number'].astype(str) + ' X ' + test_df['vendor_id'].astype(str)
submission = test_df[['CID X LOC_NUM X VENDOR', 'target']]


submission['target'] = submission['target'].apply(lambda x: 1 if x > 0.5 else 0)


submission.to_csv('final_submission.csv', index=False)


In [22]:
summmm = pd.read_csv("/content/final_submission.csv")

In [23]:
summmm.head()

Unnamed: 0,CID X LOC_NUM X VENDOR,target
0,Z59FTQD X 0 X 4,0
1,Z59FTQD X 0 X 13,0
2,Z59FTQD X 0 X 20,0
3,Z59FTQD X 0 X 23,0
4,Z59FTQD X 0 X 28,0


In [29]:
submission[submission['target'] == 1].head(5)

Unnamed: 0,CID X LOC_NUM X VENDOR,target
113,0JP29SK X 0 X 78,1
127,0JP29SK X 0 X 113,1
213,0JP29SK X 1 X 78,1
217,0JP29SK X 1 X 83,1
224,0JP29SK X 1 X 105,1
