In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your data
file_path = r"C:\Users\kunal\Downloads\Food_Delivery_Time_Prediction.csv"  # Update this path if local or use gdown to download from Google Drive
df = pd.read_csv(file_path)

# Inspect the data
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Order_ID                    200 non-null    object 
 1   Customer_Location           200 non-null    object 
 2   Restaurant_Location         200 non-null    object 
 3   Distance                    200 non-null    float64
 4   Weather_Conditions          200 non-null    object 
 5   Traffic_Conditions          200 non-null    object 
 6   Delivery_Person_Experience  200 non-null    int64  
 7   Order_Priority              200 non-null    object 
 8   Order_Time                  200 non-null    object 
 9   Vehicle_Type                200 non-null    object 
 10  Restaurant_Rating           200 non-null    float64
 11  Customer_Rating             200 non-null    float64
 12  Delivery_Time               200 non-null    float64
 13  Order_Cost                  200 non

In [25]:
# Impute missing values (example: fill numerical with mean, categorical with mode)
for col in df.select_dtypes(include='number').columns:
    df[col].fillna(df[col].mean(), inplace=True)
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [26]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
categorical_cols = ['weather', 'traffic', 'vehicle_type']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


KeyError: 'weather'

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
continuous_cols = ['distance', 'delivery_time']
df[continuous_cols] = scaler.fit_transform(df[continuous_cols])


In [None]:
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius (km)
    lat1_rad, lon1_rad, lat2_rad, lon2_rad = np.radians([lat1, lon1, lat2, lon2])
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

df['geo_distance'] = haversine(
    df['restaurant_lat'], df['restaurant_lon'],
    df['customer_lat'], df['customer_lon']
)


In [None]:
# Suppose delivery_time > threshold is delayed, <= threshold is fast
threshold = df['delivery_time'].median()
df['delivery_status'] = (df['delivery_time'] > threshold).astype(int)


In [None]:
from sklearn.model_selection import train_test_split

features = ['distance', 'geo_distance', 'weather', 'traffic', 'vehicle_type']  # update as necessary
X = df[features]
y = df['delivery_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_gnb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gnb))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(3, 16))}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1')
grid.fit(X_train, y_train)
print("Best K:", grid.best_params_)

knn_best = grid.best_estimator_
y_pred_knn = knn_best.predict(X_test)
print(classification_report(y_test, y_pred_knn))
print(confusion_matrix(y_test, y_pred_knn))


In [None]:
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
}
dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(dt, param_grid, cv=5, scoring='f1')
grid_dt.fit(X_train, y_train)
print("Best Params:", grid_dt.best_params_)

dt_best = grid_dt.best_estimator_
y_pred_dt = dt_best.predict(X_test)
print(classification_report(y_test, y_pred_dt))
print(confusion_matrix(y_test, y_pred_dt))


In [None]:
results = {
    'Model': ['Naive Bayes', 'KNN', 'Decision Tree'],
    'Accuracy': [accuracy_score(y_test, y_pred_gnb), accuracy_score(y_test, y_pred_knn), accuracy_score(y_test, y_pred_dt)],
    'Precision': [precision_score(y_test, y_pred_gnb), precision_score(y_test, y_pred_knn), precision_score(y_test, y_pred_dt)],
    'Recall': [recall_score(y_test, y_pred_gnb), recall_score(y_test, y_pred_knn), recall_score(y_test, y_pred_dt)],
    'F1 Score': [f1_score(y_test, y_pred_gnb), f1_score(y_test, y_pred_knn), f1_score(y_test, y_pred_dt)]
}
comparison_df = pd.DataFrame(results)
print(comparison_df)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_confusion_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

plot_confusion_matrix(y_test, y_pred_gnb, "Naive Bayes")
plot_confusion_matrix(y_test, y_pred_knn, "KNN")
plot_confusion_matrix(y_test, y_pred_dt, "Decision Tree")

# ROC curve for each
plt.figure()
for model, y_pred_prob, name in [
    (gnb, gnb.predict_proba(X_test)[:,1], 'Naive Bayes'),
    (knn_best, knn_best.predict_proba(X_test)[:,1], 'KNN'),
    (dt_best, dt_best.predict_proba(X_test)[:,1], 'Decision Tree')
]:
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc(fpr, tpr):.2f})')
plt.plot([0,1], [0,1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend()
plt.show()


In [None]:
3. Actionable Insights
Model Strengths and Weaknesses:

Naive Bayes: Fast and easy, but may struggle if feature independence is violated.

KNN: Simple, but computationally expensive for large datasets.

Decision Tree: Highly interpretable, but can overfit (hence, pruning is important).

Recommendation:

If interpretability is critical: Choose Decision Tree.

If speed and scalability are important: Choose Naive Bayes.

If slightly higher accuracy is desired with proper tuning: Choose KNN.

Final Deliverables
Jupyter Notebook: Includes all code, analysis, and visualizations.

Embedded Images: Confusion matrices and ROC curves in the notebook.

Final Report: Summarizing findings, model performances, and recommendations (can be written as Markdown or in a .docx, .pdf, etc.).