In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer


In [50]:
df = pd.read_csv(r"C:\Users\kunal\Downloads\Food_Delivery_Time_Prediction.csv")

In [51]:
print(df.isnull().sum())


Order_ID                      0
Customer_Location             0
Restaurant_Location           0
Distance                      0
Weather_Conditions            0
Traffic_Conditions            0
Delivery_Person_Experience    0
Order_Priority                0
Order_Time                    0
Vehicle_Type                  0
Restaurant_Rating             0
Customer_Rating               0
Delivery_Time                 0
Order_Cost                    0
Tip_Amount                    0
dtype: int64


In [52]:
# Handle missing values
# For numerical columns, fill missing with median
num_cols = ['Distance', 'Delivery_Time', 'Order_Cost']
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])


In [53]:
# For categorical columns, fill missing with mode
cat_cols = ['Weather_Conditions', 'Traffic_Conditions', 'Vehicle_Type']
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])


In [54]:
# Encode categorical variables using one-hot encoding
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Standardize numerical features
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])



In [55]:
import matplotlib.pyplot as plt
import seaborn as sns


In [56]:
print(df.describe())


           Distance  Delivery_Person_Experience  Restaurant_Rating  \
count  2.000000e+02                  200.000000         200.000000   
mean  -1.643130e-16                    5.250000           3.738500   
std    1.002509e+00                    2.745027           0.703021   
min   -1.608593e+00                    1.000000           2.500000   
25%   -7.924313e-01                    3.000000           3.200000   
50%   -1.806765e-01                    5.000000           3.800000   
75%    7.325599e-01                    8.000000           4.300000   
max    1.963762e+00                   10.000000           5.000000   

       Customer_Rating  Delivery_Time    Order_Cost  Tip_Amount  
count       200.000000   2.000000e+02  2.000000e+02  200.000000  
mean          3.686500   3.730349e-16 -2.042810e-16   46.616650  
std           0.697063   1.002509e+00  1.002509e+00   29.361706  
min           2.600000  -1.857269e+00 -1.688954e+00    1.240000  
25%           3.100000  -7.896704e-01 -

In [57]:
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Matrix')
plt.show()

ValueError: could not convert string to float: 'ORD0001'

<Figure size 1000x800 with 0 Axes>

In [None]:
plt.figure(figsize=(6,4))
sns.boxplot(x=df['Delivery_Time'])
plt.title('Delivery Time Outliers')
plt.show()


In [None]:
# If lat/lon available, compute actual distance using Haversine formula
import math

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of earth in km
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2)**2 + np.cos(phi1)*np.cos(phi2)*np.sin(dlambda/2)**2
    return 2*R*np.arcsin(np.sqrt(a))




In [None]:
df['Order_Hour'] = pd.to_datetime(df['Order_Time']).dt.hour
df['Rush_Hour'] = df['Order_Hour'].apply(lambda x: 1 if 7 <= x <= 10 or 17 <= x <= 20 else 0)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define features and target
X = df.drop(['Delivery_Time'], axis=1)
y = df['Delivery_Time']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit linear regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = lr_model.predict(X_test)
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'R² score: {r2_score(y_test, y_pred)}')



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

# Create binary target: Fast (Delivery_Time below median), Delayed otherwise
median_time = df['Delivery_Time'].median()
df['Delivery_Status'] = (df['Delivery_Time'] > median_time).astype(int)  # 1 = Delayed, 0 = Fast

X_cls = df.drop(['Delivery_Time', 'Delivery_Status'], axis=1)
y_cls = df['Delivery_Status']

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_cls, y_cls, test_size=0.2, random_state=42)

log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train_cls, y_train_cls)

y_pred_cls = log_model.predict(X_test_cls)

# Evaluation
print(f'Accuracy: {accuracy_score(y_test_cls, y_pred_cls)}')
print(f'Precision: {precision_score(y_test_cls, y_pred_cls)}')
print(f'Recall: {recall_score(y_test_cls, y_pred_cls)}')
print(f'F1 Score: {f1_score(y_test_cls, y_pred_cls)}')

# Confusion matrix
cm = confusion_matrix(y_test_cls, y_pred_cls)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Fast', 'Delayed'], yticklabels=['Fast', 'Delayed'])
plt.title('Confusion Matrix - Logistic Regression')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve
y_prob = log_model.predict_proba(X_test_cls)[:,1]
fpr, tpr, thresholds = roc_curve(y_test_cls, y_prob)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0,1],[0,1],'--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic Regression')
plt.legend()
plt.show()

