In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('food_orders_new_delhi.csv')

In [18]:
import pandas as pd

# Load the dataset
df = pd.read_csv('https://statso.io/wp-content/uploads/2024/02/food_orders_new_delhi.csv')

# Inspect the first few rows
print("First few rows of the dataset:")
print(df.head())

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Handle missing values
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Fill missing values in numeric columns with mean
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Fill missing values in categorical columns with mode
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify that there are no more missing values
print("\nMissing values after cleaning:")
print(df.isnull().sum())

# Drop irrelevant columns if necessary
# Example:df.drop(['irrelevant_column'], axis=1, inplace=True)

# Convert data types if necessary
# Example: df['column_name'] = df['column_name'].astype('int')

# Print cleaned dataframe head to verify
print("\nCleaned dataset head:")
print(df.head())

First few rows of the dataset:
   Order ID Customer ID Restaurant ID  Order Date and Time  \
0         1       C8270         R2924  2024-02-01 01:11:52   
1         2       C1860         R2054  2024-02-02 22:11:04   
2         3       C6390         R2870  2024-01-31 05:54:35   
3         4       C6191         R2642  2024-01-16 22:52:49   
4         5       C6734         R2799  2024-01-29 01:19:30   

  Delivery Date and Time  Order Value  Delivery Fee    Payment Method  \
0    2024-02-01 02:39:52         1914             0       Credit Card   
1    2024-02-02 22:46:04          986            40    Digital Wallet   
2    2024-01-31 06:52:35          937            30  Cash on Delivery   
3    2024-01-16 23:38:49         1463            50  Cash on Delivery   
4    2024-01-29 02:48:30         1992            30  Cash on Delivery   

  Discounts and Offers  Commission Fee  Payment Processing Fee  \
0            5% on App             150                      47   
1                  10%   

In [24]:
# Count occurrences of each delivery method
delivery_method_counts = df['Payment Method'].value_counts()

# Most common delivery method
most_common_delivery_method = delivery_method_counts.idxmax()
proportion_most_common = delivery_method_counts.max() / len(df)

print("Most common delivery method: " + str(most_common_delivery_method))
print("Proportion: " + str(proportion_most_common))

Most common delivery method: Cash on Delivery
Proportion: 0.357


In [26]:
from statsmodels.stats.proportion import proportion_confint

# Number of successes (most common method count)
successes = delivery_method_counts.max()
total = len(df)

# Confidence interval
lower_bound, upper_bound = proportion_confint(successes, total, alpha=0.05, method='normal')

print("95% Confidence Interval: [" + str(lower_bound) + ", " + str(upper_bound) + "]")

95% Confidence Interval: [0.3273047021609623, 0.38669529783903767]


In [28]:
average_commission_fee = df['Commission Fee'].mean()
median_commission_fee = df['Commission Fee'].median()

print("Average Commission Fee: " + str(average_commission_fee))
print("Median Commission Fee: " + str(median_commission_fee))

Average Commission Fee: 126.99
Median Commission Fee: 127.0


In [30]:
average_order_value = df['Order Value'].mean()

print("Average Order Value: " + str(average_order_value))

Average Order Value: 1053.969


In [32]:
import scipy.stats as stats

mean_commission_fee = df['Commission Fee'].mean()
std_commission_fee = df['Commission Fee'].std()

# Probability more than 120
prob_more_than_120 = 1 - stats.norm.cdf(120, loc=mean_commission_fee, scale=std_commission_fee)

# Probability less than 143
prob_less_than_143 = stats.norm.cdf(143, loc=mean_commission_fee, scale=std_commission_fee)

# Probability between 86 and 133
prob_between_86_and_133 = stats.norm.cdf(133, loc=mean_commission_fee, scale=std_commission_fee) - stats.norm.cdf(86, loc=mean_commission_fee, scale=std_commission_fee)

print("P(commission > 120): " + str(prob_more_than_120))
print("P(commission < 143): " + str(prob_less_than_143))
print("P(86 < commission < 133): " + str(prob_between_86_and_133))

P(commission > 120): 0.5644716352141099
P(commission < 143): 0.6449686242052821
P(86 < commission < 133): 0.3849064343549144


In [34]:
credit_card_orders = df[df['Payment Method'] == 'Credit Card']
average_delivery_time_credit_card = credit_card_orders['Delivery Date and Time'].apply(lambda x: pd.to_datetime(x) - pd.to_datetime(credit_card_orders['Order Date and Time'].iloc[0])).mean().total_seconds() / 3600

print("Average Delivery Time (Credit Card): " + str(average_delivery_time_credit_card))

Average Delivery Time (Credit Card): -259.1780110452778


In [36]:
cod_orders = df[df['Payment Method'] == 'Cash on Delivery']
cod_orders_sorted = cod_orders.sort_values(by='Order Value', ascending=False)
top_10_percent_cod = cod_orders_sorted.head(int(len(cod_orders) * 0.1))
lowest_order_value_top_10_percent = top_10_percent_cod['Order Value'].min()

print("Lowest Order Value (Top 10% COD): " + str(lowest_order_value_top_10_percent))

Lowest Order Value (Top 10% COD): 1810


In [38]:
digital_wallet_orders = df[df['Payment Method'] == 'Digital Wallet']
digital_wallet_orders_sorted = digital_wallet_orders.sort_values(by='Order Value')
bottom_60_percent_digital_wallet = digital_wallet_orders_sorted.head(int(len(digital_wallet_orders) * 0.6))
highest_order_value_bottom_60_percent = bottom_60_percent_digital_wallet['Order Value'].max()

print("Highest Order Value (Bottom 60% Digital Wallet): " + str(highest_order_value_bottom_60_percent))

Highest Order Value (Bottom 60% Digital Wallet): 1186


In [40]:
mean_digital_wallet = digital_wallet_orders['Order Value'].mean()
mean_cash_on_delivery = cod_orders['Order Value'].mean()
difference = mean_digital_wallet - mean_cash_on_delivery

print("Difference in Mean Values: " + str(difference))

Difference in Mean Values: -9.976657329598538


In [42]:
from scipy.stats import ttest_1samp

# Null hypothesis: mean delivery time <= 53 minutes
# Alternative hypothesis: mean delivery time > 53 minutes

# Calculate delivery times in minutes
df['Delivery Time'] = (pd.to_datetime(df['Delivery Date and Time']) - pd.to_datetime(df['Order Date and Time'])).dt.total_seconds() / 60

# Perform one-sample t-test
t_statistic, p_value = ttest_1samp(df['Delivery Time'], 53, alternative='greater')

print("T-statistic: " + str(t_statistic))
print("P-value: " + str(p_value))

if p_value < 0.05:
    print("Reject null hypothesis, delivery time is significantly greater than 53 minutes.")
else:
    print("Fail to reject null hypothesis, no significant evidence.")

T-statistic: 25.010005535385886
P-value: 6.854686967539162e-108
Reject null hypothesis, delivery time is significantly greater than 53 minutes.


In [44]:
percentile_25 = df['Delivery Time'].quantile(0.25)
percentile_75 = df['Delivery Time'].quantile(0.75)
range_middle_50_percent = percentile_75 - percentile_25

print("Range of Middle 50% Delivery Times: " + str(range_middle_50_percent))

Range of Middle 50% Delivery Times: 46.0
