## Data Preprocessing 

In [1]:
#Required libraries 
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
center_info = pd.read_csv("fulfilment_center_info.csv")
meal_info = pd.read_csv("meal_info.csv")

# Merge data for both train and test
train_data = train_data.merge(center_info, on="center_id")
test_data = test_data.merge(center_info, on="center_id")
train_data = train_data.merge(meal_info, on="meal_id")
test_data = test_data.merge(meal_info, on="meal_id")

# Handle missing values if any
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

# Encode categorical variables using Label Encoding
label_encoder = LabelEncoder()
train_data["center_type"] = label_encoder.fit_transform(train_data["center_type"])
test_data["center_type"] = label_encoder.transform(test_data["center_type"])
train_data["category"] = label_encoder.fit_transform(train_data["category"])
test_data["category"] = label_encoder.transform(test_data["category"])
train_data["cuisine"] = label_encoder.fit_transform(train_data["cuisine"])
test_data["cuisine"] = label_encoder.transform(test_data["cuisine"])


## Feature Engineering 

In [2]:

# Calculate discount percentage
train_data["discount_percent"] = ((train_data["base_price"] - train_data["checkout_price"]) / train_data["base_price"]) * 100
test_data["discount_percent"] = ((test_data["base_price"] - test_data["checkout_price"]) / test_data["base_price"]) * 100

train_data["discount_flag"] = (train_data["checkout_price"] < train_data["base_price"]).astype(int)
test_data["discount_flag"] = (test_data["checkout_price"] < test_data["base_price"]).astype(int)

train_data["discount_amount"] = train_data["base_price"] - train_data["checkout_price"]
test_data["discount_amount"] = test_data["base_price"] - test_data["checkout_price"]

train_data["discount_duration"] = train_data.groupby(["meal_id"])["discount_flag"].cumsum()
test_data["discount_duration"] = test_data.groupby(["meal_id"])["discount_flag"].cumsum()



# # Train a model

In [3]:
#Required Libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import numpy as np

# Define features and target variable
features = ["week", "center_id", "meal_id", "checkout_price", "base_price", 
            "emailer_for_promotion", "homepage_featured", "center_type", 
            "op_area", "category", "cuisine", "discount_percent"]
                
X = train_data[features]
y = train_data["num_orders"]

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate RMSLE
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
print(f"RMSLE on validation set: {rmsle}")


RMSLE on validation set: 0.4961848877140298


## Prediction 

In [4]:

# Define test features
X_test = test_data[features]

# Make predictions for Weeks 146-155
test_predictions = model.predict(X_test)

# Ensure predictions are non-negative (since you can't have negative orders)
test_predictions = np.maximum(test_predictions, 0)


## Results 

In [5]:

# Create a DataFrame with ID and predicted number of orders
submission_df = pd.DataFrame({"id": test_data["id"], "num_orders": test_predictions})

# Save the submission file
submission_df.to_csv("Submission.csv", index=False)
