<a href="https://colab.research.google.com/github/Gajabinkar-venkatesh/OCTANET_FEBRUARY/blob/main/assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, log_loss
from xgboost import XGBClassifier

In [3]:
file_path = '/content/dining_info (1).xlsx'  # Update this path if needed
data = pd.ExcelFile(file_path)
df = data.parse('Sheet1')

In [4]:
df['order_time'] = pd.to_datetime(df['order_time'])

In [17]:
# feature
features_df = df[df['order_time'] < '2024-01-01']
train_df = df[(df['order_time'] >= '2024-01-01') & (df['order_time'] <= '2024-10-01')]
test_df = df[df['order_time'] > '2024-10-01']

In [6]:
total_orders_per_customer = features_df.groupby('customer_id')['transaction_id'].count().rename('total_orders_per_customer')
avg_spend_per_customer = features_df.groupby('customer_id')['price_for_1'].mean().rename('avg_spend_per_customer')
total_qty_per_customer = features_df.groupby('customer_id')['Qty'].sum().rename('total_qty_per_customer')
customer_features = pd.concat([total_orders_per_customer, avg_spend_per_customer, total_qty_per_customer], axis=1)

In [7]:
avg_price_per_cuisine = features_df.groupby('Preferred Cusine')['price_for_1'].mean().rename('avg_price_per_cuisine')
total_orders_per_cuisine = features_df.groupby('Preferred Cusine')['transaction_id'].count().rename('total_orders_per_cuisine')
cuisine_features = pd.concat([avg_price_per_cuisine, total_orders_per_cuisine], axis=1)


In [8]:
train_df = train_df.merge(customer_features, on='customer_id', how='left')
train_df = train_df.merge(cuisine_features, on='Preferred Cusine', how='left')
test_df = test_df.merge(customer_features, on='customer_id', how='left')
test_df = test_df.merge(cuisine_features, on='Preferred Cusine', how='left')


In [9]:
# Encode the target variable 'dish'
label_encoder = LabelEncoder()
train_df['dish_encoded'] = label_encoder.fit_transform(train_df['dish'])
test_df['dish_encoded'] = label_encoder.transform(test_df['dish'])


In [11]:
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # Change 'sparse' to 'sparse_output'
preferred_cuisine_encoded = one_hot_encoder.fit_transform(train_df[['Preferred Cusine']])
preferred_cuisine_df = pd.DataFrame(preferred_cuisine_encoded,
                                    columns=one_hot_encoder.get_feature_names_out(['Preferred Cusine']),
                                    index=train_df.index)
train_df = pd.concat([train_df, preferred_cuisine_df], axis=1)

preferred_cuisine_encoded_test = one_hot_encoder.transform(test_df[['Preferred Cusine']])
preferred_cuisine_test_df = pd.DataFrame(preferred_cuisine_encoded_test,
                                         columns=one_hot_encoder.get_feature_names_out(['Preferred Cusine']),
                                         index=test_df.index)
test_df = pd.concat([test_df, preferred_cuisine_test_df], axis=1)

In [12]:
columns_to_drop = ['transaction_id', 'customer_id', 'order_time', 'dish', 'Preferred Cusine', 'check_in_date', 'check_out_date']
train_df = train_df.drop(columns=columns_to_drop, axis=1)
test_df = test_df.drop(columns=columns_to_drop, axis=1)


In [13]:
# Split features and target
X_train = train_df.drop(columns=['dish_encoded'])
y_train = train_df['dish_encoded']
X_test = test_df.drop(columns=['dish_encoded'])
y_test = test_df['dish_encoded']



In [14]:
# Train XGBoost Classifier
xgb_model = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100
)
xgb_model.fit(X_train, y_train)


In [15]:
# Step 7: Model Evaluation
# Predictions and Metrics
y_test_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
logloss = log_loss(y_test, xgb_model.predict_proba(X_test))

print(f"Accuracy: {accuracy:.2f}")
print(f"Log Loss: {logloss:.2f}")



Accuracy: 0.78
Log Loss: 0.40


In [16]:
# Feature Importance
feature_importances = xgb_model.feature_importances_
important_features = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
print(important_features.sort_values(by='Importance', ascending=False))

                          Feature  Importance
3                     price_for_1    0.827315
2                             Qty    0.022727
7          avg_spend_per_customer    0.021441
10       total_orders_per_cuisine    0.021231
4               number_of_stayers    0.018734
0                      Unnamed: 0    0.016117
9           avg_price_per_cuisine    0.015411
1                             age    0.015216
8          total_qty_per_customer    0.014553
6       total_orders_per_customer    0.014510
5           booked_through_points    0.012747
11         Preferred Cusine_Multi    0.000000
12  Preferred Cusine_North Indian    0.000000
13  Preferred Cusine_South Indian    0.000000
