In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

In [None]:
# Peek at the first rows to sanity-check the load.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Remove the two columns that are not used as model inputs ("Unnamed: 0" is
# presumably a saved row index and "Booking_ID" a per-row identifier -- confirm).
# Reassigning with errors="ignore" instead of inplace=True keeps the cell
# re-runnable: a second execution no longer raises KeyError for columns that
# are already gone.
df = df.drop(columns=["Unnamed: 0", "Booking_ID"], errors="ignore")

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
df.head()

In [None]:
# (rows, columns) before de-duplication.
df.shape

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Remove the repeated rows in place; the index is not reset afterwards.
df.drop_duplicates(inplace=True)

In [None]:
# Re-check: no duplicated rows should remain.
df.duplicated().sum()

In [None]:
# (rows, columns) after de-duplication.
df.shape

In [None]:
df.columns

In [None]:
# Frequency table of each discrete column, one column per cell.
df["no_of_adults"].value_counts()

In [None]:
df["no_of_children"].value_counts()

In [None]:
df["no_of_weekend_nights"].value_counts()

In [None]:
df["no_of_week_nights"].value_counts()

In [None]:
df["type_of_meal_plan"].value_counts()

In [None]:
df["required_car_parking_space"].value_counts()

In [None]:
df["room_type_reserved"].value_counts()

In [None]:
df["arrival_year"].value_counts()

In [None]:
df["arrival_month"].value_counts()

In [None]:
df["market_segment_type"].value_counts()

In [None]:
df["repeated_guest"].value_counts()

In [None]:
# NOTE(review): same column as an earlier cell in this section (duplicate output).
df["required_car_parking_space"].value_counts()

In [None]:
df["no_of_previous_cancellations"].value_counts()

# Data is imbalanced

In [None]:
# Columns treated as categorical. Note that this list also contains the
# prediction target, booking_status, alongside the predictor columns.
cat_cols = ["type_of_meal_plan", "room_type_reserved", "market_segment_type", "repeated_guest", "required_car_parking_space", "booking_status"]

# Columns treated as numeric.
num_cols = ["no_of_adults", "no_of_children", "no_of_weekend_nights", "no_of_week_nights", "lead_time", "arrival_year", "arrival_month", "arrival_date", "avg_price_per_room", "no_of_previous_cancellations", "no_of_previous_bookings_not_canceled", "no_of_special_requests"]

In [None]:
# Quick check of how many columns landed in each group.
len(cat_cols), len(num_cols)

# DATA ANALYSIS

In [None]:
# Independent copy used by the plotting cells, so later changes to df
# (encoding, transforms, resampling) do not affect the plots' input.
data = df.copy()

In [None]:
data.head()

## Univariate Analysis

In [None]:
def num_plot_dist(df, num_features):
    """Draw a histogram (with KDE) and a boxplot for each numeric column.

    One figure is produced with one row per feature: the left panel shows the
    distribution, the right panel the boxplot.

    Parameters
    ----------
    df : DataFrame passed to seaborn as ``data``.
    num_features : iterable of column names to plot. If it is empty the
        function returns without creating a figure.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    num_features = list(num_features)
    if not num_features:
        # plt.subplots() rejects a zero-row grid, and there is nothing to draw.
        return

    n_rows = len(num_features)
    # squeeze=False always yields a 2-D (n_rows, 2) grid of axes, so a single
    # feature needs no special-casing.
    fig, axes = plt.subplots(n_rows, 2, figsize=(15, 5 * n_rows), squeeze=False)

    for (hist_ax, box_ax), column in zip(axes, num_features):
        # No palette here: seaborn ignores it when no hue variable is assigned.
        sns.histplot(data=df, x=column, kde=True, ax=hist_ax)
        hist_ax.set_title(f"Distribution of {column}")

        sns.boxplot(data=df, x=column, ax=box_ax, palette="Blues")
        box_ax.set_title(f"Boxplot of {column}")

    plt.tight_layout()
    plt.show()


In [None]:
# Distribution and boxplot of every numeric column, drawn from the analysis copy.
num_plot_dist(data, num_cols)

In [None]:
# One bar chart per categorical column showing how often each level occurs.
for cat_features in cat_cols:
    level_counts = data[cat_features].value_counts()

    fig, ax = plt.subplots(figsize=(10, 6))
    level_counts.plot(kind="bar", color="skyblue", ax=ax)
    ax.set_title(f"Count of {cat_features}")
    ax.set_xlabel(cat_features)
    ax.set_ylabel("Count")
    plt.show()

## Bivariate Analysis

In [None]:
def plot_bivariate_num(df, target, num_features):
    """Draw one boxplot per numeric feature, grouped by the target column.

    The plots are laid out on a two-column grid in a single figure.

    Parameters
    ----------
    df : DataFrame passed to seaborn as ``data``.
    target : column name placed on the x axis of every boxplot.
    num_features : iterable of column names placed on the y axis, one per
        panel. If it is empty the function returns without creating a figure.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    num_features = list(num_features)
    if not num_features:
        # plt.subplots() rejects a zero-row grid, and there is nothing to draw.
        return

    num_rows = (len(num_features) + 1) // 2

    fig, axes = plt.subplots(num_rows, 2, figsize=(15, 5 * num_rows))
    axes = axes.flatten()

    for ax, column in zip(axes, num_features):
        sns.boxplot(data=df, x=target, y=column, ax=ax, palette="Blues")
        ax.set_title(f"{column} VS {target}")

    # With an odd number of features the last grid cell has no plot; hide it
    # instead of showing an empty frame.
    for spare_ax in axes[len(num_features):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Boxplot of every numeric column split by booking_status.
plot_bivariate_num(data, "booking_status", num_cols)

In [None]:
# Room price by arrival month; bar height is seaborn's default estimator (the mean).
sns.barplot(x="arrival_month", y="avg_price_per_room", data=data, palette="Blues")
plt.show()

In [None]:
def plot_bivariate_cat(df, target, cat_features):
    """Draw one count plot per categorical feature, split by the target.

    The plots are laid out on a two-column grid in a single figure, with the
    x tick labels rotated by 45 degrees.

    Parameters
    ----------
    df : DataFrame passed to seaborn as ``data``.
    target : column name used as ``hue`` in every count plot.
    cat_features : iterable of column names, one panel each. An entry equal
        to ``target`` is skipped. If nothing is left to plot the function
        returns without creating a figure.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    # Plotting the target against itself carries no information, so drop it
    # if the caller's list happens to contain it.
    features = [feature for feature in cat_features if feature != target]
    if not features:
        # plt.subplots() rejects a zero-row grid, and there is nothing to draw.
        return

    num_rows = (len(features) + 1) // 2

    fig, axes = plt.subplots(num_rows, 2, figsize=(15, 5 * num_rows))
    axes = axes.flatten()

    for ax, feature in zip(axes, features):
        sns.countplot(data=df, x=feature, hue=target, ax=ax, palette="Set2")
        ax.set_title(f"{feature} VS {target}")
        ax.tick_params(axis='x', rotation=45)

    # With an odd number of features the last grid cell has no plot; hide it
    # instead of showing an empty frame.
    for spare_ax in axes[len(features):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Count plots of the categorical columns, coloured by booking_status.
plot_bivariate_cat(data, "booking_status", cat_cols)

# Data Processing

In [None]:
# State of df before encoding (the plots above used the separate `data` copy).
df.head()

In [None]:
df.info()

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode every categorical column as integers with its OWN LabelEncoder.
# Re-fitting one shared encoder per column would leave only the last column's
# classes available afterwards; keeping one fitted encoder per column makes it
# possible to inverse_transform any column or to encode new data consistently.
label_encoders = {}   # column name -> fitted LabelEncoder
mappings = {}         # column name -> {original label: integer code}

# `label_encoder` is kept as a name; after the loop it refers to the encoder
# fitted on the last column in cat_cols.
label_encoder = LabelEncoder()

for col in cat_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder
    # classes_ is sorted and a class's integer code is its position in it.
    mappings[col] = {label: code for code, label in enumerate(label_encoder.classes_)}


In [None]:
# Label -> integer code lookup for each encoded column.
mappings

In [None]:
df.head()

In [None]:
# Inspect the column dtypes after encoding.
df.info()

## Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
# Design matrix for the VIF check. VIF measures collinearity among the
# predictors, so the target column is excluded before the intercept column
# is added by add_constant.
x = add_constant(df.drop(columns=["booking_status"]))

# One row per design-matrix column (including the added constant).
vif_data = pd.DataFrame({"feature": x.columns})

In [None]:
vif_data

In [None]:
# One VIF per column of x, in column order (the constant column is included).
vif_data["VIF"] = [variance_inflation_factor(x.values, i) for i in range(x.shape[1])]

In [None]:
vif_data["VIF"]

#### NOTE: The data does not show any multicollinearity

In [None]:
# Pairwise correlation of every column in df, drawn as an annotated heatmap.
corr = df.corr()

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(data=corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

## Skewness

In [None]:
# Skewness of every column; read by the transform cell that follows.
skewness = df.skew()
skewness

In [None]:
# Log-transform strongly right-skewed features. Only the numeric columns are
# considered: looping over every column would also rewrite label-encoded
# categorical codes (and potentially the target), whose integer values are
# identifiers rather than magnitudes.
# NOTE: this relies on `skewness` from the previous cell and is not idempotent;
# run it once per kernel session.
SKEW_THRESHOLD = 5

for col in num_cols:
    if skewness[col] > SKEW_THRESHOLD:
        df[col] = np.log1p(df[col])

In [None]:
# Recompute skewness to see the effect of the transform.
skewness = df.skew()
skewness

## Imbalanced Data Handling

In [None]:
# Class counts of the target before resampling.
df["booking_status"].value_counts()

In [None]:
# Split into predictors and target.
X = df.drop(columns=["booking_status"])
y = df["booking_status"]

In [None]:
X.columns

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class with synthetic rows.
# NOTE(review): this resamples the full dataset, and the train/test split only
# happens further down on the resampled frame. Synthetic rows derived from rows
# that end up in the test split can therefore sit in the training split, which
# tends to inflate the reported test scores. Consider splitting first and
# resampling the training part only.
# NOTE(review): X contains label-encoded categorical columns; SMOTE interpolates
# between neighbours, which presumably yields non-integer codes for them --
# confirm, and consider a categorical-aware variant such as SMOTENC.
smote = SMOTE(random_state=42)

X_res, y_res = smote.fit_resample(X, y)

In [None]:
# Class counts after resampling.
y_res.value_counts()

In [None]:
# Reassemble predictors and target into one frame.
balanced_df = pd.DataFrame(X_res, columns=X.columns)
balanced_df["booking_status"] = y_res

In [None]:
balanced_df.head()

In [None]:
balanced_df.info()

In [None]:
# From here on df is the resampled data; the pre-SMOTE frame is no longer bound to df.
df = balanced_df.copy()

In [None]:
df.shape

## Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Predictors / target taken from the resampled frame.
X = df.drop(columns=["booking_status"])
y = df["booking_status"]


In [None]:
# Random forest fitted only to obtain feature importances.
# NOTE(review): it is fitted on all rows, including those that later form the
# test split, so the feature choice is informed by the evaluation data.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

In [None]:
feature_importance = model.feature_importances_

In [None]:
# Pair each predictor name with its importance.
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": feature_importance
})

In [None]:
feature_importance_df

In [None]:
# Most important features first.
top_important_features_df = feature_importance_df.sort_values(by="Importance", ascending=False)

In [None]:
# Names of the ten highest-ranked features, in ranking order.
top_10_features = top_important_features_df["Feature"].head(10).values

# Keep those ten columns plus the target; column order follows the ranking.
top_10_df = df[top_10_features.tolist() + ["booking_status"]]


In [None]:
# Column count of the reduced frame (selected features + target).
len(top_10_df.columns)

In [None]:
# df now holds only the selected features and the target.
df = top_10_df.copy()

## Model Selection

In [None]:
# NOTE(review): installs without version pins in the middle of the notebook;
# consider pinning versions and moving this to a setup cell at the top.
%pip install xgboost lightgbm 

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
# Predictors (the selected features) and target.
X = df.drop(columns=["booking_status"])
y = df["booking_status"]

In [None]:
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate models, all with default hyper-parameters and a fixed seed where
# the estimator accepts one.
# NOTE(review): no feature-scaling step is visible in this notebook; Logistic
# Regression, K-Nearest Neighbors and SVC are usually sensitive to feature
# scale, so their scores here may understate them -- confirm.
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machine": SVC(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

In [None]:
# Column-oriented results table: one list per column, filled by the evaluation loop.
metrics = {
    "Model" : [],
    "Accuracy" : [],
    "Recall" : [],
    "Precision" : [],
    "F1 Score" : []
}

In [None]:
# Fit every candidate on the training split and score it on the held-out split.
# The result lists are emptied first so that re-running this cell replaces the
# previous scores instead of appending a second copy of every model.
for column_values in metrics.values():
    column_values.clear()

for model_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    Accuracy = accuracy_score(y_test, y_pred)
    Recall = recall_score(y_test, y_pred)
    Precision = precision_score(y_test, y_pred)
    F1_Score = f1_score(y_test, y_pred)

    metrics["Model"].append(model_name)
    metrics["Accuracy"].append(Accuracy)
    metrics["Recall"].append(Recall)
    metrics["Precision"].append(Precision)
    metrics["F1 Score"].append(F1_Score)

In [None]:
# Raw dict of result lists (the table in the next cell shows the same data).
print(metrics)

In [None]:
# One row per model, one column per metric.
metrics_df = pd.DataFrame(metrics)
metrics_df

#### Random Forest seems to be the best option, so we will perform hyperparameter tuning on it

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Rebuild predictors/target and the hold-out split with the same arguments
# (test_size=0.2, random_state=42) as the model-comparison split.
X = df.drop(columns=["booking_status"])
y = df["booking_status"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator whose hyper-parameters are searched below.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Search space. scipy's randint(low, high) draws integers from low up to,
# but not including, high.
params_dist = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(10, 50),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 5),
    "bootstrap": [True,False]
}

In [None]:
# 10 sampled configurations, each scored by 5-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=params_dist,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    scoring="accuracy"
)

In [None]:
# Run the search on the training split only.
random_search.fit(X_train, y_train)

In [None]:
random_search.best_params_

In [None]:
# Estimator built with the best-scoring configuration.
best_rf_model = random_search.best_estimator_

In [None]:
# Predictions of the tuned model on the hold-out split.
y_pred = best_rf_model.predict(X_test)

In [None]:
# Hold-out scores of the tuned model (these names are reused from the
# model-comparison loop and are overwritten here).
Precision = precision_score(y_test, y_pred)
Recall = recall_score(y_test, y_pred)
F1_Score = f1_score(y_test, y_pred)
Accuracy = accuracy_score(y_test, y_pred)

In [None]:
Precision

In [None]:
Recall

In [None]:
Accuracy

In [None]:
F1_Score

### Save the best model

In [None]:
import joblib

# Persist the tuned model next to the notebook.
joblib.dump(best_rf_model, "random_forest.pkl")

In [None]:
# Reload it to confirm the saved file is usable. joblib files are pickle-based:
# only load files from a trusted source.
loaded_model = joblib.load("random_forest.pkl")

In [None]:
# First training row, shown as a reference for the expected columns and their order.
X_train[0:1]

In [None]:
# Single hand-written booking used to smoke-test the reloaded model.
# It is built as a one-row DataFrame carrying the training column names, so
# the model receives named features exactly as during fitting (a bare ndarray
# drops the names and relies on an implicit column order), and a wrong number
# of values fails immediately with a clear error.
# The values are positional: they must follow the order of X_train.columns.
new_data = pd.DataFrame(
    [[190, 1, 93.5, 9, 8, 4, 5, 2, 0, 0]],
    columns=X_train.columns,
)

In [None]:
# Predict the hand-written row; the output is an encoded booking_status code
# (see `mappings` for the label it stands for).
predictions = loaded_model.predict(new_data)

In [None]:
predictions