In [76]:
#Install if necessary
#!pip install kmodes
#!pip install statsmodels


In [77]:
#Import libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Render every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Read the preprocessed bookings file and preview its first rows
df = pd.read_csv('preprocessed_hotel_booking.csv')
df.head()

In [None]:
# Inspect the column dtypes as inferred by read_csv
df.dtypes

In [80]:
# Parse both date columns into pandas datetime values
for date_col in ('reservation_status_date', 'arrival_date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Re-check dtypes to confirm the two date columns are now datetime
df.dtypes

In [None]:
# List the column names available in the dataset
df.columns

## Booking Cancellation Analysis

In [None]:
# Calculate cancellation percentage.
# value_counts() orders classes by frequency, while seaborn places numeric
# categories on the axis in ascending order. sort_index() makes the series
# order match the bar order, so the label written at position i below
# always belongs to the bar drawn at position i.
cancellation_percentage = (df['is_canceled'].value_counts(normalize=True) * 100).sort_index()

# Create bar plot
plt.figure(figsize=(8, 6))
sns.barplot(x=cancellation_percentage.index, y=cancellation_percentage.values, hue=cancellation_percentage.index, legend=False)
plt.title('Percentage of Not Canceled vs Canceled Bookings')
plt.xlabel('Booking Status')
plt.ylabel('Percentage')

# Add data labels just above each bar
for i, v in enumerate(cancellation_percentage.values):
    plt.text(i, v, f'{v:.0f}%', ha='center', va='bottom')

plt.show()


The bar chart shows the share of canceled and not-canceled bookings in the dataset. Most bookings were not canceled, indicating a relatively high booking success rate. The share of bookings that were canceled is nevertheless large enough to warrant further investigation: the following analysis focuses on identifying the factors that contribute to cancellations so that strategies to mitigate them can be developed.




In [None]:
# Cross-tabulate reservation status (rows) against the cancellation flag (columns)
reservation_status_counts = pd.crosstab(df['reservation_status'], df['is_canceled'])

# Row totals for the 'Canceled' and 'No-Show' statuses
total_cancellations = reservation_status_counts.loc['Canceled'].sum()
total_no_shows = reservation_status_counts.loc['No-Show'].sum()

# Pie chart comparing the two totals
sizes = [total_cancellations, total_no_shows]
colors = ['#FFB347', '#90EE90']
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(sizes, autopct='%1.0f%%', startangle=60, colors=colors)
ax.set_title('Percentage of Canceled vs. No-Show Bookings')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.legend(['Canceled', 'No-Show'])
plt.show()

The majority of guest absences are cancellations made prior to arrival rather than no-shows; the main investigation therefore focuses on cancellations.

### **Numerical Feature Analysis**

In [None]:
# Target column and the candidate features to correlate against it
target_variable = 'is_canceled'
numerical_features = [
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'arrival_date',
]

# Pairwise correlations over the selected features plus the target
correlation_matrix = df[[*numerical_features, target_variable]].corr()

# Keep only the target's column, remove its self-correlation, sort high to low
correlation_with_target = (
    correlation_matrix[target_variable]
    .drop(target_variable)
    .sort_values(ascending=False)
)

# Bar chart of each feature's correlation coefficient with cancellation
fig, ax = plt.subplots(figsize=(8, 10))
correlation_with_target.plot(kind='bar', ax=ax)
ax.set_title('Correlation of Features with Cancellation')
ax.set_xlabel('Features')
ax.set_ylabel('Correlation Coefficient')
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()


## Correlation Analysis of Cancellation Prediction

The correlation analysis reveals key factors influencing booking cancellations.  Let's examine the top and bottom two features:

**Top 2 (Positive) Correlations:**

1. **`lead_time`:** A strong positive correlation suggests that longer lead times are associated with a higher likelihood of cancellation. This could be due to guests having more time to change their plans or find alternative options.

2. **`previous_cancellations`:**  A positive correlation indicates that guests with a history of cancellations are more likely to cancel future bookings. This highlights the importance of identifying and potentially managing high-risk customers.


**Bottom 2 (Negative) Correlations:**

To gain a comprehensive understanding, we must also analyze the features with the two lowest (most negative) correlation coefficients. These features, **required_car_parking_spaces** and **total_of_special_requests**, showed negative correlations, implying that guests who state specific requirements for their stay are less likely to cancel.

N.B. **arrival_date** and **children** showed minimal impact on cancellations.


### **Categorical Feature Analysis**

In [None]:
# Categorical columns to compare against the cancellation flag
cat_features = ['hotel', 'meal', 'market_segment', 'distribution_channel',
                'deposit_type', 'customer_type']

# One count plot per feature on a 3x2 grid, bars split by is_canceled
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
for ax, feature in zip(axes.flat, cat_features):
    sns.countplot(
        data=df,
        x=feature,
        hue='is_canceled',
        palette=['#ADD8E6', '#FFA07A'],  # pastel blue and pastel orange
        ax=ax,
    )
    ax.set_title(f'Cancellations by {feature}')
    ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

In [None]:
# Number of bookings per deposit type
print(df['deposit_type'].value_counts())

In [None]:
# Number of bookings per distribution channel
print(df['distribution_channel'].value_counts())

In [None]:
# Number of bookings per market segment
print(df['market_segment'].value_counts())

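The count plots for categorical features provided valuable insights. Note that these plots show raw booking counts, not rates, so the rate comparisons below should be confirmed with per-category cancellation rates (e.g. `df.groupby(feature)['is_canceled'].mean()`):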
- **Hotel Type**: City hotels had higher cancellation rates than resort hotels
- **Meal Plan**: Bed & Breakfast (BB) bookings had higher cancellations than full board
- **Market Segment**: Online TA (Travel Agents) showed highest cancellation rates
- **Distribution Channel**: Direct bookings had lower cancellations than TA/TO
- **Deposit Type**: Non-refundable deposits had significantly lower cancellations
- **Customer Type**: Transient customers had higher cancellations than contract/group


N.B. The absence of certain categories in the plots (e.g., the 'Refundable' deposit type or the 'GDS' distribution channel, whose counts are printed above) can be due to several factors:
 
1. **Data Sparsity**: Some categories may have very few or no instances in the dataset, making them invisible in the plots
2. **Data Cleaning**: These categories might have been removed during preprocessing if they contained missing or invalid data
3. **Business Reality**: Some categories might genuinely be rare in the hotel's operations (e.g., GDS channels might not be commonly used)
4. **Data Collection**: The hotel might not have collected data for certain categories during the period covered by the dataset


### **Time Series Analysis**

In [None]:
# Month number of each arrival date
df['arrival_month'] = df['arrival_date'].dt.month

# Booking counts per (month, cancellation flag), with the flag pivoted into columns
monthly_cancellations = (
    df.groupby(['arrival_month', 'is_canceled'])
    .size()
    .unstack()
)

# Grouped bar chart: one pair of bars per month
ax = monthly_cancellations.plot(kind='bar', figsize=(12, 6))
ax.set_title('Cancellations by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Bookings')
ax.legend(['Not Canceled', 'Canceled'])

### Analysis of Monthly Cancellation Rates

Peak travel months: summer months (usually April-August) and December

Off-peak travel months: January-March, September-November

The bar chart reveals several key insights about cancellations for months throughout the year:

1. **Seasonal Trends**: 
- Higher numbers of cancellations are observed during peak travel months
- Lower numbers of cancellations are seen in off-peak months

2. **Booking Behavior**:
- The total number of bookings (canceled + not canceled) is highest during summer months, indicating increased demand
- The proportion of cancellations relative to total bookings remains relatively consistent across months

Points 3 and 4 below describe how this bar chart analysis can inform market research and the restructuring of hotel campaigns.

3. **Business Implications**:
- The hotel should consider implementing stricter cancellation policies during peak seasons
- Revenue management strategies could be adjusted based on these seasonal patterns
- Marketing efforts could be intensified during low-cancellation months to maximize occupancy

4. **Operational Considerations**:
- Staffing and resource allocation could be optimized based on these cancellation patterns
- Inventory management should account for higher cancellation rates during peak months


### **Cancellation Rate by Lead Time**

In [None]:
# Create bins for lead time.
# include_lowest=True keeps same-day bookings (lead_time == 0) in the '0-30'
# bin; with pd.cut's default right-closed intervals they would fall outside
# every bin and become NaN. The open upper edge (np.inf) keeps any lead time
# above 730 days in the '>365' bin instead of silently dropping it.
df['lead_time_bins'] = pd.cut(
    df['lead_time'],
    bins=[0, 30, 60, 90, 120, 150, 180, 365, np.inf],
    labels=['0-30', '31-60', '61-90', '91-120',
            '121-150', '151-180', '181-365', '>365'],
    include_lowest=True,
)

# Plot cancellation rate by lead time bins.
# The mean of is_canceled within a bin is the fraction of bookings canceled
# (assuming is_canceled is coded 0/1 - confirm against the data).
cancellation_rate = df.groupby('lead_time_bins', observed=False)['is_canceled'].mean()
cancellation_rate.plot(kind='bar', figsize=(12, 6), color='#FFB347')
plt.title('Cancellation Rate by Lead Time')
plt.xlabel('Lead Time (days)')
plt.ylabel('Cancellation Rate')

### Analysis of Cancellation Rates by Lead Time

Before we analyse the chart, let's better understand the metric it shows: the cancellation rate.

Cancellation rate represents the percentage of bookings that are canceled out of the total bookings made. It's calculated as:
 
**Cancellation Rate = (Number of Canceled Bookings / Total Number of Bookings) * 100**

N.B. In the bar graph it is shown as a fraction between 0 and 1 (Number of Canceled Bookings / Total Number of Bookings), i.e. without the multiplication by 100.

This metric is crucial for hotels as it:
- Helps predict potential revenue loss
- Informs overbooking strategies
- Guides cancellation policy decisions
- Assists in resource allocation and planning
 
A higher cancellation rate indicates more uncertainty in bookings, while a lower rate suggests more reliable reservations.

The bar chart reveals important insights about the relationship between lead time and cancellation rates:

1. **General Trend**:
- Cancellation rates increase significantly as lead time increases
- Bookings made more than 365 days in advance have the highest cancellation rate
 
2. **Key Observations**:
- Short-term bookings (0-30 days) have the lowest cancellation rates (typically under 20%), likely because these are more urgent, planned trips
- The 31-60 days range shows a moderate increase in cancellations, suggesting this is when some travelers start reconsidering their plans
- Between 61-180 days, cancellation rates rise steadily, indicating that the longer the booking window, the more likely plans are to change
- The 181-365 days range shows a significant jump in cancellations, possibly due to changing personal circumstances or finding better deals elsewhere
- The >365 days category has the highest cancellation rate (over 50%), likely because plans made this far in advance are most susceptible to changes

Points 3 and 4 below describe how this bar chart analysis can inform market research and the restructuring of hotel campaigns.

3. **Business Implications**:
- The hotel could implement different cancellation policies based on lead time
- Early bookings might require larger deposits to reduce cancellations
- Marketing efforts could focus on last-minute bookings which have lower cancellation rates
 
4. **Operational Considerations**:
- Revenue management should account for higher cancellation rates in long-lead bookings
- Overbooking strategies could be adjusted based on lead time
- Customer communication should be intensified for long-lead bookings to maintain engagement



### **Cancellation Prediction Models**

In [92]:
# Work on an independent copy so modelling-stage edits to df1 can never
# mutate the analysis frame df (plain assignment would only alias it)
df1 = df.copy()

In [93]:
# Columns to expand into one indicator column per category value
one_hot_columns = [
    'arrival_date_month',
    'meal',
    'market_segment',
    'distribution_channel',
    'agent',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
]
df1 = pd.get_dummies(df1, columns=one_hot_columns)


In [None]:
# Display the frame after one-hot encoding
df1

In [95]:
# Label encode the country and hotel columns.
# Each column gets its own fitted encoder: refitting a single encoder on
# 'hotel' would discard the country mapping, making the integer country
# codes impossible to translate back to their labels.
country_encoder = LabelEncoder()
df1['country'] = country_encoder.fit_transform(df1['country'])

hotel_encoder = LabelEncoder()
df1['hotel'] = hotel_encoder.fit_transform(df1['hotel'])

# Keep the original name bound to the last-fitted (hotel) encoder
le = hotel_encoder

In [None]:
# Display the frame after label encoding country and hotel
df1

In [None]:
# Find columns that use the pandas categorical dtype
category_columns = df1.select_dtypes(include='category').columns
print("Category columns: ", category_columns)

In [None]:
# Drop every categorical-dtype column (this is how lead_time_bins is removed)
categorical_names = df1.select_dtypes(include='category').columns
df1 = df1.drop(columns=categorical_names)
df1

In [None]:
# Summarise dtypes and non-null counts of the modelling frame
df1.info()

In [None]:
# List the columns of the modelling frame
df1.columns

In [101]:
# Columns kept out of the feature matrix: the target itself, the reservation
# status fields, the label-encoded country, and the raw arrival date
excluded_columns = [
    'is_canceled',
    'reservation_status',
    'country',
    'reservation_status_date',
    'arrival_date',
]

y = df1['is_canceled']
X = df1.drop(columns=excluded_columns)

In [102]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [103]:
# Learn the scaling parameters from the training rows only, then apply the
# same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [104]:
def evaluate_classifier(y_true, y_pred, y_pred_proba, title):
    """Print classification metrics for one model's predictions.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    y_pred_proba : array-like or None
        Scores for the positive class, used for ROC AUC. Pass None to skip
        the ROC AUC line.
    title : str
        Model name used in the printed header.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Calculate label-based metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Print metrics
    print(f"\n{title} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    # Score-based metric: y_pred_proba was previously accepted but ignored
    if y_pred_proba is not None:
        try:
            roc_auc = roc_auc_score(y_true, y_pred_proba)
        except ValueError as exc:
            # roc_auc_score may reject inputs such as a single-class y_true;
            # report that instead of aborting the metrics already printed
            print(f"ROC AUC: not available ({exc})")
        else:
            print(f"ROC AUC: {roc_auc:.4f}")

#### **Random Forest Classifier**

In [None]:
# Fit a 100-tree random forest on the scaled training data
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Test-set predictions: hard labels, and the probability column at index 1
rf_pred = rf_clf.predict(X_test_scaled)
rf_pred_proba = rf_clf.predict_proba(X_test_scaled)[:, 1]

evaluate_classifier(
    y_true=y_test,
    y_pred=rf_pred,
    y_pred_proba=rf_pred_proba,
    title="Random Forest",
)

#### **Logistic Regression**

In [None]:
# Fit logistic regression on the scaled training data (up to 1000 solver iterations)
lr_clf = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Test-set predictions: hard labels, and the probability column at index 1
lr_pred = lr_clf.predict(X_test_scaled)
lr_pred_proba = lr_clf.predict_proba(X_test_scaled)[:, 1]

evaluate_classifier(
    y_true=y_test,
    y_pred=lr_pred,
    y_pred_proba=lr_pred_proba,
    title="Logistic Regression",
)

#### **Decision Tree**

In [None]:
# Fit a single decision tree on the scaled training data
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Test-set predictions: hard labels, and the probability column at index 1
dt_pred = dt_clf.predict(X_test_scaled)
dt_pred_proba = dt_clf.predict_proba(X_test_scaled)[:, 1]

evaluate_classifier(
    y_true=y_test,
    y_pred=dt_pred,
    y_pred_proba=dt_pred_proba,
    title="Decision Tree",
)

In [None]:
# Test-set predictions per model, in the order they should appear in the table
model_predictions = {
    'Random Forest': rf_pred,
    'Logistic Regression': lr_pred,
    'Decision Tree': dt_pred,
}

# Column name -> scoring function
metric_functions = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
}

# One row per classifier, one column per metric
results = pd.DataFrame({
    'Classifier': list(model_predictions),
    **{
        metric_name: [score(y_test, pred) for pred in model_predictions.values()]
        for metric_name, score in metric_functions.items()
    },
})
results


#### **Model Results Analysis**

Based on the performance metrics shown above:
 
1. **Potential Overfitting**:  
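Random Forest and Decision Tree show near-perfect scores (accuracy >99.9%), which may indicate overfitting or, since these scores are measured on the held-out test set, target leakage (a feature that directly encodes the booking outcome). We should verify this by: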
- Checking performance on training vs test sets
- Looking at feature importance distributions
- Considering regularization or hyperparameter tuning

2. **Class Balance**:  
Given the high precision/recall scores, we should confirm the class distribution in our target variable:


In [None]:
# Check class balance
print("Class distribution:\n", y.value_counts(normalize=True))

3. **Business Context Considerations**:  
- Recall is critical for cancellation prediction - we want to minimize false negatives
- Precision helps avoid false alarms, but may be less critical than recall
- The F1 score helps balance these priorities

4. **Model Comparison**:  
Let's visualize performance differences:

In [None]:
# Fitted models to compare, keyed by display name
fitted_models = {
    'Random Forest': rf_clf,
    'Logistic Regression': lr_clf,
    'Decision Tree': dt_clf,
}

# One confusion matrix per model, side by side
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, (name, clf) in zip(axes, fitted_models.items()):
    cm = confusion_matrix(y_test, clf.predict(X_test_scaled))
    ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=['Not Canceled', 'Canceled'],
    ).plot(ax=ax, cmap='Blues')
    ax.set_title(f'{name} Confusion Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Analyze confusion matrices
print("\nConfusion Matrix Analysis:")

# Confusion-matrix counts and derived rates for each model, keyed by model name
metrics = {}

# Calculate metrics for each model
for clf, name in zip([rf_clf, lr_clf, dt_clf], ['Random Forest', 'Logistic Regression', 'Decision Tree']):
    y_pred = clf.predict(X_test_scaled)
    # For a binary target the 2x2 matrix flattens row by row to tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Store metrics. The denominators are guarded so a model that never
    # predicts the positive class (or a test set without positives) yields
    # NaN instead of a divide-by-zero warning.
    metrics[name] = {
        'True Positives': tp,
        'False Positives': fp,
        'True Negatives': tn,
        'False Negatives': fn,
        'Precision': tp / (tp + fp) if (tp + fp) > 0 else float('nan'),
        'Recall': tp / (tp + fn) if (tp + fn) > 0 else float('nan')
    }

    # Print insights
    print(f"\n{name} Performance:")
    print(f"- Correct predictions: {tp + tn} ({((tp + tn)/len(y_test))*100:.1f}%)")
    print(f"- False positives: {fp} (Type I errors)")
    print(f"- False negatives: {fn} (Type II errors)")
    print(f"- Recall (ability to catch cancellations): {metrics[name]['Recall']:.3f}")
    print(f"- Precision (accuracy of cancellation predictions): {metrics[name]['Precision']:.3f}")

# Compare models. The statements are derived from the counts computed above
# rather than hard-coded, so the printed text always matches the actual results.
print("\nModel Comparison Insights:")
for description, key in [
    ('false negatives (missed cancellations)', 'False Negatives'),
    ('false positives (false alarms)', 'False Positives'),
]:
    fewest = min(metrics, key=lambda model: metrics[model][key])
    most = max(metrics, key=lambda model: metrics[model][key])
    print(f"- Fewest {description}: {fewest} ({metrics[fewest][key]})")
    print(f"- Most {description}: {most} ({metrics[most][key]})")
for model, values in metrics.items():
    print(f"- {model}: recall {values['Recall']:.3f}, precision {values['Precision']:.3f}")

5. **Feature Analysis**:  
Let's examine the most important features for the best-performing model:

In [None]:
# Pair each feature name with its random-forest importance, then keep the ten largest
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_clf.feature_importances_
})
rf_importances = importance_table.sort_values('importance', ascending=False).head(10)

# Horizontal bar chart of the top features
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=rf_importances, x='importance', y='feature', ax=ax)
ax.set_title('Top 10 Random Forest Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
plt.show()

The bar chart above represents the top 10 most important features used by the Random Forest model to predict hotel booking cancellations. The importance of a feature is determined by how much the tree nodes, which use that feature, reduce impurity across all trees in the forest. The feature with the highest importance score contributes the most to the model's decision-making process. By understanding these feature importances, we can gain insights into which factors are most influential in predicting whether a hotel booking will be cancelled.