In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, roc_auc_score
from datetime import datetime
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
hotel = pd.read_csv("R2_final_hotels.csv")
festival = pd.read_excel('Festival.xlsx')

In [3]:
hotel = hotel.drop(columns=['BookingCompany', 'TravelAgent'])
hotel.dropna(inplace=True)

In [4]:
hotel['ArrivalDate'] = pd.to_datetime(hotel['ArrivalDate'])
hotel['ReservationStatusDate'] = pd.to_datetime(hotel['ReservationStatusDate'])

hotel['DaysBeforeCancellation'] = (hotel['ArrivalDate'] - hotel['ReservationStatusDate']).dt.days
hotel = hotel[hotel['DaysBeforeCancellation'] >= 0]

In [5]:
#TotalRevenue
hotel['TotalRevenue'] = hotel['AverageDailyRate'] * (hotel['StaysInWeekendNights'] + hotel['StaysInWeekNights'])

In [6]:
Upgrade = {
    'A': 2, 
    'B': 2, 
    'C': 2, 
    'D': 2, 
    'E': 1,
    'F': 1, 
    'G': 1
}

hotel['AssignedRoom'] = hotel['AssignedRoom'].map(Upgrade)
hotel['ReservedRoom'] = hotel['ReservedRoom'].map(Upgrade)

def decide_whether_upgrade(row):
    if row['AssignedRoom'] > row['ReservedRoom']:
        return 1
    else:
        return 0

hotel["Upgrade"] = hotel.apply(decide_whether_upgrade, axis=1)

In [7]:
numerical_columns = hotel.select_dtypes(include=['int64', 'float64'])
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

imputed_data = imputer.fit_transform(numerical_columns)

numerical_columns_imputed = pd.DataFrame(imputed_data, columns=numerical_columns.columns)

kmeans = KMeans(n_clusters=5, random_state=42)

# Fit the KMeans model on the imputed DataFrame
kmeans.fit(numerical_columns_imputed)

# Retrieve the cluster labels
clusters = kmeans.labels_

# Add the cluster labels to the DataFrame
numerical_columns_imputed['Cluster'] = clusters

# Print the mean values of numerical columns for each cluster
print(numerical_columns_imputed.groupby('Cluster').mean())



         ArrivalWeekNumber  AssignedRoom  AverageDailyRate  BookingChanges  \
Cluster                                                                      
0                28.106299      1.673228        170.460394        0.291339   
1                29.729617      1.971352         94.519886        0.116269   
2                32.870229      1.995759         80.058130        0.041455   
3                29.710581      1.937547        122.025289        0.142416   
4                30.149304      1.815066        152.285714        0.192060   

         BookingParking  DaysInWaitingList    LeadTime  NumberOfAdult  \
Cluster                                                                 
0              0.015748           0.000000   90.153543       1.968504   
1              0.009570           2.487323   75.735459       1.774298   
2              0.001272           6.495971  326.820929       1.933100   
3              0.005524           1.489139  112.125468       1.938670   
4              

In [8]:
numerical_columns_imputed['Upgrade'] = numerical_columns_imputed['Upgrade'].astype(bool)

In [9]:
def run_logistic_regression_for_each_segment_and_get_coefficients(data, segment_col, target_col, features):
    scaler = StandardScaler()
    segments = data[segment_col].unique()
    results = {}
    coefficients = {}
    
    for segment in segments:
        print(f"Running Logistic Regression for {segment}")
        
        segment_data = data[data[segment_col] == segment]
        X = segment_data[features]
        y = segment_data[target_col]
        
        X_scaled = scaler.fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

        unique_classes = np.unique(y_train)
        if len(unique_classes) < 2:
            print(f"Skipping model training for {segment}: Only one class present in training set.")
            results[segment] = {'Accuracy': 'N/A', 'ROC AUC': 'N/A'}
            coefficients[segment] = 'N/A'
            continue  # This continue should be inside the for loop
        
        model = LogisticRegression()
        model.fit(X_train, y_train)
        
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        
        # Check if both classes are present in y_test
        if len(np.unique(y_test)) < 2:
            roc_auc = "Not defined (single class)"
        else:
            roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        
        # Store results
        results[segment] = {'Accuracy': accuracy, 'ROC AUC': roc_auc}
        
        # Store coefficients
        coefficients[segment] = dict(zip(features, model.coef_[0]))
        
        print(f"Results for {segment}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc}\n")
    
    return results, coefficients

In [10]:
features = [col for col in numerical_columns_imputed.columns if col not in ['Cluster', 'Upgrade']]
results, coeffs = run_logistic_regression_for_each_segment_and_get_coefficients(numerical_columns_imputed, 'Cluster', 'Upgrade', features)

for segment, coefs in coeffs.items():
    print(f"Segment {segment} influential factors for an upgrade:")
    sorted_coefs = sorted(coefs.items(), key=lambda x: abs(x[1]), reverse=True)
    for feature, coef in sorted_coefs:
        print(f"{feature}: {coef:.4f}")
    print("\n")

Running Logistic Regression for 1
Results for 1: Accuracy = 1.0000, ROC AUC = 1.0

Running Logistic Regression for 3
Results for 3: Accuracy = 1.0000, ROC AUC = 1.0

Running Logistic Regression for 2
Results for 2: Accuracy = 1.0000, ROC AUC = Not defined (single class)

Running Logistic Regression for 4
Results for 4: Accuracy = 1.0000, ROC AUC = 1.0

Running Logistic Regression for 0
Skipping model training for 0: Only one class present in training set.
Segment 1 influential factors for an upgrade:
ReservedRoom: -1.8656
AssignedRoom: 1.5666
AverageDailyRate: 0.1704
StaysInWeekendNights: -0.1379
DaysBeforeCancellation: -0.1245
PreviousBookingsNotCanceled: 0.1238
TotalRevenue: -0.1136
BookingParking: -0.0759
StaysInWeekNights: 0.0662
NumberOfChildren: -0.0647
NumberOfBabies: -0.0507
LeadTime: -0.0483
NumberOfAdult: 0.0477
TotalOfSpecialRequests: 0.0387
RepeatedGuest: 0.0381
DaysInWaitingList: -0.0375
ArrivalWeekNumber: 0.0351
PreviousCancellations: 0.0345
BookingChanges: -0.0091


Segm

AttributeError: 'str' object has no attribute 'items'