<a href="https://colab.research.google.com/github/NairaMo/Hotel-Booking-Analysis-Classification/blob/main/Highest_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the raw bookings CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/content/first inten project - first inten project (1).csv"
)
df.head()


Unnamed: 0,Booking_ID,number of adults,number of children,number of weekend nights,number of week nights,type of meal,car parking space,room type,lead time,market segment type,repeated,P-C,P-not-C,average price,special requests,date of reservation,booking status
0,INN00001,1,1,2,5,Meal Plan 1,0,Room_Type 1,224,Offline,0,0,0,88.0,0,10/2/2015,Not_Canceled
1,INN00002,1,0,1,3,Not Selected,0,Room_Type 1,5,Online,0,0,0,106.68,1,11/6/2018,Not_Canceled
2,INN00003,2,1,1,3,Meal Plan 1,0,Room_Type 1,1,Online,0,0,0,50.0,0,2/28/2018,Canceled
3,INN00004,1,0,0,2,Meal Plan 1,0,Room_Type 1,211,Online,0,0,0,100.0,1,5/20/2017,Canceled
4,INN00005,1,0,1,2,Not Selected,0,Room_Type 1,48,Online,0,0,0,77.0,0,4/11/2018,Canceled


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Target: the booking outcome label.
y = df['booking status']

# Features: everything except the target and 'Booking_ID'
# (dropped because it is not useful for model training).
X = df.drop(columns=['booking status', 'Booking_ID'])

# Parse the reservation date. errors='coerce' turns unparseable values into NaT,
# so the derived year/month/day are NaN for those rows.
reservation_date = pd.to_datetime(X['date of reservation'], errors='coerce')

# Replace the raw date column with its numeric year/month/day components.
X = X.drop(columns='date of reservation').assign(
    reservation_year=reservation_date.dt.year,
    reservation_month=reservation_date.dt.month,
    reservation_day=reservation_date.dt.day,
)

# Categorical columns to one-hot encode.
categorical_cols = ['type of meal', 'room type', 'market segment type']

# Use sparse_output instead of sparse, as sparse is deprecated.
# drop='first' omits one category per column to avoid multicollinearity.
onehot = OneHotEncoder(sparse_output=False, drop='first')

# One-hot encode the categorical columns. The encoder output carries no row labels,
# so reuse X's index explicitly: pd.concat(axis=1) aligns on index labels, and a
# fresh 0..n-1 index would misalign rows whenever df does not have a default index.
X_encoded = pd.DataFrame(
    onehot.fit_transform(X[categorical_cols]),
    columns=onehot.get_feature_names_out(categorical_cols),
    index=X.index,
)

# Swap the original categorical columns for their encoded counterparts.
X = pd.concat([X.drop(columns=categorical_cols), X_encoded], axis=1)

In [None]:
# Fill missing values column-by-column with the column median.
# NOTE(review): the imputer is fitted on every row of X, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
imputer = SimpleImputer(strategy='median')

# Fit and transform all feature columns, then wrap the result back into a
# DataFrame that keeps the original column names.
imputed_values = imputer.fit_transform(X)
X_imputed = pd.DataFrame(imputed_values, columns=X.columns)

# Preview the preprocessed features.
print(X_imputed.head())


   number of adults  number of children  number of weekend nights  \
0               1.0                 1.0                       2.0   
1               1.0                 0.0                       1.0   
2               2.0                 1.0                       1.0   
3               1.0                 0.0                       0.0   
4               1.0                 0.0                       1.0   

   number of week nights  car parking space  lead time  repeated  P-C  \
0                    5.0                0.0      224.0       0.0  0.0   
1                    3.0                0.0        5.0       0.0  0.0   
2                    3.0                0.0        1.0       0.0  0.0   
3                    2.0                0.0      211.0       0.0  0.0   
4                    2.0                0.0       48.0       0.0  0.0   

   P-not-C  average price  ...  room type_Room_Type 2  room type_Room_Type 3  \
0      0.0          88.00  ...                    0.0             

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Base model. The forest is seeded so that repeated runs of this cell build the
# same trees; without it the selected hyperparameters and the reported accuracy
# change from run to run even though the search itself is seeded.
rf_model = RandomForestClassifier(random_state=42)

# Candidate values sampled by the randomized search.
param_dist = {
    'n_estimators': [200, 400, 600],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Try 10 sampled parameter combinations with 3-fold cross-validation on the
# training set; random_state fixes which combinations are sampled.
random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the best estimator found by the search.
best_rf_model = random_search.best_estimator_
print("Best hyperparameters:", random_search.best_params_)


Best hyperparameters: {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}


In [None]:
from sklearn.metrics import accuracy_score

# Predict booking status for the held-out rows with the tuned forest.
y_pred = best_rf_model.predict(X_test)

# Accuracy = share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test set accuracy: {accuracy:.4f}")


Test set accuracy: 0.9053
