
# Importing Libraries
---



In [1]:
import numpy as np
import pandas as pd

In [2]:

import os
import shutil
import kagglehub

path = kagglehub.dataset_download("jessemostipak/hotel-booking-demand")

print("Path to dataset files:", path)

source_path = path
destination_path = '/content/'

if not os.path.exists(destination_path):
    os.makedirs(destination_path)

shutil.move(source_path, destination_path)

print(f"Data moved from {source_path} to {destination_path}")


Path to dataset files: /kaggle/input/hotel-booking-demand


OSError: [Errno 30] Read-only file system: 'hotel_bookings.csv'

In [4]:
# Load the dataset using pandas
df = pd.read_csv("/content/hotel-booking-demand/hotel_bookings.csv")

In [5]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


# **Data Understanding**

# Data Shape

In [6]:
df.shape

(119390, 32)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

# Statistical Information

In [8]:
df.describe(include='all')


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
count,119390,119390.0,119390.0,119390.0,119390,119390.0,119390.0,119390.0,119390.0,119390.0,...,119390,103050.0,6797.0,119390.0,119390,119390.0,119390.0,119390.0,119390,119390
unique,2,,,,12,,,,,,...,3,,,,4,,,,3,926
top,City Hotel,,,,August,,,,,,...,No Deposit,,,,Transient,,,,Check-Out,2015-10-21
freq,79330,,,,13877,,,,,,...,104641,,,,89613,,,,75166,1461
mean,,0.370416,104.011416,2016.156554,,27.165173,15.798241,0.927599,2.500302,1.856403,...,,86.693382,189.266735,2.321149,,101.831122,0.062518,0.571363,,
std,,0.482918,106.863097,0.707476,,13.605138,8.780829,0.998613,1.908286,0.579261,...,,110.774548,131.655015,17.594721,,50.53579,0.245291,0.792798,,
min,,0.0,0.0,2015.0,,1.0,1.0,0.0,0.0,0.0,...,,1.0,6.0,0.0,,-6.38,0.0,0.0,,
25%,,0.0,18.0,2016.0,,16.0,8.0,0.0,1.0,2.0,...,,9.0,62.0,0.0,,69.29,0.0,0.0,,
50%,,0.0,69.0,2016.0,,28.0,16.0,1.0,2.0,2.0,...,,14.0,179.0,0.0,,94.575,0.0,0.0,,
75%,,1.0,160.0,2017.0,,38.0,23.0,2.0,3.0,2.0,...,,229.0,270.0,0.0,,126.0,0.0,1.0,,


# **Data Preprocessing**

# Duplicated Row

In [9]:
df = df.drop_duplicates()

In [10]:
print(df.shape)

(87396, 32)


# Missing Values

In [11]:
df.isnull().sum()

Unnamed: 0,0
hotel,0
is_canceled,0
lead_time,0
arrival_date_year,0
arrival_date_month,0
arrival_date_week_number,0
arrival_date_day_of_month,0
stays_in_weekend_nights,0
stays_in_week_nights,0
adults,0


In [12]:

# تعويض 'children' بالوسيط
df['children'].fillna(df['children'].median(), inplace=True)

# تعويض 'country' بالقيمة الأكثر تكراراً
df['country'].fillna(df['country'].mode()[0], inplace=True)

# تعويض 'agent' بـ 0 (أو "Unknown" لو كانت من نوع object)
df['agent'].fillna(0, inplace=True)

# حذف عمود 'company'
df.drop('company', axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['children'].fillna(df['children'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['children'].fillna(df['children'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[c

In [13]:
df.isnull().sum()

Unnamed: 0,0
hotel,0
is_canceled,0
lead_time,0
arrival_date_year,0
arrival_date_month,0
arrival_date_week_number,0
arrival_date_day_of_month,0
stays_in_weekend_nights,0
stays_in_week_nights,0
adults,0


# **Label Encoding**

In [14]:
for col in df.columns:
    print(f"العمود: {col}")
    if  df[col].name == 'reservation_status_date' :
      continue
    if df[col].dtype == 'object' :
        print("↳ نوع العمود: تصنيفي (Categorical)")
        print(f"↳ الفئات: {df[col].unique()}")
    else:
        print("↳ نوع العمود: رقمي (Numeric)")
    print("-" * 50)


العمود: hotel
↳ نوع العمود: تصنيفي (Categorical)
↳ الفئات: ['Resort Hotel' 'City Hotel']
--------------------------------------------------
العمود: is_canceled
↳ نوع العمود: رقمي (Numeric)
--------------------------------------------------
العمود: lead_time
↳ نوع العمود: رقمي (Numeric)
--------------------------------------------------
العمود: arrival_date_year
↳ نوع العمود: رقمي (Numeric)
--------------------------------------------------
العمود: arrival_date_month
↳ نوع العمود: تصنيفي (Categorical)
↳ الفئات: ['July' 'August' 'September' 'October' 'November' 'December' 'January'
 'February' 'March' 'April' 'May' 'June']
--------------------------------------------------
العمود: arrival_date_week_number
↳ نوع العمود: رقمي (Numeric)
--------------------------------------------------
العمود: arrival_date_day_of_month
↳ نوع العمود: رقمي (Numeric)
--------------------------------------------------
العمود: stays_in_weekend_nights
↳ نوع العمود: رقمي (Numeric)
--------------------------------

In [15]:
from sklearn.preprocessing import LabelEncoder

# 1. نسخ البيانات
df_encoded = df.copy()

# 2. الأعمدة التي سنستخدم لها Label Encoding
label_cols = [
    'arrival_date_month',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type'
]

# 3. الأعمدة التي سنستخدم لها One-Hot Encoding
onehot_cols = [
    'hotel',
    'meal',
    'deposit_type',
    'customer_type',
    'reservation_status'
]

# 4. Label Encoding
le = LabelEncoder()
for col in label_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col])

# 5. One-Hot Encoding
df_encoded = pd.get_dummies(df_encoded, columns=onehot_cols, drop_first=True)

# 6. استخراج اليوم والشهر والسنة من reservation_status_date
df_encoded['reservation_status_date'] = pd.to_datetime(df_encoded['reservation_status_date'])
df_encoded['reservation_status_date_year'] = df_encoded['reservation_status_date'].dt.year
df_encoded['reservation_status_date_month'] = df_encoded['reservation_status_date'].dt.month
df_encoded['reservation_status_date_day'] = df_encoded['reservation_status_date'].dt.day

# (اختياري) حذف العمود الأصلي للتاريخ إذا لم تعد بحاجة له
df_encoded.drop('reservation_status_date', axis=1, inplace=True)

# 7. النتيجة
print("✅ Encoding complete. Data shape:", df_encoded.shape)


✅ Encoding complete. Data shape: (87396, 40)


In [16]:
# تحويل القيم Boolean إلى 0 و 1
df_encoded = df_encoded.replace({True: 1, False: 0})


  df_encoded = df_encoded.replace({True: 1, False: 0})


In [17]:
# عرض كل الأعمدة
pd.set_option('display.max_columns', None)

# عرض أول 20 صف
df_encoded.head(20)


Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,hotel_Resort Hotel,meal_FB,meal_HB,meal_SC,meal_Undefined,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,reservation_status_Check-Out,reservation_status_No-Show,reservation_status_date_year,reservation_status_date_month,reservation_status_date_day
0,0,342,2015,5,27,1,0,0,2,0.0,0,135,3,1,0,0,0,2,2,3,0.0,0,0.0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,2015,7,1
1,0,737,2015,5,27,1,0,0,2,0.0,0,135,3,1,0,0,0,2,2,4,0.0,0,0.0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,2015,7,1
2,0,7,2015,5,27,1,0,1,1,0.0,0,59,3,1,0,0,0,0,2,0,0.0,0,75.0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,2015,7,2
3,0,13,2015,5,27,1,0,1,1,0.0,0,59,2,0,0,0,0,0,0,0,304.0,0,75.0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,2015,7,2
4,0,14,2015,5,27,1,0,2,2,0.0,0,59,6,3,0,0,0,0,0,0,240.0,0,98.0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,2015,7,3
6,0,0,2015,5,27,1,0,2,2,0.0,0,135,3,1,0,0,0,2,2,0,0.0,0,107.0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,2015,7,3
7,0,9,2015,5,27,1,0,2,2,0.0,0,135,3,1,0,0,0,2,2,0,303.0,0,103.0,0,1,1,1,0,0,0,0,0,0,1,0,1,0,2015,7,3
8,1,85,2015,5,27,1,0,3,2,0.0,0,135,6,3,0,0,0,0,0,0,240.0,0,82.0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,2015,5,6
9,1,75,2015,5,27,1,0,3,2,0.0,0,135,5,3,0,0,0,3,3,0,15.0,0,105.5,0,0,1,0,1,0,0,0,0,0,1,0,0,0,2015,4,22
10,1,23,2015,5,27,1,0,4,2,0.0,0,135,6,3,0,0,0,4,4,0,240.0,0,123.0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2015,6,23
