# Data import

In [103]:
import os

import pandas as pd

# Load the hotel reservations dataset from the local downloads folder.
df = pd.read_csv("downloads/Hotel_Reservations.csv")

# Show the schema summary (column names, non-null counts, dtypes).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly: wrapping it in print() would emit a stray "None" after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

### Mudando nome da coluna Booking_ID
Por algum motivo, o formato de escrita do nome dessa coluna dá erro; por isso ela é renomeada pela posição (primeira coluna) para `booking_id`.

In [104]:
# Rename the first column to 'booking_id', addressing it by position rather
# than by its original label.
# NOTE(review): the original header reportedly fails on direct access; this may
# be an invisible character (e.g. a BOM) in the CSV header - confirm.
df.rename({df.columns[0]: 'booking_id'}, axis='columns', inplace=True)

In [105]:
# Preview the first five values of the renamed column to confirm the rename worked.
df['booking_id'].head(n=5)

0    INN00001
1    INN00002
2    INN00003
3    INN00004
4    INN00005
Name: booking_id, dtype: object

# Cleaning data

### Converting the first column from object to string

In [106]:
# Cast every value of 'booking_id' to a Python string.
df['booking_id'] = df['booking_id'].astype(dtype=str)

# Check the resulting column dtype. The recorded output of this check is
# `object`, i.e. the reported dtype did not change with the cast.
print(df.dtypes['booking_id'])

object


In [107]:
# Strip the literal text "INN" from every booking id. regex=False makes this a
# plain substring replacement, so the pattern is not interpreted as a regex.
df['booking_id'] = df['booking_id'].str.replace(pat='INN', repl='', regex=False)

# Print the updated column.
print(df['booking_id'])

0        00001
1        00002
2        00003
3        00004
4        00005
         ...  
36270    36271
36271    36272
36272    36273
36273    36274
36274    36275
Name: booking_id, Length: 36275, dtype: object


# Data exploration

In [108]:
# Report the number of missing values per column before imputing anything.
print(df.isnull().sum())

# Replace any missing value with 0 (rebinding the name instead of mutating in place).
# NOTE(review): the fill is applied to every column, including the text columns
# encoded below, where a filled 0 would show up as an extra "<column>_0" dummy
# category - confirm this is acceptable if nulls ever appear.
df = df.fillna(0)

# One-hot encode only the selected categorical columns. Passing `columns=`
# makes get_dummies drop the originals and append the indicator columns after
# the remaining ones in a single step.
target_columns = ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type']
df = pd.get_dummies(df, columns=target_columns, drop_first=False)

# Confirm the result. DataFrame.info() prints its own report and returns None,
# so it is not wrapped in print() (that would emit a stray "None").
df.info()

booking_id                              0
no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
type_of_meal_plan                       0
required_car_parking_space              0
room_type_reserved                      0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
market_segment_type                     0
repeated_guest                          0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
booking_status                          0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 32 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------  

# Separating Features and Targets

In [109]:
# Split the frame into the feature matrix (every column except the label) and
# the target column.
# NOTE(review): 'booking_id' is still part of X at this point - confirm that an
# identifier column is wanted among the features.
X = df.drop(columns='booking_status')
y = df['booking_status']

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('data', exist_ok=True)

# Export X (features) and y (target) to separate CSV files, without the index.
X.to_csv('data/features.csv', index=False)
y.to_csv('data/target.csv', index=False, header=True)

print("X and y have been exported as 'features.csv' and 'target.csv'.")

X and y have been exported as 'features.csv' and 'target.csv'.
