In [2]:
import pandas as pd

In [3]:
# merging the flight and hotel dataset:

# Load the datasets
flights = pd.read_excel(r"D:\Make_my_trip\FinalDataset\FlightFINALdataset.xlsx")
hotels = pd.read_excel(r"D:\Make_my_trip\FinalDataset\HotelFINALdataset.xlsx")

# Merge on a common column (e.g., 'user_id' or 'travelCode')
merged_flights_hotels = flights.merge(hotels, on="travelCode", how="inner")

# Save the merged dataset
merged_flights_hotels.to_csv("flights_hotels.csv", index=False)

# Display the first few rows
print(merged_flights_hotels.head())


   travelCode  User_ID_x      Departure             Arrival  flightType  \
0           0          0    Recife (PE)  Florianopolis (SC)  firstClass   
1           2          0   Aracaju (SE)       Salvador (BH)  firstClass   
2           7          0   Aracaju (SE)       Salvador (BH)    economic   
3          11          0  Brasilia (DF)       Salvador (BH)     premium   
4          13          0    Recife (PE)  Florianopolis (SC)  firstClass   

   Flight_price  Flight_duration  Flight_Distance Flight_agency  \
0       1434.38             1.76           676.53   FlyingDrops   
1       1684.05             2.16           830.86       CloudFy   
2        964.83             2.16           830.86       CloudFy   
3       1268.97             1.76           676.56       Rainbow   
4       1434.38             1.76           676.53   FlyingDrops   

  Departure_date  User_ID_y Hotel_Name       Arrival_place  Hotel_stay  \
0     26/09/2019          0    Hotel A  Florianopolis (SC)           4  

In [4]:

# Load datasets
flights_hotels = pd.read_csv("flights_hotels.csv")
cars = pd.read_excel(r"D:\Make_my_trip\FinalDataset\CarFINALdataset.xlsx")

# Merge datasets on user_id or travelCode
merged_data = flights_hotels.merge(cars, on="travelCode", how="left", indicator=True)

# Label users who booked a car (1) and those who didn't (0)
merged_data["car_booking"] = (merged_data["_merge"] == "both").astype(int)

# Drop unnecessary columns
merged_data.drop(columns=["_merge"], inplace=True)

# Check class balance
print(merged_data["car_booking"].value_counts())


car_booking
0    36779
1     3773
Name: count, dtype: int64


## Defining the training and recommendation sets:

Training set:

~ contains the users who have booked a car 

~ plus an equal number of randomly selected users who did not book a car.

~ this will balance the dataset and prevent the model bias.

Recommendation Set:

~ contains the remaining users who booked flight and hotel but not a car.

~ the model will predict if they are likley to book a car.


In [5]:
# Load the merged flights-hotels-car dataset
merged_data = pd.read_csv("flights_hotels.csv")

# Load car booking data (contains users who booked a car)
cars = pd.read_excel(r"D:\Make_my_trip\FinalDataset\CarFINALdataset.xlsx")

# Merge to identify users who booked cars
merged_data = merged_data.merge(cars, on="travelCode", how="left", indicator=True)

# Label users who booked a car (1) and those who didn't (0)
merged_data["car_booking"] = (merged_data["_merge"] == "both").astype(int)
merged_data.drop(columns=["_merge"], inplace=True)

# Split into car bookers (1) and non-car bookers (0)
booked_cars = merged_data[merged_data["car_booking"] == 1]
not_booked_cars = merged_data[merged_data["car_booking"] == 0]

# Select equal number of non-car bookings (to balance training set)
not_booked_sample = not_booked_cars.sample(n=len(booked_cars), random_state=42)

# Training Data: Equal number of users who booked and didn't book a car
train_data = pd.concat([booked_cars, not_booked_sample])

# Recommendation Data: Remaining users who did not book a car
recommendation_data = not_booked_cars.drop(not_booked_sample.index)

# Save datasets
train_data.to_csv("train_data.csv", index=False)
recommendation_data.to_csv("recommendation_data.csv", index=False)

# Check dataset shapes
print("Training Set Shape:", train_data.shape)
print("Recommendation Set Shape:", recommendation_data.shape)


Training Set Shape: (7546, 29)
Recommendation Set Shape: (33006, 29)


In [6]:
train_data.columns

Index(['travelCode', 'User_ID_x', 'Departure', 'Arrival', 'flightType',
       'Flight_price', 'Flight_duration', 'Flight_Distance', 'Flight_agency',
       'Departure_date', 'User_ID_y', 'Hotel_Name', 'Arrival_place',
       'Hotel_stay', 'Hotel_per_day_price', 'Check-in_x', 'Hotel_TotalPrice',
       'User_ID', 'Check-in_y', 'pickupLocation', 'dropoffLocation', 'carType',
       'rentalAgency', 'rentalDuration', 'Car_total_distance', 'fuelPolicy',
       'Car_bookingStatus', 'total_rent_price', 'car_booking'],
      dtype='object')

In [7]:
train_data= train_data.drop(columns=['User_ID_y', 'Check-in_y'],axis=1)

In [8]:
recommendation_data= recommendation_data.drop(columns=['User_ID_y', 'Check-in_y'],axis=1)

In [9]:
train_data=train_data.drop(columns=['User_ID'],axis=1)

In [10]:
recommendation_data=recommendation_data.drop(columns=['User_ID'],axis=1)

In [11]:
train_data.columns

Index(['travelCode', 'User_ID_x', 'Departure', 'Arrival', 'flightType',
       'Flight_price', 'Flight_duration', 'Flight_Distance', 'Flight_agency',
       'Departure_date', 'Hotel_Name', 'Arrival_place', 'Hotel_stay',
       'Hotel_per_day_price', 'Check-in_x', 'Hotel_TotalPrice',
       'pickupLocation', 'dropoffLocation', 'carType', 'rentalAgency',
       'rentalDuration', 'Car_total_distance', 'fuelPolicy',
       'Car_bookingStatus', 'total_rent_price', 'car_booking'],
      dtype='object')

In [12]:
recommendation_data.columns

Index(['travelCode', 'User_ID_x', 'Departure', 'Arrival', 'flightType',
       'Flight_price', 'Flight_duration', 'Flight_Distance', 'Flight_agency',
       'Departure_date', 'Hotel_Name', 'Arrival_place', 'Hotel_stay',
       'Hotel_per_day_price', 'Check-in_x', 'Hotel_TotalPrice',
       'pickupLocation', 'dropoffLocation', 'carType', 'rentalAgency',
       'rentalDuration', 'Car_total_distance', 'fuelPolicy',
       'Car_bookingStatus', 'total_rent_price', 'car_booking'],
      dtype='object')

In [13]:
train_data.rename(columns={'User_ID_x':'User_ID','Check-in_x':'Check_in_Hotel'},inplace=True)

In [14]:
recommendation_data.rename(columns={'User_ID_x':'User_ID','Check-in_x':'Check_in_Hotel'},inplace=True)

In [15]:
train_data.columns.tolist()

['travelCode',
 'User_ID',
 'Departure',
 'Arrival',
 'flightType',
 'Flight_price',
 'Flight_duration',
 'Flight_Distance',
 'Flight_agency',
 'Departure_date',
 'Hotel_Name',
 'Arrival_place',
 'Hotel_stay',
 'Hotel_per_day_price',
 'Check_in_Hotel',
 'Hotel_TotalPrice',
 'pickupLocation',
 'dropoffLocation',
 'carType',
 'rentalAgency',
 'rentalDuration',
 'Car_total_distance',
 'fuelPolicy',
 'Car_bookingStatus',
 'total_rent_price',
 'car_booking']

# Prepare the data for the likelihood prediction model

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# First, separate users who booked cars vs those who didn't
car_bookers = train_data[train_data['car_booking'] == 1]
non_car_bookers = train_data[train_data['car_booking'] == 0]

# Randomly select 3773 users who booked cars
if len(car_bookers) >3772:
    car_bookers = car_bookers.sample(n=3773, random_state=42)
else:
    print(f"Warning: Only {len(car_bookers)} users who booked cars available")

# Randomly select 3773 users who didn't book cars
non_car_bookers_sample = non_car_bookers.sample(n=3773, random_state=42)

# Create balanced training dataset
training_data = pd.concat([car_bookers, non_car_bookers_sample])

# The rest of non-car bookers become your recommendation set
recommendation_set = non_car_bookers.drop(non_car_bookers_sample.index)

# Drop irrelevant columns as identified earlier
columns_to_drop = [
    'travelCode',
    'Departure',
    'Arrival',
    'flightType',
    'Hotel_Name',
    'Check_in_Hotel',
    'pickupLocation',
    'dropoffLocation',
    'carType',
    'rentalAgency',
    'rentalDuration',
    'Car_total_distance',
    'fuelPolicy',
    'Car_bookingStatus',
    'total_rent_price'
]

training_data_cleaned = train_data.drop(columns=columns_to_drop)
recommendation_set_cleaned = recommendation_data.drop(columns=columns_to_drop)

# Feature preparation and splitting training data

In [17]:
# Define features and target for training
X = training_data_cleaned.drop(columns=['car_booking', 'User_ID'])  # Remove target and ID
y = training_data_cleaned['car_booking']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features
scaler = StandardScaler()
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Handle categorical features if any (you may need to one-hot encode them)
# For example, Flight_agency might be categorical

# Train the likelihood prediction model

In [18]:
# First, let's identify categorical columns in your dataset
categorical_cols = X_train.select_dtypes(include=['object']).columns
print(f"Categorical columns that need encoding: {list(categorical_cols)}")

# Now, let's use one-hot encoding for categorical variables
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Create a pipeline with preprocessing and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the raw data (without previous preprocessing)
X_train_raw = training_data_cleaned.drop(columns=['car_booking', 'User_ID'])
y_train_raw = training_data_cleaned['car_booking']
X_test_raw = X_test.copy() if 'X_test' in locals() else None

# Fit the pipeline
pipeline.fit(X_train_raw, y_train_raw)

# If you have a test set
if X_test_raw is not None:
    y_pred = pipeline.predict(X_test_raw)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

Categorical columns that need encoding: ['Flight_agency', 'Departure_date', 'Arrival_place']
[[391 364]
 [351 404]]
              precision    recall  f1-score   support

           0       0.53      0.52      0.52       755
           1       0.53      0.54      0.53       755

    accuracy                           0.53      1510
   macro avg       0.53      0.53      0.53      1510
weighted avg       0.53      0.53      0.53      1510



#  Improve the model performance

In [19]:
# 1. Try tuning the hyperparameters
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

grid_search.fit(X_train_raw, y_train_raw)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Use the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_raw)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 2. Feature importance
# Get feature names after one-hot encoding
feature_names = (
    numerical_cols.tolist() +
    best_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_cols).tolist()
)

# Get importances
importances = best_model.named_steps['classifier'].feature_importances_
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("Top 10 most important features:")
print(feature_importance.head(10))

Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Best cross-validation score: 0.2968
[[392 363]
 [377 378]]
              precision    recall  f1-score   support

           0       0.51      0.52      0.51       755
           1       0.51      0.50      0.51       755

    accuracy                           0.51      1510
   macro avg       0.51      0.51      0.51      1510
weighted avg       0.51      0.51      0.51      1510

Top 10 most important features:
                      Feature  Importance
0                Flight_price    0.100699
5            Hotel_TotalPrice    0.048846
2             Flight_Distance    0.042694
1             Flight_duration    0.041362
3                  Hotel_stay    0.034695
4         Hotel_per_day_price    0.016698
6       Flight_agency_CloudFy    0.015199
8       Flight_agency_Rainbow    0.014994
7   Flight_agency_FlyingDrops    0.010118
13  Depar

In [20]:
# Prepare the recommendation set (same preprocessing as training data)
X_recommendation = recommendation_set_cleaned.drop(columns=['car_booking'])
user_ids = recommendation_set_cleaned['User_ID']

# Make predictions
prediction_proba = best_model.predict_proba(X_recommendation)
car_booking_likelihood = prediction_proba[:, 1]  # Probability of class 1 (booking a car)

# Create recommendation dataframe
recommendations = pd.DataFrame({
    'User_ID': user_ids,
    'car_booking_likelihood': car_booking_likelihood
})

# Sort by likelihood (highest first)
recommendations = recommendations.sort_values('car_booking_likelihood', ascending=False)

# Select users above a certain threshold (e.g., 0.7) for car recommendations
threshold = 0.7
likely_car_bookers = recommendations[recommendations['car_booking_likelihood'] >= threshold]

print(f"Number of users with car booking likelihood >= {threshold}: {len(likely_car_bookers)}")
print(likely_car_bookers.head())

Number of users with car booking likelihood >= 0.7: 3902
       User_ID  car_booking_likelihood
39454     1303                0.976667
4694       155                0.976667
6457       209                0.976667
5038       166                0.966667
9483       300                0.966667


# Prepare data for car type recommendation model

In [21]:
# First, get historical data of users who actually booked cars
car_bookers_data = merged_data[merged_data['car_booking'] == 1].copy()

# Check what car types are available
car_types = car_bookers_data['carType'].value_counts()
print("Distribution of car types booked:")
print(car_types)

# Features that might influence car type choice
car_type_features = [
    'Flight_Distance', 
    'Flight_duration',
    'Hotel_stay',
    'Hotel_per_day_price',  # Indicator of budget/luxury preference
    'Arrival_place',        # Different locations might require different cars
    'Flight_price'          # Another budget indicator
]

# Prepare features and target for car type prediction
X_car_type = car_bookers_data[car_type_features]
y_car_type = car_bookers_data['carType']

# Handle categorical features (like Arrival_place)
categorical_cols_car = X_car_type.select_dtypes(include=['object']).columns
numerical_cols_car = X_car_type.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessor for car type model
car_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols_car),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_car)
    ])

# Split the data
X_train_car, X_test_car, y_train_car, y_test_car = train_test_split(
    X_car_type, y_car_type, test_size=0.2, random_state=42, stratify=y_car_type
)

Distribution of car types booked:
carType
Sedan        990
Hatchback    939
Luxury       925
SUV          919
Name: count, dtype: int64


# Train the car type recommendation model

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Create pipeline
car_type_pipeline = Pipeline(steps=[
    ('preprocessor', car_preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train model
car_type_pipeline.fit(X_train_car, y_train_car)

# Evaluate
y_pred_car = car_type_pipeline.predict(X_test_car)
print("\nCar Type Recommendation Model Performance:")
print(classification_report(y_test_car, y_pred_car))

# Show feature importance for car type prediction
car_feature_names = (
    numerical_cols_car.tolist() +
    car_type_pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_cols_car).tolist()
)
car_importances = car_type_pipeline.named_steps['classifier'].feature_importances_
car_feature_importance = pd.DataFrame({
    'Feature': car_feature_names,
    'Importance': car_importances
}).sort_values('Importance', ascending=False)

print("\nTop factors influencing car type choice:")
print(car_feature_importance.head(10))


Car Type Recommendation Model Performance:
              precision    recall  f1-score   support

   Hatchback       0.21      0.22      0.22       188
      Luxury       0.24      0.25      0.25       185
         SUV       0.22      0.21      0.22       184
       Sedan       0.28      0.28      0.28       198

    accuracy                           0.24       755
   macro avg       0.24      0.24      0.24       755
weighted avg       0.24      0.24      0.24       755


Top factors influencing car type choice:
                              Feature  Importance
4                        Flight_price    0.572974
2                          Hotel_stay    0.232146
0                     Flight_Distance    0.073098
1                     Flight_duration    0.069688
3                 Hotel_per_day_price    0.016633
5          Arrival_place_Aracaju (SE)    0.004566
9            Arrival_place_Natal (RN)    0.004494
11  Arrival_place_Rio de Janeiro (RJ)    0.004481
10          Arrival_place_Rec

# Apply car type recommendations to likely bookers

In [23]:
# Get the data for users who are likely to book cars
likely_bookers_data = recommendation_set_cleaned[recommendation_set_cleaned['User_ID'].isin(likely_car_bookers['User_ID'])]

# Prepare features for car type prediction
X_recommend_car_type = likely_bookers_data[car_type_features]

# Predict car types
recommended_car_types = car_type_pipeline.predict(X_recommend_car_type)
car_type_probabilities = car_type_pipeline.predict_proba(X_recommend_car_type)

# Get the top 3 recommended car types for each user
n_classes = len(car_type_pipeline.classes_)
top_3_indices = np.argsort(-car_type_probabilities, axis=1)[:, :3]
top_3_car_types = car_type_pipeline.classes_[top_3_indices]
top_3_probabilities = np.array([car_type_probabilities[i, top_3_indices[i]] for i in range(len(top_3_indices))])

# Create final recommendations dataframe
final_recommendations = pd.DataFrame({
    'User_ID': likely_bookers_data['User_ID'].values,
    'Recommended_Car_Type': recommended_car_types,
    'Recommendation_Confidence': np.max(car_type_probabilities, axis=1),
    'Alternative_1': top_3_car_types[:, 1],
    'Alternative_1_Confidence': top_3_probabilities[:, 1],
    'Alternative_2': top_3_car_types[:, 2],
    'Alternative_2_Confidence': top_3_probabilities[:, 2]
})

# Merge with car booking likelihood
final_recommendations = final_recommendations.merge(
    likely_car_bookers[['User_ID', 'car_booking_likelihood']], 
    on='User_ID'
)

# Sort by booking likelihood and recommendation confidence
final_recommendations = final_recommendations.sort_values(
    by=['car_booking_likelihood', 'Recommendation_Confidence'], 
    ascending=False
)

print("\nTop 10 recommendations:")
print(final_recommendations.head(10))


Top 10 recommendations:
       User_ID Recommended_Car_Type  Recommendation_Confidence Alternative_1  \
21218      209            Hatchback                   0.930000        Luxury   
15172      155               Luxury                   0.927500     Hatchback   
21240      209               Luxury                   0.919167         Sedan   
15264      155                Sedan                   0.902726        Luxury   
21212      209               Luxury                   0.890000           SUV   
15236      155               Luxury                   0.877167     Hatchback   
21256      209            Hatchback                   0.818333         Sedan   
15204      155            Hatchback                   0.817500           SUV   
15240      155               Luxury                   0.810000     Hatchback   
15152      155            Hatchback                   0.805000           SUV   

       Alternative_1_Confidence Alternative_2  Alternative_2_Confidence  \
21218              

#  Generate insights and summaries for business use

In [24]:
# 1. Distribution of recommended car types
print("\nDistribution of primary car type recommendations:")
print(final_recommendations['Recommended_Car_Type'].value_counts(normalize=True))

# 2. Average confidence by car type
print("\nAverage confidence by car type:")
confidence_by_type = final_recommendations.groupby('Recommended_Car_Type')['Recommendation_Confidence'].mean()
print(confidence_by_type.sort_values(ascending=False))

# 3. Summary statistics
print("\nSummary of recommendation campaign:")
print(f"Total users eligible for recommendations: {len(final_recommendations)}")
print(f"Average booking likelihood: {final_recommendations['car_booking_likelihood'].mean():.2f}")
print(f"Average recommendation confidence: {final_recommendations['Recommendation_Confidence'].mean():.2f}")

# 4. Breakdown by Arrival_place
place_breakdown = final_recommendations.merge(
    likely_bookers_data[['User_ID', 'Arrival_place']], 
    on='User_ID'
)
top_places = place_breakdown.groupby('Arrival_place').size().sort_values(ascending=False).head(5)
print("\nTop 5 destinations for car recommendations:")
print(top_places)


Distribution of primary car type recommendations:
Recommended_Car_Type
Hatchback    0.274817
Sedan        0.265374
SUV          0.231116
Luxury       0.228693
Name: proportion, dtype: float64

Average confidence by car type:
Recommended_Car_Type
Luxury       0.588657
Hatchback    0.570219
Sedan        0.564717
SUV          0.559918
Name: Recommendation_Confidence, dtype: float64

Summary of recommendation campaign:
Total users eligible for recommendations: 131211
Average booking likelihood: 0.77
Average recommendation confidence: 0.57

Top 5 destinations for car recommendations:
Arrival_place
Rio de Janeiro (RJ)    630499
Salvador (BH)          623044
Natal (RN)             616395
Sao Paulo (SP)         587741
Recife (PE)            557090
dtype: int64


# Improved Car Booking Likelihood Model

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier

# 1. Feature Engineering - Modified to avoid the missing columns issue
def engineer_features(df):
    df_new = df.copy()
    
    # Create interaction features with existing columns only
    if 'Flight_Distance' in df_new.columns and 'Hotel_stay' in df_new.columns:
        df_new['distance_stay_ratio'] = df_new['Flight_Distance'] / (df_new['Hotel_stay'] + 1)  # +1 to avoid division by zero
    
    if 'Flight_price' in df_new.columns and 'Hotel_TotalPrice' in df_new.columns:
        df_new['total_trip_cost'] = df_new['Flight_price'] + df_new['Hotel_TotalPrice']
        df_new['flight_price_ratio'] = df_new['Flight_price'] / (df_new['total_trip_cost'] + 1)
    
    # Let's avoid date conversion since it's causing issues
    # Instead, let's focus on the columns that exist in all datasets
    
    return df_new

# Apply feature engineering to training data
X_train_raw_enhanced = engineer_features(X_train_raw)

# Identify column types after feature engineering
numerical_cols = X_train_raw_enhanced.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train_raw_enhanced.select_dtypes(include=['object']).columns
print(f"Numerical columns: {list(numerical_cols)}")
print(f"Categorical columns: {list(categorical_cols)}")

# 2. Create improved pipeline with XGBoost
improved_pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])),
    ('classifier', XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        scale_pos_weight=1,
        random_state=42
    ))
])

# 3. Train the improved model
improved_pipeline.fit(X_train_raw_enhanced, y_train_raw)

# 4. Apply feature engineering to test data AFTER identifying the columns
X_test_raw_enhanced = engineer_features(X_test_raw)

# 5. Evaluate on test set
y_pred_improved = improved_pipeline.predict(X_test_raw_enhanced)
y_pred_proba_improved = improved_pipeline.predict_proba(X_test_raw_enhanced)[:, 1]

print("\nImproved Model Performance:")
print(confusion_matrix(y_test, y_pred_improved))
print(classification_report(y_test, y_pred_improved))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba_improved):.4f}")

# 6. Hyperparameter tuning (smaller grid to speed up the process)
param_grid = {
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5],
    'classifier__n_estimators': [100, 200]
}

grid_search = GridSearchCV(
    improved_pipeline,
    param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1
)

grid_search.fit(X_train_raw_enhanced, y_train_raw)
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best ROC AUC: {grid_search.best_score_:.4f}")

# Use the best model
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_raw_enhanced)
y_pred_proba_best = best_model.predict_proba(X_test_raw_enhanced)[:, 1]

print("\nBest Model Performance:")
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba_best):.4f}")

# 7. Apply feature engineering to recommendation set and get predictions
X_recommendation_enhanced = engineer_features(X_recommendation)
prediction_proba = best_model.predict_proba(X_recommendation_enhanced)[:, 1]

# Create recommendations dataframe with improved model
recommendations = pd.DataFrame({
    'User_ID': user_ids,
    'car_booking_likelihood': prediction_proba
})

# Sort by likelihood (highest first)
recommendations = recommendations.sort_values('car_booking_likelihood', ascending=False)

# Select users above a certain threshold for car recommendations
threshold = 0.7
likely_car_bookers = recommendations[recommendations['car_booking_likelihood'] >= threshold]

print(f"\nNumber of users with car booking likelihood >= {threshold}: {len(likely_car_bookers)}")
print(likely_car_bookers.head())

Numerical columns: ['Flight_price', 'Flight_duration', 'Flight_Distance', 'Hotel_stay', 'Hotel_per_day_price', 'Hotel_TotalPrice', 'distance_stay_ratio', 'total_trip_cost', 'flight_price_ratio']
Categorical columns: ['Flight_agency', 'Departure_date', 'Arrival_place']

Improved Model Performance:
[[330 425]
 [271 484]]
              precision    recall  f1-score   support

           0       0.55      0.44      0.49       755
           1       0.53      0.64      0.58       755

    accuracy                           0.54      1510
   macro avg       0.54      0.54      0.53      1510
weighted avg       0.54      0.54      0.53      1510

ROC AUC Score: 0.5475

Best parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Best ROC AUC: 0.2270

Best Model Performance:
[[104 651]
 [ 57 698]]
              precision    recall  f1-score   support

           0       0.65      0.14      0.23       755
           1       0.52      0.92    

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

# If y is one-hot encoded or has 2 columns, convert to single column
if isinstance(y_train_raw, pd.DataFrame):
    # If it's a DataFrame, convert to single column
    y_train_1d = y_train_raw.iloc[:, 0]
    y_test_1d = y_test.iloc[:, 0]
elif isinstance(y_train_raw, np.ndarray):
    # If it's a numpy array with 2 columns, take first column
    y_train_1d = y_train_raw[:, 0]
    y_test_1d = y_test[:, 0]
else:
    y_train_1d = y_train_raw
    y_test_1d = y_test

def engineer_features(df):
    df_new = df.copy()
    
    if 'Flight_Distance' in df_new.columns and 'Hotel_stay' in df_new.columns:
        df_new['distance_stay_ratio'] = df_new['Flight_Distance'] / (df_new['Hotel_stay'] + 1)
    
    if 'Flight_price' in df_new.columns and 'Hotel_TotalPrice' in df_new.columns:
        df_new['total_trip_cost'] = df_new['Flight_price'] + df_new['Hotel_TotalPrice']
        df_new['flight_price_ratio'] = df_new['Flight_price'] / (df_new['total_trip_cost'] + 1)
    
    return df_new

# Apply feature engineering
X_train_raw_enhanced = engineer_features(X_train_raw)
X_test_raw_enhanced = engineer_features(X_test_raw)

# Print shape information for debugging
print(f"X_train shape: {X_train_raw_enhanced.shape}")
print(f"y_train shape: {y_train_1d.shape}")
print(f"X_test shape: {X_test_raw_enhanced.shape}")
print(f"y_test shape: {y_test_1d.shape}")

# Identify column types
numerical_cols = X_train_raw_enhanced.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_train_raw_enhanced.select_dtypes(include=['object']).columns

print(f"\nNumerical columns: {list(numerical_cols)}")
print(f"Categorical columns: {list(categorical_cols)}")

# Create pipeline
improved_pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])),
    ('classifier', XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',  # Changed to binary classification
        random_state=42
    ))
])

# Train the model
improved_pipeline.fit(X_train_raw_enhanced, y_train_1d)

# Make predictions
y_pred_improved = improved_pipeline.predict(X_test_raw_enhanced)
y_pred_proba_improved = improved_pipeline.predict_proba(X_test_raw_enhanced)

print("\nImproved Model Performance:")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_1d, y_pred_improved))
print("\nClassification Report:")
print(classification_report(y_test_1d, y_pred_improved))

# Hyperparameter tuning
param_grid = {
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5],
    'classifier__n_estimators': [100, 200]
}

grid_search = GridSearchCV(
    improved_pipeline,
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train_raw_enhanced, y_train_1d)
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_:.4f}")

# Use the best model
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_raw_enhanced)
y_pred_proba_best = best_model.predict_proba(X_test_raw_enhanced)

print("\nBest Model Performance:")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_1d, y_pred_best))
print("\nClassification Report:")
print(classification_report(y_test_1d, y_pred_best))

# Make recommendations
X_recommendation_enhanced = engineer_features(X_recommendation)
prediction_proba = best_model.predict_proba(X_recommendation_enhanced)

# Create recommendations dataframe
recommendations = pd.DataFrame({
    'User_ID': user_ids,
    'probability_class_0': prediction_proba[:, 0],
    'probability_class_1': prediction_proba[:, 1],
    'predicted_class': best_model.predict(X_recommendation_enhanced)
})

# Add confidence score
recommendations['confidence'] = recommendations[['probability_class_0', 'probability_class_1']].max(axis=1)

# Sort by confidence
recommendations = recommendations.sort_values('confidence', ascending=False)

print("\nTop recommendations:")
print(recommendations.head())

X_train shape: (7546, 12)
y_train shape: (7546,)
X_test shape: (1510, 12)
y_test shape: (1510,)

Numerical columns: ['Flight_price', 'Flight_duration', 'Flight_Distance', 'Hotel_stay', 'Hotel_per_day_price', 'Hotel_TotalPrice', 'distance_stay_ratio', 'total_trip_cost', 'flight_price_ratio']
Categorical columns: ['Flight_agency', 'Departure_date', 'Arrival_place']

Improved Model Performance:

Confusion Matrix:
[[330 425]
 [271 484]]

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.44      0.49       755
           1       0.53      0.64      0.58       755

    accuracy                           0.54      1510
   macro avg       0.54      0.54      0.53      1510
weighted avg       0.54      0.54      0.53      1510


Best parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Best accuracy: 0.2980

Best Model Performance:

Confusion Matrix:
[[104 651]
 [ 57 698]]

Classif

In [27]:
# 1. Feature Engineering for car type prediction (unchanged)
def engineer_car_type_features(df):
    df_new = df.copy()
    
    # Budget indicator features
    if 'Flight_price' in df_new.columns and 'Hotel_per_day_price' in df_new.columns:
        df_new['total_daily_budget'] = df_new['Flight_price'] / 7 + df_new['Hotel_per_day_price']
    
    # Trip length indicators
    if 'Hotel_stay' in df_new.columns:
        df_new['long_trip'] = (df_new['Hotel_stay'] > 7).astype(int)
    
    # Distance categories
    if 'Flight_Distance' in df_new.columns:
        df_new['distance_category_num'] = pd.cut(
            df_new['Flight_Distance'],
            bins=[0, 500, 1500, 3000, float('inf')],
            labels=[1, 2, 3, 4]
        ).astype(float)
    
    return df_new

# 2. Add label encoding for target variable
from sklearn.preprocessing import LabelEncoder

# Initialize and fit label encoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_car)
y_test_encoded = label_encoder.transform(y_test_car)

# Apply feature engineering
X_train_car_enhanced = engineer_car_type_features(X_train_car)
X_test_car_enhanced = engineer_car_type_features(X_test_car)

# Update columns after feature engineering
numerical_cols_car = X_train_car_enhanced.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_car = X_train_car_enhanced.select_dtypes(include=['object', 'category']).columns

# 3. Create improved car type pipeline
car_type_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols_car),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_car)
    ])

improved_car_type_pipeline = Pipeline(steps=[
    ('preprocessor', car_type_preprocessor),
    ('classifier', XGBClassifier(
        learning_rate=0.1,
        n_estimators=150,
        max_depth=5,
        objective='multi:softprob',
        num_class=len(label_encoder.classes_),
        random_state=42
    ))
])

# 4. Train the improved car type model
improved_car_type_pipeline.fit(X_train_car_enhanced, y_train_encoded)

# 5. Evaluate on test set
y_pred_encoded = improved_car_type_pipeline.predict(X_test_car_enhanced)
y_pred_car_improved = label_encoder.inverse_transform(y_pred_encoded)
print("\nImproved Car Type Model Performance:")
print(classification_report(y_test_car, y_pred_car_improved))

# 6. Apply improved car type model to likely bookers
X_recommend_car_type_enhanced = engineer_car_type_features(X_recommend_car_type)
recommended_car_types_encoded = improved_car_type_pipeline.predict(X_recommend_car_type_enhanced)
car_type_probabilities = improved_car_type_pipeline.predict_proba(X_recommend_car_type_enhanced)

# Get the top 3 recommended car types for each user
top_3_indices = np.argsort(-car_type_probabilities, axis=1)[:, :3]

# Create arrays for each alternative recommendation
recommended_car_types = label_encoder.inverse_transform(recommended_car_types_encoded)
alternative_1 = label_encoder.inverse_transform(top_3_indices[:, 1])
alternative_2 = label_encoder.inverse_transform(top_3_indices[:, 2])

# Get corresponding probabilities
top_3_probabilities = np.array([car_type_probabilities[i, top_3_indices[i]] for i in range(len(top_3_indices))])

# 7. Create final recommendations dataframe
final_recommendations = pd.DataFrame({
    'User_ID': likely_bookers_data['User_ID'].values,
    'Recommended_Car_Type': recommended_car_types,
    'Recommendation_Confidence': np.max(car_type_probabilities, axis=1),
    'Alternative_1': alternative_1,
    'Alternative_1_Confidence': top_3_probabilities[:, 1],
    'Alternative_2': alternative_2,
    'Alternative_2_Confidence': top_3_probabilities[:, 2]
})

# Merge with improved car booking likelihood
final_recommendations = final_recommendations.merge(
    likely_car_bookers[['User_ID', 'car_booking_likelihood']], 
    on='User_ID'
)

# Sort by booking likelihood and recommendation confidence
final_recommendations = final_recommendations.sort_values(
    by=['car_booking_likelihood', 'Recommendation_Confidence'], 
    ascending=False
)

print("\nFinal Recommendations (Top 10):")
print(final_recommendations.head(10))


Improved Car Type Model Performance:
              precision    recall  f1-score   support

   Hatchback       0.22      0.23      0.23       188
      Luxury       0.23      0.24      0.24       185
         SUV       0.25      0.21      0.23       184
       Sedan       0.27      0.28      0.27       198

    accuracy                           0.24       755
   macro avg       0.24      0.24      0.24       755
weighted avg       0.24      0.24      0.24       755


Final Recommendations (Top 10):
Empty DataFrame
Columns: [User_ID, Recommended_Car_Type, Recommendation_Confidence, Alternative_1, Alternative_1_Confidence, Alternative_2, Alternative_2_Confidence, car_booking_likelihood]
Index: []


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
import lightgbm as lgb
from sklearn.cluster import KMeans

class EnhancedCarRecommender:
    def __init__(self):
        self.booking_model = None
        self.car_type_model = None
        self.user_segments = None
        self.label_encoder = LabelEncoder()
        self.version = "1.0.0"
        self.recommendation_history = []
        
    def engineer_features(self, df):
        """Enhanced feature engineering with temporal and user segments"""
        df_new = df.copy()
        
        # Basic features from original implementation
        if 'Flight_Distance' in df_new.columns and 'Hotel_stay' in df_new.columns:
            df_new['distance_stay_ratio'] = df_new['Flight_Distance'] / (df_new['Hotel_stay'] + 1)
        
        if 'Flight_price' in df_new.columns and 'Hotel_TotalPrice' in df_new.columns:
            df_new['total_trip_cost'] = df_new['Flight_price'] + df_new['Hotel_TotalPrice']
            df_new['flight_price_ratio'] = df_new['Flight_price'] / (df_new['total_trip_cost'] + 1)
        
        # Add temporal features
        if 'Check_in_Hotel' in df_new.columns:
            df_new['Check_in_Hotel'] = pd.to_datetime(df_new['Check_in_Hotel'])
            df_new['month'] = df_new['Check_in_Hotel'].dt.month
            df_new['day_of_week'] = df_new['Check_in_Hotel'].dt.dayofweek
            df_new['is_weekend'] = df_new['day_of_week'].isin([5, 6]).astype(int)
            df_new['is_holiday_season'] = df_new['month'].isin([7, 8, 12]).astype(int)
        
        # Add location-based features
        if 'Arrival_place' in df_new.columns:
            popular_destinations = ['New York', 'Los Angeles', 'Chicago', 'Miami', 'Las Vegas']
            df_new['is_popular_destination'] = df_new['Arrival_place'].isin(popular_destinations).astype(int)
        
        return df_new
    
    def create_user_segments(self, df):
        """Create user segments based on behavior patterns"""
        features_for_clustering = ['total_trip_cost', 'Hotel_stay', 'Flight_Distance']
        
        # Prepare data for clustering
        cluster_data = df[features_for_clustering].copy()
        scaler = StandardScaler()
        cluster_data_scaled = scaler.fit_transform(cluster_data)
        
        # Perform clustering
        kmeans = KMeans(n_clusters=4, random_state=42)
        df['user_segment'] = kmeans.fit_predict(cluster_data_scaled)
        
        return df
    
    def handle_missing_data(self, df):
        """Handle missing data with appropriate strategies"""
        # Fill numerical columns with median
        numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
        df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
        
        # Fill categorical columns with mode
        categorical_cols = df.select_dtypes(include=['object']).columns
        df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])
        
        return df
    
    def prepare_temporal_cv(self, df):
        """Prepare time-based cross validation"""
        tscv = TimeSeriesSplit(n_splits=5)
        return tscv
    
    def fit(self, flights_data, hotels_data, cars_data):
        """Train the enhanced recommendation system"""
        # Merge datasets
        merged_data = self._merge_datasets(flights_data, hotels_data, cars_data)
        
        # Handle missing data
        merged_data = self.handle_missing_data(merged_data)
        
        # Create user segments
        merged_data = self.create_user_segments(merged_data)
        
        # Engineer features
        X = self.engineer_features(merged_data)
        
        # Prepare target variables
        y_booking = (merged_data['car_booking'] == 1).astype(int)
        y_car_type = self.label_encoder.fit_transform(merged_data[merged_data['car_booking'] == 1]['carType'])
        
        # Train booking likelihood model
        self.booking_model = self._train_booking_model(X, y_booking)
        
        # Train car type model
        self.car_type_model = self._train_car_type_model(
            X[merged_data['car_booking'] == 1],
            y_car_type
        )
        
        return self
    
    def _train_booking_model(self, X, y):
        """Train an enhanced booking likelihood model"""
        # Define features
        numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
        categorical_cols = X.select_dtypes(include=['object']).columns
        
        # Create pipeline
        pipeline = Pipeline([
            ('preprocessor', ColumnTransformer([
                ('num', StandardScaler(), numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
            ])),
            ('classifier', lgb.LGBMClassifier(
                objective='binary',
                boosting_type='dart',
                n_estimators=200,
                learning_rate=0.1,
                max_depth=5,
                num_leaves=31,
                feature_fraction=0.9
            ))
        ])
        
        # Train model
        pipeline.fit(X, y)
        return pipeline
    
    def _train_car_type_model(self, X, y):
        """Train an enhanced car type recommendation model"""
        numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
        categorical_cols = X.select_dtypes(include=['object']).columns
        
        pipeline = Pipeline([
            ('preprocessor', ColumnTransformer([
                ('num', StandardScaler(), numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
            ])),
            ('classifier', XGBClassifier(
                objective='multi:softprob',
                n_estimators=150,
                max_depth=5,
                learning_rate=0.1
            ))
        ])
        
        pipeline.fit(X, y)
        return pipeline
    
    def recommend(self, user_data):
        """Generate recommendations with monitoring"""
        # Engineer features
        X = self.engineer_features(user_data)
        X = self.handle_missing_data(X)
        X = self.create_user_segments(X)
        
        # Get booking likelihood
        booking_proba = self.booking_model.predict_proba(X)[:, 1]
        
        # Get car type recommendations for likely bookers
        likely_bookers = booking_proba >= 0.7
        car_type_proba = None
        if likely_bookers.any():
            car_type_proba = self.car_type_model.predict_proba(X[likely_bookers])
        
        # Create recommendations
        recommendations = self._create_recommendation_df(
            user_data,
            booking_proba,
            car_type_proba,
            likely_bookers
        )
        
        # Log recommendations
        self._log_recommendations(recommendations)
        
        return recommendations
    
    def _create_recommendation_df(self, user_data, booking_proba, car_type_proba, likely_bookers):
        """Create detailed recommendations dataframe"""
        recommendations = pd.DataFrame({
            'User_ID': user_data['User_ID'],
            'booking_likelihood': booking_proba,
            'timestamp': datetime.now(),
            'model_version': self.version
        })
        
        if car_type_proba is not None and likely_bookers.any():
            # Get top 3 car types
            top_3_indices = np.argsort(-car_type_proba, axis=1)[:, :3]
            car_types = self.label_encoder.inverse_transform(range(car_type_proba.shape[1]))
            
            # Add car type recommendations
            recommendations.loc[likely_bookers, 'recommended_car_type'] = car_types[top_3_indices[:, 0]]
            recommendations.loc[likely_bookers, 'alternative_1'] = car_types[top_3_indices[:, 1]]
            recommendations.loc[likely_bookers, 'alternative_2'] = car_types[top_3_indices[:, 2]]
            
            # Add confidence scores
            recommendations.loc[likely_bookers, 'recommendation_confidence'] = np.max(car_type_proba, axis=1)
            
        return recommendations
    
    def _log_recommendations(self, recommendations):
        """Log recommendations for monitoring"""
        self.recommendation_history.append({
            'timestamp': datetime.now(),
            'num_recommendations': len(recommendations),
            'avg_booking_likelihood': recommendations['booking_likelihood'].mean(),
            'model_version': self.version
        })
    
    def get_performance_metrics(self):
        """Get system performance metrics"""
        if not self.recommendation_history:
            return "No recommendations made yet"
        
        metrics = pd.DataFrame(self.recommendation_history)
        
        return {
            'total_recommendations': metrics['num_recommendations'].sum(),
            'avg_booking_likelihood': metrics['avg_booking_likelihood'].mean(),
            'recommendations_per_day': metrics.groupby(
                metrics['timestamp'].dt.date)['num_recommendations'].mean()
        }


In [3]:

# Initialize the recommender
recommender = EnhancedCarRecommender()

In [4]:
# Load the datasets
flights_data = pd.read_excel(r"D:\Make_my_trip\FinalDataset\FlightFINALdataset.xlsx")
hotels_data = pd.read_excel(r"D:\Make_my_trip\FinalDataset\HotelFINALdataset.xlsx")
cars_data = pd.read_excel(r"D:\Make_my_trip\FinalDataset\CarFINALdataset.xlsx")

In [5]:

# Train the system
recommender.fit(flights_data, hotels_data, cars_data)

AttributeError: 'EnhancedCarRecommender' object has no attribute '_merge_datasets'

In [None]:
user_data = pd.read_excel(r"D:\Make_my_trip\FinalDataset\updated_user.csv")

In [None]:
# Generate recommendations
recommendations = recommender.recommend(user_data)

In [None]:
# Get performance metrics
metrics = recommender.get_performance_metrics()
