In [2]:
import pandas as pd

# Location of the customer booking CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs as-is where this exact file exists — consider a configurable data directory.
filepath = "C:\\Users\\mango\\OneDrive\\Desktop\\Data Analytics Projects\\British Airways\\customer_booking.csv"

# Load dataset (decoded as latin1 instead of pandas' default utf-8)
df = pd.read_csv(filepath, encoding='latin1')

# Basic exploration
print(df.head())          # first five rows
# DataFrame.info() writes its summary to stdout itself and returns None;
# wrapping it in print() would emit an extra "None" line.
df.info()
print(df.describe())      # summary statistics
print(df.isnull().sum())  # missing-value count per column


   num_passengers sales_channel  trip_type  purchase_lead  length_of_stay  \
0               2      Internet  RoundTrip            262              19   
1               1      Internet  RoundTrip            112              20   
2               2      Internet  RoundTrip            243              22   
3               1      Internet  RoundTrip             96              31   
4               2      Internet  RoundTrip             68              22   

   flight_hour flight_day   route booking_origin  wants_extra_baggage  \
0            7        Sat  AKLDEL    New Zealand                    1   
1            3        Sat  AKLDEL    New Zealand                    0   
2           17        Wed  AKLDEL          India                    1   
3            4        Sat  AKLDEL    New Zealand                    0   
4           15        Wed  AKLDEL          India                    1   

   wants_preferred_seat  wants_in_flight_meals  flight_duration  \
0                     0        

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# This cell expects `df` to already be in memory (created by the loading cell).

# Target vector and feature matrix
y = df['booking_complete']
X = df.drop(columns='booking_complete')

# Hold out 20% of the rows for testing; the seed is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column groups selected by dtype: int64/float64 columns and object columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# One-hot encode the object columns; every other column is passed through as-is
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [('cat', categorical_encoder, categorical_cols)],
    remainder='passthrough',
)

# Chain the preprocessing step with a Random Forest classifier
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Fit on the training split, then predict the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

# 5-fold cross-validation of the same pipeline on the full X / y
cv_scores = cross_val_score(model, X, y, cv=5)
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)

print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV accuracy: {cv_mean:.2f} +/- {cv_std:.2f}')

# Extract feature importances from the trained model.
# The 'cat' entry of the fitted ColumnTransformer is the OneHotEncoder itself
# (it was not wrapped in a Pipeline), so it is used directly.
one_hot_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
one_hot_encoded_columns = one_hot_encoder.get_feature_names_out(input_features=categorical_cols)

# ColumnTransformer emits the columns of its listed transformers first and the
# remainder='passthrough' columns last (in their original order). The names must
# follow that same order to line up with `feature_importances_`. The passthrough
# set is every column not one-hot encoded, whatever its dtype.
encoded_inputs = set(categorical_cols)
passthrough_cols = [col for col in X.columns if col not in encoded_inputs]
feature_names = list(one_hot_encoded_columns) + passthrough_cols

# Create a DataFrame to visualize feature importance
feature_importances = model.named_steps['classifier'].feature_importances_
assert len(feature_names) == len(feature_importances), (
    "feature names and importances are misaligned"
)
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plotting feature importances.
# One-hot encoding can produce a very large number of columns (one per distinct
# category value), so only the strongest features are drawn to keep the chart
# legible; the complete ranking remains available in `feature_importance_df`.
TOP_N_FEATURES = 20
top_features = feature_importance_df.head(TOP_N_FEATURES)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
ax.invert_yaxis()  # Invert y-axis to have the most important feature on top
plt.show()


Accuracy: 0.8551
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.98      0.92      8520
           1       0.54      0.13      0.21      1480

    accuracy                           0.86     10000
   macro avg       0.71      0.56      0.56     10000
weighted avg       0.82      0.86      0.81     10000


Confusion Matrix:
[[8360  160]
 [1289  191]]
