## Model Trainer
### Steps
- Feature Selection
- Feature Engineering
- Categorical encoding
- Feature Scaling
- Splitting dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt


#### Import the CSV data as a pandas DataFrame

In [10]:
# Load the cleaned dataset and print its schema.
# The path is relative so the notebook is not tied to one user's machine.
# Assumes the kernel's working directory is the folder that holds this
# notebook and clean_data.csv (Jupyter's default) — adjust DATA_PATH otherwise.
DATA_PATH = 'clean_data.csv'
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19800 entries, 0 to 19799
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip ID                19800 non-null  object 
 1   User ID                19800 non-null  object 
 2   Destination            19800 non-null  object 
 3   Departure Date         19800 non-null  object 
 4   Return Date            19800 non-null  object 
 5   Travel Type            19800 non-null  object 
 6   Transportation Mode    19800 non-null  object 
 7   Hotel Rating           19800 non-null  int64  
 8   Total Cost ($)         19800 non-null  float64
 9   Customer_Satisfaction  19800 non-null  float64
 10  Satisfaction_level     19800 non-null  object 
 11  Departure Month        19800 non-null  object 
 12  Trip_Duration          19800 non-null  int64  
dtypes: float64(2), int64(2), object(9)
memory usage: 2.0+ MB


#### Drop irrelevant columns

In [11]:
# 1. Drop irrelevant columns
# Removes the ID columns, the raw date columns and the numeric
# Customer_Satisfaction score (presumably the source of Satisfaction_level,
# so keeping it would leak the target — TODO confirm).
# Note: df is reassigned here, so re-running this cell on its own raises a
# KeyError because the columns are already gone.
columns_to_drop = [
    'Trip ID',
    'User ID',
    'Departure Date',
    'Return Date',
    'Customer_Satisfaction',
]
df = df.drop(columns=columns_to_drop)

#### Separate features and target

In [12]:
# 3. Separate features and target
target_column = 'Satisfaction_level'
y = df[target_column]                  # label the models will predict
X = df.drop(columns=[target_column])   # every remaining column is a feature

In [None]:
# 4. Train_test split
# 80/20 hold-out with a fixed seed so the split is reproducible.
# stratify=y keeps the proportion of each Satisfaction_level class the same in
# the train and test partitions, so per-class metrics are not distorted by an
# uneven random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [17]:
# Re-inspect the frame after the drop: only the feature columns and the
# Satisfaction_level target should remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19800 entries, 0 to 19799
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Destination          19800 non-null  object 
 1   Travel Type          19800 non-null  object 
 2   Transportation Mode  19800 non-null  object 
 3   Hotel Rating         19800 non-null  int64  
 4   Total Cost ($)       19800 non-null  float64
 5   Satisfaction_level   19800 non-null  object 
 6   Departure Month      19800 non-null  object 
 7   Trip_Duration        19800 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 1.2+ MB


In [None]:

# 5. Preprocessing(Encoding and standardization)
# Columns are routed by dtype: object columns are treated as categorical,
# all other columns as numerical. Only the column names are taken from X;
# the transformers themselves are fit later, inside the model pipeline.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns


preprocessor = ColumnTransformer(
    transformers=[
        # Numerical: Imputation + Scaling
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),

        # Categorical: Imputation + OneHotEncoding
        # drop='first' is intentionally not used: combined with
        # handle_unknown='ignore' it raises on scikit-learn < 1.0, and on newer
        # versions an unseen category becomes all zeros, i.e. indistinguishable
        # from the dropped first category.
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ],
    # Always return a dense array. With the default threshold the stacked
    # output may be a sparse matrix, which GaussianNB cannot consume.
    sparse_threshold=0
)

In [None]:
# 6. Model definitions
# Maps a display name to an unfitted classifier. Every estimator that uses
# randomness gets the same fixed seed so repeated runs give identical results
# (LogisticRegression with its default solver and GaussianNB are deterministic).
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

In [25]:
# Sanity-check the split sizes: (rows, feature columns) for train and test.
X_train.shape, X_test.shape

((15840, 7), (3960, 7))

In [None]:
# 7. Train, Predict, and Evaluate each model
for name, model in models.items():
    print(f"\n--- {name} ---")

    # Build pipeline (Preprocessing + Model). Fitting the preprocessing inside
    # the pipeline means imputers, scaler and encoder only ever see X_train.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Train Model
    pipeline.fit(X_train, y_train)

    # Predict on test set
    y_pred = pipeline.predict(X_test)

    # Evaluation metrics
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix.
    # The class order is taken from the fitted classifier and passed explicitly,
    # so the matrix rows/columns and the tick labels are guaranteed to agree.
    class_labels = pipeline.named_steps['classifier'].classes_
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(
        conf_matrix,
        annot=True,
        cmap='Blues',
        fmt='d',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()
    # Release the figure so one-figure-per-model does not accumulate in memory.
    plt.close(fig)