In [2]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
 import seaborn as sns

In [4]:
 df=sns.load_dataset('tips')

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
 X=df.iloc[:,1:]
 y=df['total_bill']

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [13]:
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, 
strategy="mean")),
    ("scaler", StandardScaler()),
    ]
 )

In [14]:
 from sklearn import set_config

In [15]:
set_config(display='diagram')

In [16]:
numeric_preprocessor

In [39]:
Pipeline(steps=[('imputation_mean', SimpleImputer()),
                ('scaler', StandardScaler())])

In [40]:
categorical_preprocessor = Pipeline(
    steps=[
        (
            "imputation_constant",
            SimpleImputer(fill_value="missing", strategy="constant"),
        ),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
 )

In [41]:
 preprocessor=Pipeline(
    steps=[("categorical",categorical_preprocessor),
 ("numerical",numeric_preprocessor)]
 )

In [42]:
 preprocessor

In [43]:
 pipe=Pipeline(
    [("preprocessor",preprocessor),
 ("regressor",RandomForestRegressor())]
 )

In [44]:
 pipe

In [46]:
from sklearn.compose import ColumnTransformer

# Define categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Correct: Use ColumnTransformer to apply preprocessors column-wise
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, categorical_cols),
        ("numerical", numeric_preprocessor, numerical_cols),
    ]
)

pipe = Pipeline(
    [("preprocessor", preprocessor), ("regressor", RandomForestRegressor())]
)

pipe.fit(X_train, y_train)

In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load dataset 
df = sns.load_dataset('tips') 

In [5]:
# Features and target 
X = df.drop('total_bill', axis=1) 
y = df['total_bill'] > df['total_bill'].median()  # Convert to binary classification for demo

In [6]:

# Split dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=42)

In [7]:
# Identify columns 
categorical_cols = X.select_dtypes(include='object').columns.tolist() 
numerical_cols = X.select_dtypes(include=['int64', 
'float64']).columns.tolist()

In [8]:
# Pipelines 
numeric_preprocessor = Pipeline([ 
('imputer', SimpleImputer(strategy='mean')), 
('scaler', StandardScaler()) 
]) 
categorical_preprocessor = Pipeline([ 
('imputer', SimpleImputer(strategy='constant', fill_value='missing')), 
('encoder', OneHotEncoder(handle_unknown='ignore')) 
])

In [9]:
preprocessor = ColumnTransformer([ 
('num', numeric_preprocessor, numerical_cols), 
('cat', categorical_preprocessor, categorical_cols) 
])

In [10]:
# Define models 
models = { 
'Decision Tree': DecisionTreeClassifier(), 
    'Random Forest': RandomForestClassifier(), 
    'Logistic Regression': LogisticRegression(max_iter=200) 
}

In [11]:
# Dictionary to store results 
results = {} 

In [12]:
# Training and Evaluation with Pipeline 
for name, model in models.items(): 
    print(f"\nTraining Model: {name}") 
     
    


Training Model: Decision Tree

Training Model: Random Forest

Training Model: Logistic Regression


In [16]:
# Create pipeline 
pipe = Pipeline(steps=[ 
        ('preprocessor', preprocessor), 
        ('classifier', model) 
    ])

In [17]:
  # Fit 
pipe.fit(X_train, y_train) 
     
   

In [19]:
 # Predict 
train_pred = pipe.predict(X_train) 
test_pred = pipe.predict(X_test) 

In [22]:
 # Metrics 
train_acc = accuracy_score(y_train, train_pred) 
test_acc = accuracy_score(y_test, test_pred) 
     
results[name] = { 
        'model': pipe, 
        'train_accuracy': train_acc, 
        'test_accuracy': test_acc, 
        'train_predictions': train_pred, 
        'test_predictions': test_pred, 
        'confusion_matrix': confusion_matrix(y_test, test_pred), 
        'classification_report': classification_report(y_test, test_pred) 
    }

In [23]:
# Summary 
print("\nMODEL PERFORMANCE SUMMARY:") 
for name, info in results.items(): 
    print(f"\n=== {name} ===") 
    print(f"Training Accuracy: {info['train_accuracy']:.4f}") 
    print(f"Testing Accuracy: {info['test_accuracy']:.4f}") 
    print("Confusion Matrix (Test):") 
    print(info['confusion_matrix']) 
    print("Classification Report (Test):") 
    print(info['classification_report'])


MODEL PERFORMANCE SUMMARY:

=== Logistic Regression ===
Training Accuracy: 0.7487
Testing Accuracy: 0.7755
Confusion Matrix (Test):
[[23  3]
 [ 8 15]]
Classification Report (Test):
              precision    recall  f1-score   support

       False       0.74      0.88      0.81        26
        True       0.83      0.65      0.73        23

    accuracy                           0.78        49
   macro avg       0.79      0.77      0.77        49
weighted avg       0.78      0.78      0.77        49



In [25]:
# Best model by test accuracy 
best_model = max(results.items(), key=lambda x: x[1]['test_accuracy']) 
print(f"\nBest Model: {best_model[0]} with Test Accuracy: {best_model[1]['test_accuracy']:.4f}")


Best Model: Logistic Regression with Test Accuracy: 0.7755
