# Titanic: train and save best model

In [4]:
import pandas as pd
from pathlib import Path
import joblib
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Anchor every path to the absolute working directory.
# A relative Path('.') is its own .parent, so with BASE = Path('.') the
# expression BASE.parent / 'model.pkl' pointed at the notebook's own folder
# instead of the directory one level above it.
BASE = Path.cwd()
DATA = BASE / 'train.csv'                # training CSV read by the load cell
OUT_LOCAL = BASE / 'best_model.pkl'      # model copy kept in the working directory
OUT_WEBAPP = BASE.parent / 'model.pkl'   # model copy one level up (presumably the web app's folder — confirm)
print('data ->', DATA.resolve())

data -> C:\Users\Ilyas\Projects\MLOps Labs\lab1-flask\furniture_prediction\assignment\train.csv


In [6]:
# Read the training CSV at DATA into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA)
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# preprocess
# Column groups: NUM_COLS feed the numeric branch of the ColumnTransformer,
# CAT_COLS + REMAIN feed the categorical branch.
NUM_COLS = ['Age','SibSp','Parch','Fare']
CAT_COLS = ['Sex','Embarked']
REMAIN = ['Pclass']

df['Survived'] = pd.to_numeric(df['Survived'], errors='coerce')
# errors='coerce' turns unparseable labels into NaN instead of raising, so check
# explicitly and stop here rather than letting NaN targets reach model fitting.
n_bad_labels = int(df['Survived'].isna().sum())
if n_bad_labels:
    raise ValueError(
        f"'Survived' has {n_bad_labels} missing or non-numeric value(s); "
        "fix or drop those rows before training."
    )

# Missing embarkation values are filled with 'S' before X is sliced out.
df['Embarked'] = df['Embarked'].fillna('S')
X = df[NUM_COLS + CAT_COLS + REMAIN]
y = df['Survived']
X.shape, y.shape

((891, 7), (891,))

In [8]:
# Numeric branch: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: fill gaps with the literal 'missing', then one-hot encode
# (handle_unknown='ignore' so unseen categories do not raise at transform time).
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# REMAIN (Pclass) is routed through the categorical branch together with CAT_COLS.
pre = ColumnTransformer(transformers=[
    ('num', num_pipeline, NUM_COLS),
    ('cat', cat_pipeline, CAT_COLS + REMAIN),
])

# Candidate classifiers, keyed by the label used in the printed report.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
}

# Score each candidate with 5-fold cross-validated ROC AUC and keep the mean.
results = {}
for model_name, estimator in models.items():
    pipe = Pipeline(steps=[('pre', pre), ('clf', estimator)])
    scores = cross_val_score(pipe, X, y, cv=5, scoring='roc_auc', n_jobs=-1)
    mean_auc = scores.mean()
    results[model_name] = mean_auc
    print(f'{model_name}: ROC AUC = {mean_auc:.4f}')

# Leaderboard, best mean score first.
sorted(results.items(), key=lambda item: item[1], reverse=True)

LogisticRegression: ROC AUC = 0.8484
RandomForest: ROC AUC = 0.8602
GradientBoosting: ROC AUC = 0.8686


[('GradientBoosting', np.float64(0.868564781535053)),
 ('RandomForest', np.float64(0.8601657693573858)),
 ('LogisticRegression', np.float64(0.8483605974572445))]

In [9]:
# Pick the candidate with the highest mean CV score recorded in `results`.
best_name = max(results, key=lambda candidate: results[candidate])
best_model = models[best_name]

# Refit preprocessing + winning classifier on the full dataset.
best_pipe = Pipeline(steps=[('pre', pre), ('clf', best_model)])
best_pipe.fit(X, y)

# Persist the fitted pipeline to both output locations.
for destination in (OUT_LOCAL, OUT_WEBAPP):
    joblib.dump(best_pipe, destination)
print('Saved best model:', best_name)

Saved best model: GradientBoosting
