Step 1: Set Up the Basic ML Pipeline


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ---------------------------------------------------------------------------
# Data: read the cleaned Titanic table and preview it
# ---------------------------------------------------------------------------
data = pd.read_csv('titanic_cleaned.csv')
print("First 5 rows of the dataset:", data.head(), sep="\n")

# 'Survived' is the label; every remaining column goes into the feature frame.
y = data['Survived']
X = data.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------------------
# Preprocessing: one branch per column family
# ---------------------------------------------------------------------------
numerical_cols = ['Age', 'Fare', 'FamilySize']
categorical_cols = ['Pclass', 'Sex', 'Embarked']

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Only the columns listed here reach the model: ColumnTransformer's default
# remainder='drop' discards anything else that is still present in X.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# ---------------------------------------------------------------------------
# Model: preprocessing + random forest chained into a single estimator
# ---------------------------------------------------------------------------
model = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Fitting the pipeline fits the preprocessor on the training rows only, so the
# held-out rows do not influence the scaling statistics or encoder categories.
pipeline.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Evaluation on the held-out split
# ---------------------------------------------------------------------------
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ---------------------------------------------------------------------------
# Persistence: store the fitted pipeline (preprocessing + model) in one file
# ---------------------------------------------------------------------------
joblib.dump(pipeline, 'titanic_pipeline.pkl')
print("\nPipeline saved as 'titanic_pipeline.pkl'")
