## Load and Clean Data

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
def load_data(data_file):
    """Read a CSV file into a DataFrame.

    Args:
        data_file: Path of the CSV file to load (anything accepted by
            ``pd.read_csv`` works).

    Returns:
        pd.DataFrame: The parsed contents of ``data_file``.
    """
    return pd.read_csv(data_file)

def clean_data(data):
    """Return a copy of ``data`` with missing ``Age`` values mean-imputed.

    The input frame is left untouched, so re-running the calling cell (or
    inspecting the raw frame afterwards) always gives the same result.

    Args:
        data (pd.DataFrame): Frame containing a numeric ``Age`` column.

    Returns:
        pd.DataFrame: A new frame whose ``Age`` column is float-typed, with
        every missing value replaced by the mean of the observed ages.

    Raises:
        KeyError: If ``data`` has no ``Age`` column.
    """
    cleaned = data.copy()
    # Cast to float so the result dtype does not depend on whether any value
    # actually had to be filled.
    age = cleaned['Age'].astype(float)
    # Series.mean() skips NaN, so this is plain mean imputation. A column with
    # no observed values at all is returned unchanged (all NaN).
    cleaned['Age'] = age.fillna(age.mean())
    return cleaned

In [3]:
# Location of the raw CSV, relative to the notebook's working directory.
data_file = '../data/titanic.csv'
# Read the CSV into a DataFrame.
data = load_data(data_file)
# Fill missing Age values via clean_data.
cleaned_data = clean_data(data)

## Data Preprocessing

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(data, test_size=0.2, random_state=42):
    """Split ``data`` into scaled train/test features and their labels.

    ``Survived`` is used as the target, ``Sex`` is one-hot encoded, and
    ``Name`` is dropped. Every other column is handed to the scaler as-is,
    so it presumably has to be numeric already.

    Args:
        data (pd.DataFrame): Frame with ``Survived``, ``Sex`` and ``Name``
            columns plus the remaining feature columns.
        test_size: Forwarded to ``train_test_split``. Defaults to ``0.2``.
        random_state: Forwarded to ``train_test_split`` so the split is
            repeatable. Defaults to ``42``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``. The feature matrices
        are the scaler's outputs; the labels are the corresponding slices of
        ``data['Survived']``.

    Raises:
        KeyError: If ``Survived``, ``Sex`` or ``Name`` is missing.
    """
    X = data.drop('Survived', axis=1)
    y = data['Survived']
    # One-hot encode Sex; drop_first removes the redundant first level.
    X = pd.get_dummies(X, columns=['Sex'], drop_first=True)
    # Name is not used as a feature.
    X = X.drop('Name', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Fit the scaler on the training split only, then apply the same
    # transformation to the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [5]:
# Build the train/test split consumed by the training cells below.
X_train, X_test, y_train, y_test = preprocess_data(cleaned_data)

## Feature Engineering

In [6]:
import numpy as np

def create_features(data):
    """Return ``data`` with missing values in numeric columns mean-imputed.

    Non-numeric columns are passed through unchanged. The row index of
    ``data`` is preserved, so rows stay aligned even when the index is not a
    plain ``0..n-1`` range (for example after filtering or sampling).

    Args:
        data (pd.DataFrame): Frame with any mix of numeric and non-numeric
            columns.

    Returns:
        pd.DataFrame: A new frame with the non-numeric columns first, followed
        by the numeric columns cast to float with each missing value replaced
        by its column mean. A numeric column with no observed values is kept
        and stays all-NaN.
    """
    # Exclude non-numeric columns from imputation; cast to float so the output
    # dtype does not depend on whether a column needed filling.
    numeric_data = data.select_dtypes(include=[np.number]).astype(float)
    # DataFrame.mean() skips NaN, and fillna aligns the means by column label,
    # so the original index and column order are kept.
    numeric_data = numeric_data.fillna(numeric_data.mean())

    # Combine imputed numeric data with non-numeric data (aligned on index)
    non_numeric_data = data.select_dtypes(exclude=[np.number])
    combined_data = pd.concat([non_numeric_data, numeric_data], axis=1)

    return combined_data

In [7]:
# Mean-impute the numeric columns of the cleaned frame.
# NOTE(review): this rebinds `cleaned_data`, and it runs after the train/test
# split was already built in the preprocessing section, so none of the cells
# visible below consume the result — confirm whether it should precede the split.
cleaned_data = create_features(cleaned_data)

## Train and Evaluate the Model

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_model(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and report test accuracy.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The fitted ``RandomForestClassifier``, so callers can keep using it
        (for example to persist it to disk).
    """
    # Initialize the RandomForestClassifier with a fixed seed for repeatability
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Evaluate the model's performance (you can use other metrics as well)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    # Hand the fitted estimator back; without this the caller receives None.
    return model

In [9]:
# Fit and evaluate the random-forest model; the value bound here is what the
# "Save the Trained Model" section writes to disk.
trained_model = train_model(X_train, X_test, y_train, y_test)

Accuracy: 0.7752808988764045


## Save the Trained Model

In [10]:
import joblib

# Persist the object returned by train_model with joblib; the file is written
# relative to the notebook's working directory.
model_file = 'trained_model.pkl'
# joblib.dump returns the list of written file names, which the notebook shows
# as this cell's output.
joblib.dump(trained_model, model_file)

['trained_model.pkl']

## MLflow

In [11]:
import mlflow


def train_model(X_train, X_test, y_train, y_test, n_estimators=None, random_state=None):
    """
    Train a RandomForestClassifier, evaluate it, and log the run to MLflow.

    Hyperparameters can be passed directly, which keeps the notebook
    runnable end-to-end without user interaction. Any hyperparameter left as
    ``None`` is requested interactively via ``input()``.

    Args:
        X_train: Features for training.
        X_test: Features for testing.
        y_train: Target labels for training.
        y_test: Target labels for testing.
        n_estimators (int, optional): Number of trees. Prompted for when
            ``None``.
        random_state (int, optional): Seed for the classifier. Prompted for
            when ``None`` (``None`` here means "ask", not "unseeded").

    Returns:
        The fitted ``RandomForestClassifier``.

    Raises:
        ValueError: If a prompted value cannot be parsed as an integer.
    """
    # Prompt only for the parameter values the caller did not supply
    if n_estimators is None:
        n_estimators = int(input("Enter the n-estimators parameter: "))
    if random_state is None:
        random_state = int(input("Enter random_state parameter: "))

    # Initialize the RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Evaluate the model's performance (you can use other metrics as well)
    accuracy = accuracy_score(y_test, y_pred)

    # Log parameters and metrics in MLflow; each call creates one run
    with mlflow.start_run():
        mlflow.log_params({
            "n_estimators": n_estimators,
            "random_state": random_state,
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
        })

    print(f"Accuracy: {accuracy}")

    return model

In [12]:
# Calls the MLflow-logging train_model defined in the previous cell (it
# replaces the earlier definition of the same name). No hyperparameters are
# passed here, so the function prompts for them interactively.
train_model(X_train, X_test, y_train, y_test)

In [None]:
# Run `mlflow ui` in a terminal to browse the logged runs.