In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Handle missing values
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
train_df.drop(columns=['Cabin'], inplace=True)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# Remove duplicate data
train_df.drop_duplicates(inplace=True)

# Separate features and target variable from training data
X = train_df.drop(columns=['Survived'])
y = train_df['Survived']

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# List of numerical and categorical columns
num_cols = ['Age', 'Fare', 'SibSp', 'Parch']
cat_cols = ['Pclass', 'Sex', 'Embarked']

# Preprocessing for numerical data: scaling
num_transformer = StandardScaler()

# Preprocessing for categorical data: one-hot encoding
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

# Define the models and the hyperparameters to tune
models = {
    'logistic_regression': LogisticRegression(),
    'random_forest': RandomForestClassifier()
}

param_grids = {
    'logistic_regression': {
        'classifier__C': [0.1, 1, 10, 100]
    },
    'random_forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30]
    }
}

# Create a pipeline that combines preprocessing with the model
results = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Define the scoring metric
    scoring = {
        'precision': make_scorer(precision_score),
        'recall': make_scorer(recall_score)
    }
    
    # Create the GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring=scoring, refit='precision', return_train_score=True)
    
    # Fit the model
    grid_search.fit(X_train, y_train)
    
    # Evaluate on the validation set
    y_pred = grid_search.predict(X_val)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    
    # Store the results
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'precision': precision,
        'recall': recall
    }

# Display the results
results


{'logistic_regression': {'best_params': {'classifier__C': 0.1},
  'precision': 0.7878787878787878,
  'recall': 0.7027027027027027},
 'random_forest': {'best_params': {'classifier__max_depth': 10,
   'classifier__n_estimators': 100},
  'precision': 0.8181818181818182,
  'recall': 0.7297297297297297}}

In [None]:
# The Random Forest model is chosen as the best fit for the Titanic dataset problem due to its superior 
# performance in precision and recall, its ability to handle complex non-linear relationships, and its 
# robustness and flexibility in dealing with different types of data.