In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, make_scorer

# Load the datasets
train_df = pd.read_csv('train.csv')
# NOTE(review): test_df is loaded here but not referenced again in the visible cells.
test_df = pd.read_csv('test.csv')

# Clean the training frame in one non-inplace chain:
#   * drop the 'Cabin' column,
#   * fill missing 'Age' with the column median and missing 'Embarked' with the
#     most frequent value,
#   * remove fully duplicated rows.
# The fill values are computed from the frame as loaded (before duplicates are
# removed). The result is assigned back instead of calling
# train_df[col].fillna(..., inplace=True): that form is chained assignment,
# which pandas 2.x warns about and which leaves train_df unchanged under
# Copy-on-Write (the default in pandas 3).
# NOTE(review): the median/mode use every row of train_df, including rows that
# end up in the validation split further down - consider imputing inside the
# modelling pipeline instead.
train_df = (
    train_df
    .drop(columns=['Cabin'])
    .fillna({
        'Age': train_df['Age'].median(),
        'Embarked': train_df['Embarked'].mode()[0],
    })
    .drop_duplicates()
)

# 'Survived' is the label; every other column of the cleaned frame stays in X
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

# Hold out 20% of the rows for validation (fixed seed keeps the split reproducible)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns routed to each preprocessing branch
num_cols = ['Age', 'Fare', 'SibSp', 'Parch']
cat_cols = ['Pclass', 'Sex', 'Embarked']

# Numeric branch: standardise the values.
num_transformer = StandardScaler()
# Categorical branch: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fitted.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column list and concatenate the results.
# No `remainder` is passed, so ColumnTransformer's default applies to the
# columns of X that appear in neither list.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

# Full model: preprocessing followed by a seeded random forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Search space for the forest (3 * 4 * 3 * 3 = 108 combinations). The
# 'classifier__' prefix addresses the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Two metrics are tracked for every candidate
scoring = dict(
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
)

# 5-fold cross-validated grid search; refit='precision' selects the winning
# candidate by its precision score and refits it after the search.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    refit='precision',
    cv=5,
    return_train_score=True,
)

# Run the cross-validated search on the training split
grid_search.fit(X_train, y_train)

# Predict the held-out validation rows and score those predictions
y_pred = grid_search.predict(X_val)
precision, recall = precision_score(y_val, y_pred), recall_score(y_val, y_pred)

# Collect the selected hyperparameters together with the validation metrics
results = dict(
    best_params=grid_search.best_params_,
    precision=precision,
    recall=recall,
)

results


{'best_params': {'classifier__max_depth': None,
  'classifier__min_samples_leaf': 4,
  'classifier__min_samples_split': 2,
  'classifier__n_estimators': 100},
 'precision': 0.8253968253968254,
 'recall': 0.7027027027027027}

In [None]:
# Pipeline:

# We create a pipeline that includes preprocessing steps (standardization and one-hot encoding) and the Random Forest classifier.
# Hyperparameter Tuning:

# We define a grid of hyperparameters to tune: number of estimators, max depth, min samples split, and min samples leaf.
# GridSearchCV is used for hyperparameter tuning with 5-fold cross-validation. The model is evaluated using precision and recall metrics.
# Model Evaluation:

# The best hyperparameters are the combination with the highest mean cross-validated precision (the search is run with refit='precision').
# We then evaluate the refit model on the held-out 20% validation set and report its precision and recall scores.