# In [None]:


import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import numpy as np


# Build a reproducible, purely synthetic Titanic-shaped dataset.
# The columns are drawn in a fixed order because every draw advances the
# global NumPy RNG seeded below; reordering them would change the values.
np.random.seed(0)
N_ROWS = 1000
synthetic_columns = {
    'Survived': np.random.randint(0, 2, size=N_ROWS),   # binary target (0 or 1)
    'Age': np.random.randint(18, 100, size=N_ROWS),     # integers in [18, 100)
    'Fare': np.random.uniform(10, 100, size=N_ROWS),    # floats in [10, 100)
    'Sex': np.random.choice(['male', 'female'], size=N_ROWS),
    'Embarked': np.random.choice(['S', 'C', 'Q'], size=N_ROWS),
}
data = pd.DataFrame(data=synthetic_columns)

# Persist the frame (without the index column) so it can be re-read from disk.
# NOTE: this overwrites any existing 'titanic.csv' in the working directory.
data.to_csv('titanic.csv', index=False)

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) to load.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails for any reason.
        On failure the error is reported on stdout rather than raised, so
        callers must check for ``None`` before using the result.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print("Error occurred while loading data:", str(err))
        return None
    return frame

def preprocess_data(data):
    """Split a frame into features/target and build the column preprocessor.

    Args:
        data: DataFrame containing a 'Survived' target column plus the
            feature columns 'Age', 'Fare', 'Sex' and 'Embarked'.

    Returns:
        A tuple ``(X, y, preprocessor)`` where ``X`` is ``data`` without the
        'Survived' column, ``y`` is the 'Survived' column, and
        ``preprocessor`` is an *unfitted* ``ColumnTransformer``.
    """
    # Numeric features: fill gaps with the median, then standardize.
    impute_and_scale = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical features: fill gaps with a 'missing' label, then one-hot
    # encode; categories unseen during fit are ignored at transform time.
    impute_and_encode = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each column group to its own sub-pipeline.
    column_routes = [
        ('num', impute_and_scale, ['Age', 'Fare']),
        ('cat', impute_and_encode, ['Sex', 'Embarked']),
    ]
    preprocessor = ColumnTransformer(transformers=column_routes)

    features = data.drop(columns=['Survived'])
    target = data['Survived']
    return features, target, preprocessor

def train_model(X, y, preprocessor):
    """Tune a random-forest pipeline with an exhaustive grid search.

    Args:
        X: Feature table passed to the pipeline.
        y: Target labels aligned with ``X``.
        preprocessor: Transformer used as the first pipeline step.

    Returns:
        The fitted ``GridSearchCV`` object (5-fold CV, accuracy scoring).
    """
    forest = RandomForestClassifier(random_state=42)
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', forest),
    ])

    # The 'clf__' prefix targets parameters of the 'clf' pipeline step.
    # 3 values for each of 4 parameters -> 81 candidate configurations.
    search_space = {
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [None, 5, 10],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 5, 10],
    }

    search = GridSearchCV(
        estimator=model,
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
    )
    search.fit(X, y)
    return search

def evaluate_model(grid_search, X, y):
    """Print cross-validation results for the best estimator of a search.

    Args:
        grid_search: Fitted search object exposing ``best_estimator_``,
            ``best_params_`` and ``best_score_``.
        X: Feature table used for the 5-fold cross-validation.
        y: Target labels aligned with ``X``.

    Returns:
        None. The results are written to stdout.
    """
    fold_accuracies = cross_val_score(
        grid_search.best_estimator_, X, y, cv=5, scoring='accuracy'
    )
    mean_accuracy = fold_accuracies.mean()

    print("Average Cross-Validation Accuracy:", mean_accuracy)
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)

# End-to-end run: load the CSV, split, tune on the training part, then report.
# NOTE: load_data returns None on failure, in which case the next line fails.
data = load_data('titanic.csv')
X, y, preprocessor = preprocess_data(data)
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Hyperparameter search and cross-validation only ever see the training split.
grid_search = train_model(X_train, y_train, preprocessor)
evaluate_model(grid_search, X_train, y_train)
# Predict on the held-out rows with the search object and print test metrics.
y_pred = grid_search.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))