In [2]:
import pandas as pd

In [4]:
pdf = pd.read_csv('./data/titanic/train.csv')

In [7]:
def explore_dataset(dataframe):
    explanation = ""

    # Dataset structure, features, and target variable
    explanation += "Dataset Structure:\n"
    explanation += str(dataframe.info()) + "\n\n"

    # Summary statistics of numerical features
    explanation += "Summary Statistics of Numerical Features:\n"
    explanation += str(dataframe.describe()) + "\n\n"

    # Summary statistics of categorical features
    explanation += "Summary Statistics of Categorical Features:\n"
    explanation += str(dataframe.describe(include=['O'])) + "\n\n"

    # Identify missing or erroneous data
    explanation += "Missing Values:\n"
    explanation += str(dataframe.isnull().sum()) + "\n\n"

    return explanation

# Call the explore_dataset function
exploration_string = explore_dataset(pdf)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
def explore_dataset(dataframe):
    explanation = ""

    # Dataset structure, features, and target variable
    explanation += "Dataset Structure:\n"
    explanation += str(dataframe.info()) + "\n\n"

    # Summary statistics of numerical features
    explanation += "Summary Statistics of Numerical Features:\n"
    explanation += str(dataframe.describe()) + "\n\n"

    # Summary statistics of categorical features
    explanation += "Summary Statistics of Categorical Features:\n"
    explanation += str(dataframe.describe(include=['O'])) + "\n\n"

    # Unique values for categorical features
    explanation += "Unique Values of Categorical Features:\n"
    for column in dataframe.select_dtypes(include='object').columns:
        explanation += f"{column}: {dataframe[column].unique()}\n"

    # Identify missing or erroneous data
    explanation += "Missing Values:\n"
    explanation += str(dataframe.isnull().sum()) + "\n\n"

    return explanation

# Call the explore_dataset function
exploration_string = explore_dataset(pdf)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [39]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd

def preprocess_data(dataframe):
    # Handle missing data through imputation or removal for numerical features
    numeric_features = dataframe.select_dtypes(include=['int64', 'float64']).columns
    numeric_imputer = SimpleImputer(strategy='median')
    dataframe[numeric_features] = numeric_imputer.fit_transform(dataframe[numeric_features])

    # Handle missing data and perform one-hot encoding for categorical features
    categorical_features = dataframe.select_dtypes(include=['object']).columns
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    dataframe[categorical_features] = categorical_imputer.fit_transform(dataframe[categorical_features])
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_data = encoder.fit_transform(dataframe[categorical_features])
    encoded_columns = encoder.get_feature_names_out(categorical_features)
    encoded_dataframe = pd.DataFrame(encoded_data.toarray(), columns=encoded_columns)

    # Combine numerical and categorical features
    preprocessed_data = pd.concat([dataframe.drop(columns=categorical_features), encoded_dataframe], axis=1)

    return preprocessed_data

# Call the preprocess_data function
preprocessed_data = preprocess_data(pdf)

In [40]:
assert type(preprocessed_data) == pd.DataFrame

In [41]:
assert len(pdf.columns.tolist()) <= len(preprocessed_data.columns.tolist())

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

def initialize_classification_models():
    """
    Initialize classification machine learning models suitable for the Titanic dataset.

    Returns:
        models (list): List of initialized classification machine learning models.
    """
    models = [
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        SVC(),
        LogisticRegression(),
        KNeighborsClassifier(),
        MLPClassifier()
    ]

    return models

# Example usage:
models = initialize_classification_models()

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data.drop(columns=['Survived']), preprocessed_data['Survived'], test_size=0.3, random_state=42)

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

def train_models(X_train, y_train, models):
    """
    Train machine learning models using grid search with cross-validation to find the best hyperparameters.

    Parameters:
        X_train (DataFrame): Input features for training.
        y_train (DataFrame): Target variable for training.
        models (list): List of machine learning models to train.

    Returns:
        best_model (model): Best trained machine learning model.
        best_params (dict): Best hyperparameters found for the best model.
    """
    best_score = 0
    best_model = None
    best_params = None

    for model in models:
        if model == RandomForestClassifier:
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
        elif model == SVC:
            param_grid = {
                'C': [0.1, 1, 10],
                'kernel': ['linear', 'rbf', 'poly'],
                'gamma': ['scale', 'auto']
            }
        elif model == LogisticRegression:
            param_grid = {
                'C': [0.1, 1, 10],
                'solver': ['liblinear', 'lbfgs'],
                'max_iter': [100, 200, 500]
            }
        else:
            param_grid = {}

        # Grid search with cross-validation
        grid_search = GridSearchCV(estimator=model(), param_grid=param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)

        # Check if current model has better performance
        if grid_search.best_score_ > best_score:
            best_score = grid_search.best_score_
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_

    return best_model, best_params

# Define the list of machine learning models
classification_models = [RandomForestClassifier, SVC, LogisticRegression]

# Call the train_models function
best_model, best_params = train_models(X_train, y_train, classification_models)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [54]:
from sklearn.metrics import accuracy_score

def evaluate_model(best_model, X_test, y_test):
    """
    Evaluate the best model using accuracy metric on the test set.

    Parameters:
        best_model (model): Trained machine learning model.
        X_test (DataFrame): Input features for testing.
        y_test (DataFrame): Target variable for testing.

    Returns:
        accuracy (float): Accuracy score of the best model on the test set.
    """
    # Predict target variable on test set
    y_pred = best_model.predict(X_test)

    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Call the evaluate_model function
test_accuracy = evaluate_model(best_model, X_test, y_test)

In [55]:
test_accuracy

0.8022388059701493

In [56]:
test_pdf = pd.read_csv('./data/titanic/test.csv')
preprocess_test_data = preprocess_data(test_pdf)
y_pred = best_model.predict(preprocess_test_data)
test_pdf['Survived'] = y_pred
test_pdf[['PassengerId','Survived']].to_csv('submission.csv',index=False)

Feature names unseen at fit time:
- Cabin_A11
- Cabin_A18
- Cabin_A21
- Cabin_A29
- Cabin_A9
- ...
Feature names seen at fit time, yet now missing:
- Cabin_A10
- Cabin_A14
- Cabin_A16
- Cabin_A19
- Cabin_A20
- ...



ValueError: X has 868 features, but RandomForestClassifier is expecting 1730 features as input.