### Import packages

In [1]:
import math
from scipy.stats import ttest_ind

import pandas as pd
import numpy as np
import copy

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Creating functions

In [33]:
def load_data_set(data = 'train'):
    """_summary_
    This function reads the csv that contains the training data or test data for the project
    Args:
        data (str, optional): _description_. Defaults to 'train'.

    Returns:
        _type_: _description_
    """
    return pd.read_csv(f'../Data/{data}.csv')

def find_closest_cabin(fare_value, means_dataset):
    """_summary_
    This function finds the closest class in a class column using the mean value of other given column (fare in this case) and a given Fare value to compare.
    Args:
        fare_value (_type_): _description_
        means_dataset (_type_): _description_

    Returns:
        _type_: _description_
    """
    closest_letter = min(means_dataset['Fare'].keys(), key=lambda x: abs(means_dataset['Fare'][x] - fare_value))
    return closest_letter

def imput_age_by_pclass_and_sibsp(row_sibps_and_pclass, grouped_ages):
    """_summary_
    This function finds the mean/median or other metric value for the age grouped by SibSp and Pclass and imputs it into a DataFrame given a SibSp and a Pclass
    Args:
        row_sibps_and_pclass (_type_): _description_
        grouped_ages (_type_): _description_

    Returns:
        _type_: _description_
    """
    age_to_imput = grouped_ages[(grouped_ages['SibSp']== row_sibps_and_pclass['SibSp'])
                                &(grouped_ages['Pclass']== row_sibps_and_pclass['Pclass'])]['Age'].values[0]
    if not age_to_imput > 0:
        age_to_imput = grouped_ages['Age'].mean()
    return age_to_imput

def imput_fare_by_pclass(row_pclass, grouped_fares):
    """_summary_
    This function finds the mean/median or other metric value for the Fare grouped by Pclass and imputs it into a DataFrame given a Pclass
    Args:
        row_pclass (_type_): _description_
        grouped_fares (_type_): _description_

    Returns:
        _type_: _description_
    """
    age_to_imput = grouped_fares[(grouped_fares['Pclass']== row_pclass['Pclass'])]['Fare'].values[0]
    if not age_to_imput > 0:
        age_to_imput = grouped_fares['Fare'].mean()
    return age_to_imput

def feature_imputation(database):
    """_summary_
    
    This function unifies the imputation process for the features Cabin, Embarked and Age.
    
    Args:
        database (_type_): _description_
    """
    database['cabin_letter'] = database.apply(lambda x: x['Cabin'] if pd.isnull(x['Cabin']) else str(x['Cabin'])[0], axis = 1)

    mean_fares_by_cabin = database[database['Cabin'].notnull()][['Fare','cabin_letter']].groupby('cabin_letter').mean().sort_values(by = 'Fare')
    median_fares_by_cabin = database[database['Cabin'].notnull()][['Fare','cabin_letter']].groupby('cabin_letter').median().sort_values(by = 'Fare')
    database['imputed_cabin_letter_by_mean'] = database['cabin_letter'].fillna(database['Fare'].apply(lambda x: find_closest_cabin(x, mean_fares_by_cabin)))
    database['imputed_cabin_letter_by_median'] = database['cabin_letter'].fillna(database['Fare'].apply(lambda x: find_closest_cabin(x, median_fares_by_cabin)))

    mean_fares_by_port = database[database['Cabin'].notnull()][['Fare','Embarked']].groupby('Embarked').mean().sort_values(by = 'Fare')
    median_fares_by_port = database[database['Cabin'].notnull()][['Fare','Embarked']].groupby('Embarked').median().sort_values(by = 'Fare')
    database['imputed_Embarked_by_mean'] = database['Embarked'].fillna(database['Fare'].apply(lambda x: find_closest_cabin(x, mean_fares_by_port)))
    database['imputed_Embarked_by_median'] = database['Embarked'].fillna(database['Fare'].apply(lambda x: find_closest_cabin(x, median_fares_by_port)))

    mean_ages_grouped = database[['Age', 'SibSp', 'Pclass']].groupby(['SibSp', 'Pclass']).mean().reset_index()
    median_ages_grouped = database[['Age', 'SibSp', 'Pclass']].groupby(['SibSp', 'Pclass']).median().reset_index()
    database['imputed_Age_by_mean'] = database['Age'].fillna(database.apply(lambda x: imput_age_by_pclass_and_sibsp(x, mean_ages_grouped), axis = 1))
    database['imputed_Age_by_median'] = database['Age'].fillna(database.apply(lambda x: imput_age_by_pclass_and_sibsp(x, median_ages_grouped), axis = 1))
    
    mean_fares_grouped = database[['Fare', 'Pclass']].groupby(['Pclass']).mean().reset_index()
    median_fares_grouped = database[['Fare', 'Pclass']].groupby(['Pclass']).median().reset_index()
    database['imputed_Fare_by_mean'] = database['Fare'].fillna(database.apply(lambda x: imput_fare_by_pclass(x, mean_fares_grouped), axis = 1))
    database['imputed_Fare_by_median'] = database['Fare'].fillna(database.apply(lambda x: imput_fare_by_pclass(x, median_fares_grouped), axis = 1))
    
    return database

def feature_one_hot_encoding(database, columns_to_process):
    """_summary_
    This function creates multiple columns based on the categorical variables listed in the
    column_to_process parameter, where each column splits in n columns, having n as the 
    number of categories in the respective column.
    Args:
        database (_type_): _description_
        columns_to_process (_type_): _description_

    Returns:
        _type_: _description_
    """
    one_hot_db = pd.DataFrame()

    for column in columns_to_process:
        temp_one_hot_db = pd.get_dummies(database[column], prefix= column + '_')
        one_hot_db = pd.concat([one_hot_db, temp_one_hot_db], axis = 1)
        
    database = pd.concat([database, one_hot_db], axis = 1)
    
    return database

def feature_creation(database):
    """_summary_
    This function creates new features using the existent features (original or imputed), combining them,
    separating them or doing other processes.
    Args:
        database (_type_): _description_
    """
    database['surname'] = database['Name'].apply(lambda x: x.split(',')[0])
    #database['age_in_months'] = database['Age']*12
    database['imputed_age_in_months_by_mean'] = database['imputed_Age_by_mean']*12
    database['imputed_age_in_months_by_median'] = database['imputed_Age_by_median']*12
    database['family_members'] = database['SibSp'] + database['Parch']
    
    return database

def feature_scalation(database, columns_to_scale_and_transform):
    """_summary_
    This function scales the numerical features given in the columns_to_scale_and_transform list. It also
    gives the result of the column applying a logaritmit transformation
    Args:
        database (_type_): _description_
        columns_to_scale_and_transform (_type_): _description_

    Returns:
        _type_: _description_
    """
    variables_result = {}
    for column in columns_to_scale_and_transform:
        variables_result['scaler_for_' + column] = StandardScaler().fit(database[column].to_numpy().reshape(-1, 1))
        variables_result['scaled_' + column] = variables_result['scaler_for_' + column].transform(database[column].to_numpy().reshape(-1, 1))
        variables_result['log_' + column] = np.log(database[column])
        
        database = pd.concat([database, pd.DataFrame(variables_result['scaled_' + column], columns = ['scaled_' + column])], axis = 1)
        database = pd.concat([database, variables_result['log_' + column].rename("log_" + column)], axis = 1)

    return database

def feature_reduction(database, dimensions_to_reduce):
    
    pca = PCA(n_components = 2)
    pca.fit(database[dimensions_to_reduce])
    reduced_dimensions = pd.DataFrame(pca.transform(database[dimensions_to_reduce]), columns = ['First_component', 'Second_component'])
    
    return reduced_dimensions

def base_consolidation(database, reduced_dimensions, columns_to_discard_in_train, dimensions_to_reduce):

    X_train_set = database[[col for col in database.columns if col not in columns_to_discard_in_train + dimensions_to_reduce]]
    X_train_set = pd.concat([X_train_set, reduced_dimensions], axis = 1)
    try:
        Y_train_set = X_train_set.pop('Survived')
    except:
        Y_train_set = None
    return X_train_set, Y_train_set

def model_training(X_train_set, Y_train_set):
    
    print("--Starting Logistic Regression GridSearchCV--")
    logistic_parameters = {'max_iter':[1000, 5000, 10000]}
    logistic_regression = LogisticRegression(random_state=0)
    logistic_regression_fitted = GridSearchCV(logistic_regression, logistic_parameters, verbose=4).fit(X_train_set, Y_train_set)
    logistic_score = logistic_regression_fitted.score(X_train_set, Y_train_set)

    print("--Starting Random Forest GridSearchCV--")
    random_forest_parameters = {'n_estimators':[10, 100, 500]
                                ,'min_samples_split':[2, 3, 4, 5, 6, 7]}
    random_forest = RandomForestClassifier(random_state=0)
    random_forest_fitted = GridSearchCV(random_forest, random_forest_parameters, verbose=4).fit(X_train_set, Y_train_set)
    random_forest_score = random_forest_fitted.score(X_train_set, Y_train_set)

    print("--Starting Gradient Boosting GridSearchCV--")
    gradient_boosting_parameters = {'n_estimators':[500, 1000]
                                    ,'learning_rate':[0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
                                    ,'max_depth':[1, 2, 3]}
    gradient_boosting = GradientBoostingClassifier(random_state=0)
    gradient_boosting_fitted = GridSearchCV(gradient_boosting, gradient_boosting_parameters, verbose=4).fit(X_train_set, Y_train_set)
    gradient_boosting_score = gradient_boosting_fitted.score(X_train_set, Y_train_set)
    
    return logistic_regression_fitted, logistic_score, random_forest_fitted, random_forest_score, gradient_boosting_fitted, gradient_boosting_score

def adding_boolean_columns(train_dataset, test_dataset):
    
    cols_to_add = train_dataset[[col for col in train_dataset.columns if col not in test_dataset.columns]]
    for i, col in enumerate(cols_to_add.dtypes):
        if col == bool:
            test_dataset[cols_to_add.columns[i]] = False
    return test_dataset[train_dataset.columns]

def saving_dataset_to_upload(test_passeinger_id, test_dataset, models):
    
    temp = pd.DataFrame()
    for i, model in enumerate(models):
        temp[i] = model.predict(test_dataset)
    final_uploading_set = test_passeinger_id[['PassengerId']]
    final_uploading_set['Survived'] = temp.mean(axis = 1).round().astype(int)
    final_uploading_set.to_csv('../Data/final_submission.csv', index=False)
    return final_uploading_set

### Loading data

In [3]:
original_train_set = load_data_set()
original_test_set = load_data_set(data = 'test')

### EDA

 The percentage of nulls in the columns tell us that Age, Cabin and Embarked could be inferred. The Cabin could be difficult, but as it could be highly related with the Fare, something could be made

In [4]:
original_train_set.isnull().sum() / len(original_train_set) * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

Boxplots between some variables and Age, with the aim of finding relations to imput Age data

In [5]:
fig = make_subplots(
    rows=3, cols=2,
    specs=[[{}, {}],
           [{}, {}],
           [{"colspan": 2}, None]],
    subplot_titles=("Sex", "Embarked", "Pclass", "SibSp", "Parch")
    )
fig.add_trace(go.Box(x=original_train_set["Sex"], y=original_train_set["Age"], name="Age by Sex", boxpoints="all"), row = 1, col = 1)
fig.add_trace(go.Box(x=original_train_set["Embarked"], y=original_train_set["Age"], name="Age by Port", boxpoints="all"), row = 1, col = 2)
fig.add_trace(go.Box(x=original_train_set["Pclass"], y=original_train_set["Age"], name="Age by Pclass", boxpoints="all"), row = 2, col = 1)
fig.add_trace(go.Box(x=original_train_set["SibSp"], y=original_train_set["Age"], name="Age by SibSp", boxpoints="all"), row = 2, col = 2)
fig.add_trace(go.Box(x=original_train_set["Parch"], y=original_train_set["Age"], name="Age by Parch", boxpoints="all"), row = 3, col = 1)
fig.update_layout(height=700, showlegend=False, title_text="Age by different variables")
fig.show()

Correlations between Age and other variables to find relations

In [6]:
original_train_set[original_train_set.dtypes[(original_train_set.dtypes =='int64')|(original_train_set.dtypes =='float64')].index].corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [7]:
fig = px.imshow(original_train_set[original_train_set.dtypes[(original_train_set.dtypes =='int64')|(original_train_set.dtypes =='float64')].index].corr())
fig.show()

Given SibSp and Pclass the more related variables to Age, some boxplots are represented including the 3 variables

In [8]:
fig = px.box(original_train_set[original_train_set['Age'].isna()], y="SibSp", points="all", color = 'Pclass')
fig.show()

In [9]:
fig = px.box(original_train_set[original_train_set['SibSp']==0], y="Age", points="all", color = 'Pclass')
fig.show()

Boxplot of Embarked and Fare to see relations

In [10]:
fig = px.box(original_train_set, x="Embarked", y="Fare", points="all")
fig.show()

In [11]:
fig = px.box(original_train_set[original_train_set['Embarked'].isna()], y="Fare", points="all")
fig.show()

Applying The ideas from the dataset to create or fill columns:
* Surnames could be extract - But they're too many, so not used until necessary
* Age could be tourned into months.
* The cabins maybe related to Tickets, Fare or others.
* The port of embarcation could be inferred?
* The age could be imputed using SibSp and Pclass.

In [12]:
train_set = copy.deepcopy(original_train_set)
train_set['surname'] = train_set['Name'].apply(lambda x: x.split(',')[0])
train_set['cabin_letter'] = train_set.apply(lambda x: x['Cabin'] if pd.isnull(x['Cabin']) else str(x['Cabin'])[0], axis = 1)
mean_fares_by_cabin = train_set[train_set['Cabin'].notnull()][['Fare','cabin_letter']].groupby('cabin_letter').mean().sort_values(by = 'Fare')
median_fares_by_cabin = train_set[train_set['Cabin'].notnull()][['Fare','cabin_letter']].groupby('cabin_letter').median().sort_values(by = 'Fare')
train_set['imputed_cabin_letter_by_mean'] = train_set['cabin_letter'].fillna(train_set['Fare'].apply(lambda x: find_closest_cabin(x, mean_fares_by_cabin)))
train_set['imputed_cabin_letter_by_median'] = train_set['cabin_letter'].fillna(train_set['Fare'].apply(lambda x: find_closest_cabin(x, median_fares_by_cabin)))
mean_fares_by_port = train_set[train_set['Cabin'].notnull()][['Fare','Embarked']].groupby('Embarked').mean().sort_values(by = 'Fare')
median_fares_by_port = train_set[train_set['Cabin'].notnull()][['Fare','Embarked']].groupby('Embarked').median().sort_values(by = 'Fare')
train_set['imputed_Embarked_by_mean'] = train_set['Embarked'].fillna(train_set['Fare'].apply(lambda x: find_closest_cabin(x, mean_fares_by_port)))
train_set['imputed_Embarked_by_median'] = train_set['Embarked'].fillna(train_set['Fare'].apply(lambda x: find_closest_cabin(x, median_fares_by_port)))
mean_ages_grouped = train_set[['Age', 'SibSp', 'Pclass']].groupby(['SibSp', 'Pclass']).mean().reset_index()
median_ages_grouped = train_set[['Age', 'SibSp', 'Pclass']].groupby(['SibSp', 'Pclass']).median().reset_index()
train_set['imputed_Age_by_mean'] = train_set['Age'].fillna(train_set.apply(lambda x: imput_age_by_pclass_and_sibsp(x, mean_ages_grouped), axis = 1))
train_set['imputed_Age_by_median'] = train_set['Age'].fillna(train_set.apply(lambda x: imput_age_by_pclass_and_sibsp(x, median_ages_grouped), axis = 1))
mean_fares_grouped = train_set[['Fare', 'Pclass']].groupby(['Pclass']).mean().reset_index()
median_fares_grouped = train_set[['Fare', 'Pclass']].groupby(['Pclass']).median().reset_index()
train_set['imputed_Fare_by_mean'] = train_set['Fare'].fillna(train_set.apply(lambda x: imput_fare_by_pclass(x, mean_fares_grouped), axis = 1))
train_set['imputed_Fare_by_median'] = train_set['Fare'].fillna(train_set.apply(lambda x: imput_fare_by_pclass(x, median_fares_grouped), axis = 1))
train_set['age_in_months'] = train_set['Age']*12
train_set['imputed_age_in_months_by_mean'] = train_set['imputed_Age_by_mean']*12
train_set['imputed_age_in_months_by_median'] = train_set['imputed_Age_by_median']*12

Boxplot of cabin_letter and Fare to see relations

In [13]:
fig = px.box(train_set, x="cabin_letter", y="Fare", points="all")
fig.show()

### Relations between survived and numercial columns
Pclass, Sibsp, Parch and Fare seems to have relation with the Survived categorization, while Age just in the extreme cases.

In [14]:
fig = make_subplots(
    rows = int(len(train_set.dtypes[(train_set.dtypes =='int64')|(train_set.dtypes =='float64')])/2) - 1, cols=2,
    subplot_titles = ["Survived by "+col for col in train_set.dtypes[(train_set.dtypes =='int64')|(train_set.dtypes =='float64')].index if col not in ['PassengerId', 'Survived']])
height_per_row = 200
for idx, column in enumerate(train_set.dtypes[(train_set.dtypes =='int64')|(train_set.dtypes =='float64')].index):
    if column not in ['PassengerId', 'Survived']: 
        fig.add_trace(go.Box(x=train_set["Survived"], y=train_set[column], boxpoints="all"), row = math.floor(idx/2), col = idx%2 + 1)
fig.update_layout(height=height_per_row*(math.floor(idx/2) + 1), showlegend=False, title_text="Survived relations with numerical features")
fig.show()

### Relations between survived and non numercial columns
Sex, Embarked, cabin_letter and its imputations seems to have relation with Survived categorization

In [15]:
fig = make_subplots(
    rows = math.ceil(len(train_set.dtypes[(train_set.dtypes =='object')].index)/2)-2, cols=2,
    subplot_titles = ["Survived by "+ col for col in train_set.dtypes[(train_set.dtypes =='object')].index if col not in ['Name', 'Ticket', 'surname', 'Cabin']])
height_per_row = 200
idx = 0
for column in train_set.dtypes[(train_set.dtypes =='object')].index:
    grouped_train_set = train_set[['PassengerId', 'Survived', column]].groupby(['Survived', column]).count().reset_index()
    if column not in ['Name', 'Ticket', 'surname', 'Cabin']:
        survived_class = grouped_train_set['Survived'].unique()
        fig.add_trace(go.Bar(x=grouped_train_set[grouped_train_set['Survived']==survived_class[0]][column],
                             y=grouped_train_set[grouped_train_set['Survived']==survived_class[0]]['PassengerId'], 
                             name='Not Survived',
                             marker_color='Red',  
                             legendgroup = idx+1), row = math.floor(idx/2)+1, col = idx%2 + 1)
        fig.add_trace(go.Bar(x=grouped_train_set[grouped_train_set['Survived']==survived_class[1]][column],
                             y=grouped_train_set[grouped_train_set['Survived']==survived_class[1]]['PassengerId'],
                             name='Survived', 
                             marker_color='Blue',
                             legendgroup = idx+1), row = math.floor(idx/2)+1, col = idx%2 + 1)
        idx += 1
fig.update_layout(barmode='group', height=height_per_row*(math.floor(idx/2) + 1), title_text="Survived relations with categorical features", legend_tracegroupgap = height_per_row*(math.floor(idx/2) + 1))
fig.show()

### Numerical variables description after cleaning

In [16]:
train_set.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,imputed_Age_by_mean,imputed_Age_by_median,imputed_Fare_by_mean,imputed_Fare_by_median,age_in_months,imputed_age_in_months_by_mean,imputed_age_in_months_by_median
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,714.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,29.518326,29.247946,32.204208,32.204208,356.389412,354.219909,350.975354
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,13.246754,13.261699,49.693429,49.693429,174.317968,158.961045,159.140388
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.42,0.42,0.0,0.0,5.04,5.04,5.04
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,22.0,22.0,7.9104,7.9104,241.5,264.0,264.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,27.630201,26.0,14.4542,14.4542,336.0,331.56241,312.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,37.0,37.0,31.0,31.0,456.0,444.0,444.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,80.0,80.0,512.3292,512.3292,960.0,960.0,960.0


### Finding statistical differences between numerical variables
The only numerical variable that seems to have a difference between its mean by Survived category is Fare

In [17]:
list_of_columns_to_compare = ['Fare', 'imputed_Age_by_mean', 'imputed_Age_by_median', 'imputed_Fare_by_mean',	'imputed_Fare_by_median', 'imputed_age_in_months_by_mean', 'imputed_age_in_months_by_median']
for column_name in list_of_columns_to_compare:
    test_result = ttest_ind(train_set[train_set['Survived'] == 1][column_name], train_set[train_set['Survived'] == 0][column_name])
    if test_result.pvalue < 0.05:
        conclusion = f"there is difference for the mean {column_name} between those who survived and those who did not in the Titanic tragedy"
    else:
        conclusion = f"there is NO difference for the mean {column_name} between those who survived and those who did not in the Titanic tragedy"
        
    print(f"""The t-test two-sided test for the variable {column_name} to find differences between the survivors gave the following results:
           statistic: {test_result.statistic:.2f}
           pvalue: {test_result.pvalue:.4f}
           Given those results, the test allows to conclude that {conclusion}
           """ )

The t-test two-sided test for the variable Fare to find differences between the survivors gave the following results:
           statistic: 7.94
           pvalue: 0.0000
           Given those results, the test allows to conclude that there is difference for the mean Fare between those who survived and those who did not in the Titanic tragedy
           
The t-test two-sided test for the variable imputed_Age_by_mean to find differences between the survivors gave the following results:
           statistic: -1.63
           pvalue: 0.1042
           Given those results, the test allows to conclude that there is NO difference for the mean imputed_Age_by_mean between those who survived and those who did not in the Titanic tragedy
           
The t-test two-sided test for the variable imputed_Age_by_median to find differences between the survivors gave the following results:
           statistic: -1.44
           pvalue: 0.1501
           Given those results, the test allows to conclude t

### Creating new features throught feature engineering
### One hot encoding
As in general, classification models do not understand categories as one variable, a one hot encoding is made.

In [18]:
columns_to_onehot = ['Pclass', 'Sex', 'imputed_cabin_letter_by_mean', 'imputed_cabin_letter_by_median', 'imputed_Embarked_by_mean', 'imputed_Embarked_by_median']
one_hot_db = pd.DataFrame()

for column in columns_to_onehot:
    temp_one_hot_db = pd.get_dummies(train_set[column], prefix= column + '_')
    one_hot_db = pd.concat([one_hot_db, temp_one_hot_db], axis = 1)
    
train_set = pd.concat([train_set, one_hot_db], axis = 1)

### Combining features
In order to create new features, the Sipsp and Parch (like horizontal and vertical movements in the family tree) are unified to know the total family close persons in the Titanic

In [19]:
train_set['family_members'] = train_set['SibSp'] + train_set['Parch']

### Scaling and transforming columns with outliers
In order to improve the performance of the model, numerical variables are scaled to make them more comparable

In [20]:
columns_to_scale_and_transform = ["Fare", "imputed_Age_by_mean", "imputed_Age_by_median", "imputed_Fare_by_mean",	"imputed_Fare_by_median", "age_in_months", "imputed_age_in_months_by_mean", "imputed_age_in_months_by_median"]
variables_result = {}
for column in columns_to_scale_and_transform:
    variables_result['scaler_for_' + column] = StandardScaler().fit(train_set[column].to_numpy().reshape(-1, 1))
    variables_result['scaled_' + column] = variables_result['scaler_for_' + column].transform(train_set[column].to_numpy().reshape(-1, 1))
    variables_result['log_' + column] = np.log(train_set[column])
    
    train_set = pd.concat([train_set, pd.DataFrame(variables_result['scaled_' + column], columns = ['scaled_' + column])], axis = 1)
    train_set = pd.concat([train_set, variables_result['log_' + column].rename("log_" + column)], axis = 1)


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log



### PCA - Dimensionality reduction
After the EDA process, many numerical variables where created, each with different adjustments in their imputations and scalings, so a PCA is conducted in order to reduce the dimensionality created

In [21]:
dimensions_to_reduce = ['scaled_imputed_Age_by_mean', 'log_imputed_Age_by_mean',
                        'scaled_imputed_Age_by_median', 'log_imputed_Age_by_median', 'scaled_imputed_Fare_by_mean',
                        'scaled_imputed_Fare_by_median', 'scaled_imputed_age_in_months_by_mean',
                        'log_imputed_age_in_months_by_mean', 'scaled_imputed_age_in_months_by_median',
                        'log_imputed_age_in_months_by_median']
pca = PCA(n_components = 2)
pca.fit(train_set[dimensions_to_reduce])
reduced_dimensions = pd.DataFrame(pca.transform(train_set[dimensions_to_reduce]), columns = ['First_component', 'Second_component'])

In [22]:
fig = go.Figure(data=go.Scatter(x=reduced_dimensions['First_component'], y=reduced_dimensions['Second_component'], mode='markers', marker_color = train_set['Survived']))
fig.show()

Given the previous graph, it seems that the components don't predict as well the Survivor behaviour, but they help to reduce the number of numerical variables in the analysis

### Test NaN validation
As the train set could have different characteristics than the test set, a validation of NaN for the inputation process is made

In [23]:
original_test_set.isnull().sum() / len(original_test_set) * 100

PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.574163
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.239234
Cabin          78.229665
Embarked        0.000000
dtype: float64

In [24]:
fig = make_subplots(
    rows = int(len(original_test_set.dtypes[(original_test_set.dtypes =='int64')|(original_test_set.dtypes =='float64')])/2 -1), cols=2,
    subplot_titles = ["Fare by "+col for col in original_test_set.dtypes[(original_test_set.dtypes =='int64')|(original_test_set.dtypes =='float64')].index if col not in ['PassengerId', 'Fare']])
height_per_row = 200
for idx, column in enumerate([col for col in original_test_set.dtypes[(original_test_set.dtypes =='int64')|(original_test_set.dtypes =='float64')].index if col not in ['PassengerId', 'Fare']]):
    fig.add_trace(go.Box(x=original_test_set["Fare"], y=original_test_set[column], boxpoints="all"), row = math.floor(idx/2) + 1, col = idx%2 + 1)
fig.update_layout(height=height_per_row*(math.floor(idx/2) + 1), showlegend=False, title_text="Fare relations with numerical features")
fig.show()

### Model selection
First of all, a feature selection of the database is made, in order to select the features to use. Discarding the original features, or the imputed but not encoded, scaled or reduced.

In [25]:
columns_to_discard_in_train = ['PassengerId', 'Fare', 'Pclass','Name', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked',
    'surname', 'cabin_letter', 'imputed_cabin_letter_by_mean', 'scaled_Fare', 'imputed_cabin_letter_by_median',
    'imputed_Embarked_by_mean', 'imputed_Embarked_by_median', 'imputed_Age_by_mean', 'imputed_Age_by_median',
    'age_in_months', 'imputed_age_in_months_by_mean', 'imputed_age_in_months_by_median', 'log_Fare', 
    'log_imputed_Fare_by_mean', 'log_imputed_Fare_by_median', 'scaled_age_in_months', 'log_age_in_months']

X_train_set = train_set[[col for col in train_set.columns if col not in columns_to_discard_in_train + dimensions_to_reduce]]
X_train_set = pd.concat([X_train_set, reduced_dimensions], axis = 1)

Y_train_set = X_train_set.pop('Survived')

In [26]:
train_set = feature_imputation(original_train_set)
columns_to_onehot = ['Pclass', 'Sex', 'imputed_cabin_letter_by_mean', 'imputed_cabin_letter_by_median', 'imputed_Embarked_by_mean', 'imputed_Embarked_by_median']
train_set = feature_one_hot_encoding(train_set, columns_to_onehot)
train_set = feature_creation(train_set)
columns_to_scale_and_transform = ["Fare", "imputed_Age_by_mean", "imputed_Age_by_median", "imputed_Fare_by_mean", "imputed_Fare_by_median", "imputed_age_in_months_by_mean", "imputed_age_in_months_by_median"]
train_set = feature_scalation(train_set, columns_to_scale_and_transform)
dimensions_to_reduce = ['scaled_imputed_Age_by_mean', 'log_imputed_Age_by_mean',
                        'scaled_imputed_Age_by_median', 'log_imputed_Age_by_median', 'scaled_imputed_Fare_by_mean',
                        'scaled_imputed_Fare_by_median', 'scaled_imputed_age_in_months_by_mean',
                        'log_imputed_age_in_months_by_mean', 'scaled_imputed_age_in_months_by_median',
                        'log_imputed_age_in_months_by_median']
reduced_dimensions = feature_reduction(train_set, dimensions_to_reduce)
columns_to_discard_in_train = ['PassengerId', 'Fare', 'Pclass','Name', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked',
    'surname', 'cabin_letter', 'imputed_cabin_letter_by_mean', 'scaled_Fare', 'imputed_cabin_letter_by_median',
    'imputed_Embarked_by_mean', 'imputed_Embarked_by_median', 'imputed_Age_by_mean', 'imputed_Age_by_median',
    'age_in_months', 'imputed_age_in_months_by_mean', 'imputed_age_in_months_by_median', 'log_Fare', 
    'log_imputed_Fare_by_mean', 'log_imputed_Fare_by_median', 'scaled_age_in_months', 'log_age_in_months']
X_train_set, Y_train_set = base_consolidation(train_set, reduced_dimensions, columns_to_discard_in_train, dimensions_to_reduce)
logistic_regression, logistic_score, random_forest, random_forest_score, gradient_boosting, gradient_boosting_score = model_training(X_train_set, Y_train_set)


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log



--Starting Logistic Regression GridSearchCV--
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END .....................max_iter=1000;, score=0.799 total time=   0.2s
[CV 2/5] END .....................max_iter=1000;, score=0.803 total time=   0.1s
[CV 3/5] END .....................max_iter=1000;, score=0.798 total time=   0.2s
[CV 4/5] END .....................max_iter=1000;, score=0.798 total time=   0.1s
[CV 5/5] END .....................max_iter=1000;, score=0.826 total time=   0.1s
[CV 1/5] END .....................max_iter=5000;, score=0.799 total time=   0.1s
[CV 2/5] END .....................max_iter=5000;, score=0.803 total time=   0.1s
[CV 3/5] END .....................max_iter=5000;, score=0.798 total time=   0.2s
[CV 4/5] END .....................max_iter=5000;, score=0.798 total time=   0.1s
[CV 5/5] END .....................max_iter=5000;, score=0.826 total time=   0.1s
[CV 1/5] END ....................max_iter=10000;, score=0.799 total time=   0.1s
[CV

In [27]:
logistic_score, random_forest_score, gradient_boosting_score

(0.8215488215488216, 0.9371492704826038, 0.941638608305275)

In [36]:
test_set = feature_imputation(original_test_set)
columns_to_onehot = ['Pclass', 'Sex', 'imputed_cabin_letter_by_mean', 'imputed_cabin_letter_by_median', 'imputed_Embarked_by_mean', 'imputed_Embarked_by_median']
test_set = feature_one_hot_encoding(test_set, columns_to_onehot)
test_set = feature_creation(test_set)
columns_to_scale_and_transform = ["Fare", "imputed_Age_by_mean", "imputed_Age_by_median", "imputed_Fare_by_mean", "imputed_Fare_by_median", "imputed_age_in_months_by_mean", "imputed_age_in_months_by_median"]
test_set = feature_scalation(test_set, columns_to_scale_and_transform)
dimensions_to_reduce = ['scaled_imputed_Age_by_mean', 'log_imputed_Age_by_mean',
                        'scaled_imputed_Age_by_median', 'log_imputed_Age_by_median', 'scaled_imputed_Fare_by_mean',
                        'scaled_imputed_Fare_by_median', 'scaled_imputed_age_in_months_by_mean',
                        'log_imputed_age_in_months_by_mean', 'scaled_imputed_age_in_months_by_median',
                        'log_imputed_age_in_months_by_median']
reduced_dimensions = feature_reduction(test_set, dimensions_to_reduce)
columns_to_discard_in_test = ['PassengerId', 'Fare', 'Pclass','Name', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked',
    'surname', 'cabin_letter', 'imputed_cabin_letter_by_mean', 'scaled_Fare', 'imputed_cabin_letter_by_median',
    'imputed_Embarked_by_mean', 'imputed_Embarked_by_median', 'imputed_Age_by_mean', 'imputed_Age_by_median',
    'age_in_months', 'imputed_age_in_months_by_mean', 'imputed_age_in_months_by_median', 'log_Fare', 
    'log_imputed_Fare_by_mean', 'log_imputed_Fare_by_median', 'scaled_age_in_months', 'log_age_in_months']
X_test_set, _ = base_consolidation(test_set, reduced_dimensions, columns_to_discard_in_test, dimensions_to_reduce)
X_test_set = adding_boolean_columns(X_train_set, X_test_set)
final_uploading_set = saving_dataset_to_upload(test_set, X_test_set, [logistic_regression, random_forest, gradient_boosting])


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [37]:
final_uploading_set

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [50]:
gradient_boosting.best_params_

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 500}

In [32]:
gradient_boosting.cv_results_

{'mean_fit_time': array([ 0.84262171,  7.76978664, 76.02967095,  0.73358049,  7.34432683,
        73.31132193,  0.72825017,  7.31995206, 73.31691465,  0.7374033 ,
         7.31534243, 73.13789034,  0.73185587,  7.3276659 , 73.06332226]),
 'std_fit_time': array([1.13920203e-01, 1.00142401e-01, 2.34168112e+00, 1.76018740e-03,
        4.66981246e-02, 1.08969689e-01, 3.63928357e-03, 3.43247022e-02,
        2.87777737e-01, 5.33587301e-03, 2.71231551e-02, 1.55406951e-01,
        7.82489195e-03, 3.21088611e-02, 1.68122751e-01]),
 'mean_score_time': array([0.00360017, 0.01050286, 0.07739773, 0.00259948, 0.00979881,
        0.07727175, 0.0023993 , 0.00959883, 0.07560887, 0.00259948,
        0.0095984 , 0.07499881, 0.00273924, 0.00999899, 0.07419906]),
 'std_score_time': array([2.72750160e-03, 4.50784720e-04, 1.49547297e-03, 4.89882084e-04,
        4.01044441e-04, 2.77062659e-03, 4.89512341e-04, 4.90449366e-04,
        1.36635222e-03, 4.90076729e-04, 4.90388468e-04, 8.93402988e-04,
        3.880