In [1]:
# NumPy
import numpy as np # type: ignore

# Dataframe operations
import pandas as pd # type: ignore

# Data visualization
import seaborn as sns # type: ignore
import matplotlib.pyplot as plt # type: ignore

# Scalers
from sklearn.preprocessing import MinMaxScaler # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore
from sklearn.utils import shuffle # type: ignore

# Models
from sklearn.linear_model import LogisticRegression #logistic regression # type: ignore
from sklearn.ensemble import RandomForestClassifier #Random Forest # type: ignore
from sklearn.neighbors import KNeighborsClassifier #KNN # type: ignore
from sklearn.neural_network import MLPClassifier # type: ignore
from sklearn.naive_bayes import GaussianNB #Naive bayes # type: ignore
from sklearn.tree import DecisionTreeClassifier #Decision Tree # type: ignore
from sklearn import svm #support vector Machine # type: ignore
from sklearn.linear_model import Perceptron # type: ignore
from sklearn.ensemble import GradientBoostingClassifier # type: ignore

from sklearn.metrics import confusion_matrix #for confusion matrix # type: ignore
from sklearn.model_selection import train_test_split #training and testing data split# type: ignore
from sklearn import metrics #accuracy measure # type: ignore
from sklearn.ensemble import VotingClassifier # type: ignore

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation # type: ignore
from sklearn.model_selection import cross_val_score #score evaluation # type: ignore
from sklearn.model_selection import cross_val_predict #prediction # type: ignore
from sklearn.model_selection import cross_validate # type: ignore

# GridSearchCV
from sklearn.model_selection import GridSearchCV # type: ignore

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process # type: ignore

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder # type: ignore
from sklearn import feature_selection # type: ignore
from sklearn import model_selection # type: ignore
from sklearn import metrics # type: ignore

#Visualization
import matplotlib as mpl # type: ignore
import matplotlib.pyplot as plt # type: ignore
import matplotlib.pylab as pylab # type: ignore
import seaborn as sns # type: ignore
from pandas.plotting import scatter_matrix # type: ignore

from sklearn.preprocessing import LabelEncoder # type: ignore
from sklearn.preprocessing import MinMaxScaler # type: ignore
from sklearn.metrics import accuracy_score # type: ignore
from sklearn.model_selection import train_test_split # type: ignore

from sklearn.impute import SimpleImputer # type: ignore
from sklearn.svm import SVC # type: ignore
import os # type: ignore

In [2]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
data_df = pd.concat([train_df, test_df], ignore_index=True)
data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [3]:
# Extract Title from Name
data_df['Title'] = data_df['Name'].str.extract('([A-Za-z]+)\.', expand=True)

# Map rare titles to more common ones
mapping = {
    'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss', 'Dr': 'Dr',
    'Major': 'Mr', 'Lady': 'Mrs', 'Sir': 'Mr', 'Col': 'Mr',
    'Capt': 'Mr', 'Countess': 'Mrs', 'Jonkheer': 'Mr',
    'Dona': 'Mrs', 'Don': 'Mr', 'Rev': 'Rev', 'Master': 'Master',
    'Miss': 'Miss', 'Mr': 'Mr', 'Mrs': 'Mrs'
}
data_df['Title'] = data_df['Title'].map(mapping)

# Fill missing Age based on Title median
age_by_title = data_df.groupby('Title')['Age'].median()
data_df['Age'] = data_df.apply(lambda row: age_by_title[row['Title']] if pd.isnull(row['Age']) else row['Age'], axis=1)

# Calculate Family_Size and re-split data_df using .loc to avoid SettingWithCopyWarning
data_df['Family_Size'] = data_df['Parch'] + data_df['SibSp']

data_df.drop('Title', axis=1, inplace=True)

train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

# Check for missing values in the dataset
missing_values = data_df.isnull().sum()
print(missing_values)


PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Family_Size       0
dtype: int64


In [4]:
data_df['Last_Name'] = data_df['Name'].apply(lambda x: str.split(x, ",")[0])
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].median())

DEFAULT_SURVIVAL_VALUE = 0.25
data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in data_df[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0

tolerance = 1e-10
filtered_data_df = data_df[abs(data_df['Family_Survival'] - 0.25) > tolerance]
print("Number of passengers with family survival information:", filtered_data_df.shape[0])


Number of passengers with family survival information: 420


In [5]:
for _, grp_df in data_df.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (np.isclose(row['Family_Survival'], 0.25, rtol=1e-09, atol=1e-09)):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0
                        
# Correcting the line to avoid the ValueError
print("Number of passengers with family/group survival information:", 
      data_df.loc[~np.isclose(data_df['Family_Survival'], 0.25, rtol=1e-09, atol=1e-09)].shape[0])

train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

Number of passengers with family/group survival information: 546


In [6]:
# Use the assignment directly instead of inplace=True
data_df['Fare'] = data_df['Fare'].fillna(data_df['Fare'].median())

# Making Bins
data_df['FareBin'] = pd.qcut(data_df['Fare'], 5)

label = LabelEncoder()
data_df['FareBin_Code'] = label.fit_transform(data_df['FareBin'])

# Drop the original 'Fare' column as it's now represented by 'FareBin_Code'
data_df.drop(columns=['Fare', 'FareBin'], inplace=True)

train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

# Display the first few rows of the updated training DataFrame to verify the changes
print(train_df.head())


   PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   
3            4       1.0       1   
4            5       0.0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket Cabin Embarked  Family_Size  Last_Name  \
0      0         A/5 21171   NaN        S            1     Braund   
1      0          PC 17599   C85        C            1    Cumings   
2      0  STON/O2. 3101282   NaN        S            0  Heikkinen   
3      0            113803  C123        S   

In [7]:
# Grouping Age in 4 Bins
data_df['AgeBin'] = pd.qcut(data_df['Age'], 4)

label = LabelEncoder()
data_df['AgeBin_Code'] = label.fit_transform(data_df['AgeBin'])

data_df.drop(columns=['AgeBin', 'Age'], inplace=True)

train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

In [8]:
test_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Embarked,Family_Size,Last_Name,Family_Survival,FareBin_Code,AgeBin_Code
891,892,,3,"Kelly, Mr. James",male,0,0,330911,,Q,0,Kelly,0.25,0,2
892,893,,3,"Wilkes, Mrs. James (Ellen Needs)",female,1,0,363272,,S,1,Wilkes,0.25,0,3
893,894,,2,"Myles, Mr. Thomas Francis",male,0,0,240276,,Q,0,Myles,0.25,1,3
894,895,,3,"Wirz, Mr. Albert",male,0,0,315154,,S,0,Wirz,0.25,1,1
895,896,,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1,1,3101298,,S,2,Hirvonen,1.0,2,0


In [9]:

sex_mapping = {'female':0, 'male':1}

data_df['Sex'] = data_df['Sex'].map(sex_mapping).astype(int)

# Set 'Cabin' to 0 if it's NaN, else set it to 1
data_df['Cabin'] = data_df['Cabin'].isna().astype(int)

# Drop rows where 'Sex' is NaN
data_df.dropna(subset=['Sex'], inplace=True)

# Drop unnecessary columns from both datasets
columns_to_drop = ['SibSp', 'Parch', 'Last_Name']
data_df.drop(columns=columns_to_drop, inplace=True)

train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

# Display the first few rows of the training dataframe
print(train_df.head(3))
print(test_df.head(3))


   PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   

                                                Name  Sex            Ticket  \
0                            Braund, Mr. Owen Harris    1         A/5 21171   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0          PC 17599   
2                             Heikkinen, Miss. Laina    0  STON/O2. 3101282   

   Cabin Embarked  Family_Size  Family_Survival  FareBin_Code  AgeBin_Code  
0      1        S            1             0.25             0            0  
1      0        C            1             0.25             4            3  
2      1        S            0             0.25             1            1  
     PassengerId  Survived  Pclass                              Name  Sex  \
891          892       NaN       3                  Kelly, Mr. James    1   
892          893       NaN       3  Wilkes, Mrs. James (Ellen Needs)    0   

In [10]:
# If 'Embarked' has missing values, fill them with the most common value
if data_df['Embarked'].isna().sum() > 0:
    common_value = data_df['Embarked'].mode()[0]
    data_df['Embarked'] = data_df['Embarked'].fillna(common_value)

# Define a dictionary to map 'Embarked' values to integers
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}

# Use the map method to convert 'Embarked' values to integers
data_df['Embarked'] = data_df['Embarked'].map(embarked_mapping).astype(int)

train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

print(test_df.head(5))

     PassengerId  Survived  Pclass  \
891          892       NaN       3   
892          893       NaN       3   
893          894       NaN       2   
894          895       NaN       3   
895          896       NaN       3   

                                             Name  Sex   Ticket  Cabin  \
891                              Kelly, Mr. James    1   330911      1   
892              Wilkes, Mrs. James (Ellen Needs)    0   363272      1   
893                     Myles, Mr. Thomas Francis    1   240276      1   
894                              Wirz, Mr. Albert    1   315154      1   
895  Hirvonen, Mrs. Alexander (Helga E Lindqvist)    0  3101298      1   

     Embarked  Family_Size  Family_Survival  FareBin_Code  AgeBin_Code  
891         2            0             0.25             0            2  
892         0            1             0.25             0            3  
893         2            0             0.25             1            3  
894         0            0        

In [11]:
data_df['Ticket'] = data_df['Ticket'].apply(lambda x: 1 if x.isnumeric() else 0)

train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()
print(train_df.head(3))

   PassengerId  Survived  Pclass  \
0            1       0.0       3   
1            2       1.0       1   
2            3       1.0       3   

                                                Name  Sex  Ticket  Cabin  \
0                            Braund, Mr. Owen Harris    1       0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0       0      0   
2                             Heikkinen, Miss. Laina    0       0      1   

   Embarked  Family_Size  Family_Survival  FareBin_Code  AgeBin_Code  
0         0            1             0.25             0            0  
1         1            1             0.25             4            3  
2         0            0             0.25             1            1  


In [12]:
# Set lower surviving rate titles to 1, others to 0
data_df['Name'] = data_df['Name'].str.contains('Mr. ').astype(int)

# Splitting into train and test DataFrames
train_df = data_df.loc[:890].copy()
test_df = data_df.loc[891:].copy()

# Now your code should work without the IntCastingNaNError
print(train_df.head(5))
print(test_df.head(5))

   PassengerId  Survived  Pclass  Name  Sex  Ticket  Cabin  Embarked  \
0            1       0.0       3     1    1       0      1         0   
1            2       1.0       1     0    0       0      0         1   
2            3       1.0       3     0    0       0      1         0   
3            4       1.0       1     0    0       1      0         0   
4            5       0.0       3     1    1       1      1         0   

   Family_Size  Family_Survival  FareBin_Code  AgeBin_Code  
0            1             0.25             0            0  
1            1             0.25             4            3  
2            0             0.25             1            1  
3            1             0.00             4            2  
4            0             0.25             1            2  
     PassengerId  Survived  Pclass  Name  Sex  Ticket  Cabin  Embarked  \
891          892       NaN       3     1    1       1      1         2   
892          893       NaN       3     0    0       1

In [13]:
# drop survived from X
X = train_df.drop('Survived', axis=1)
# y has to have only 'PassengerId', 'Survived' columns
y = train_df['Survived'].astype(int)
X_test = test_df.copy().drop('Survived', axis=1)
print("X.shape", X.shape,"y.shape", y.shape, "X_test.shape", X_test.shape)

X.shape (891, 11) y.shape (891,) X_test.shape (418, 11)


In [14]:
print("X", X.head())
print("X_test", X_test.head())


X    PassengerId  Pclass  Name  Sex  Ticket  Cabin  Embarked  Family_Size  \
0            1       3     1    1       0      1         0            1   
1            2       1     0    0       0      0         1            1   
2            3       3     0    0       0      1         0            0   
3            4       1     0    0       1      0         0            1   
4            5       3     1    1       1      1         0            0   

   Family_Survival  FareBin_Code  AgeBin_Code  
0             0.25             0            0  
1             0.25             4            3  
2             0.25             1            1  
3             0.00             4            2  
4             0.25             1            2  
X_test      PassengerId  Pclass  Name  Sex  Ticket  Cabin  Embarked  Family_Size  \
891          892       3     1    1       1      1         2            0   
892          893       3     0    0       1      1         0            1   
893          894     

In [15]:
# Initialize StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Define the hyperparameters for the SVC
svc_hyperparams = {
    'C': [0.1, 1, 10, 100], # Regularization parameter
    'kernel': ['linear', 'rbf'], # Kernel function
    'gamma': ['scale', 'auto'], # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'class_weight': ['balanced', None] # Weights associated with classes
}

# Initialize the GridSearchCV with the SVC estimator
svc_gd = GridSearchCV(estimator=SVC(probability=True), param_grid=svc_hyperparams, verbose=True, cv=10, scoring="accuracy")

# Fit the model to the scaled training data
svc_gd.fit(X_scaled, y)

# Print the best score and the best estimator for SVC
print("SVC Best Score:", svc_gd.best_score_)
print("SVC Best Estimator:", svc_gd.best_estimator_)

# ************************************************************************************************************************************************************

# Define the hyperparameters for the RandomForestClassifier
rf_hyperparams = {
    'n_estimators': [100, 200], # Number of trees in the forest
    'max_depth': [None, 10, 20], # Maximum depth of the tree
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4] # Minimum number of samples required to be at a leaf node
}

# Initialize the GridSearchCV with the RandomForestClassifier estimator
rf_gd = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_hyperparams, verbose=True, cv=10, scoring="accuracy")

# Fit the model to the scaled training data
rf_gd.fit(X_scaled, y)

# Print the best score and the best estimator for RandomForestClassifier
print("RandomForestClassifier Best Score:", rf_gd.best_score_)
print("RandomForestClassifier Best Estimator:", rf_gd.best_estimator_)

# ************************************************************************************************************************************************************


# Define the hyperparameters for the KNeighborsClassifier
knn_hyperparams = {
    'n_neighbors': [3, 5, 7], # Number of neighbors to use
    'weights': ['uniform', 'distance'], # Weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], # Algorithm used to compute the nearest neighbors
    'p': [1, 2] # Power parameter for the Minkowski metric
}

# Initialize the GridSearchCV with the KNeighborsClassifier estimator
knn_gd = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_hyperparams, verbose=True, cv=10, scoring="accuracy")

# Fit the model to the scaled training data
knn_gd.fit(X_scaled, y)

# Print the best score and the best estimator for KNeighborsClassifier
print("KNeighborsClassifier Best Score:", knn_gd.best_score_)
print("KNeighborsClassifier Best Estimator:", knn_gd.best_estimator_)


# ************************************************************************************************************************************************************

# Define the hyperparameters for the LogisticRegression

lr_hyperparams = {
    'penalty': ['l2'],  # 'l1' removed due to compatibility issues with the default solver 'lbfgs'
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear'],  # Added 'liblinear' solver for compatibility with 'l1' penalty
    'class_weight': ['balanced', None]
}

# Initialize the GridSearchCV with the LogisticRegression estimator (updated)
lr_gd = GridSearchCV(estimator=LogisticRegression(), param_grid=lr_hyperparams, verbose=True, cv=10, scoring="accuracy")

# Fit the model to the scaled training data
lr_gd.fit(X_scaled, y)

# Print the best score and the best estimator for LogisticRegression
print("LogisticRegression Best Score:", lr_gd.best_score_)
print("LogisticRegression Best Estimator:", lr_gd.best_estimator_)


# ************************************************************************************************************************************************************

# Define the hyperparameters for the MLPClassifier
mlp_hyperparams = {
    'hidden_layer_sizes': [(90, 90)], # The ith element represents the number of neurons in the ith hidden layer
    'activation': ['identity', 'logistic', 'tanh', 'relu'], # Activation function for the hidden layer
    'solver': ['adam'], # Optimizer for weight optimization
    'alpha': [0.05], # L2 penalty (regularization term) parameter
    'learning_rate': ['adaptive'], # Learning rate schedule for weight updates
    'max_iter': [1000] # Maximum number of iterations
}

# Initialize the GridSearchCV with the MLPClassifier estimator
mlp_gd = GridSearchCV(estimator=MLPClassifier(), param_grid=mlp_hyperparams, verbose=True, cv=2, scoring="accuracy")

# Fit the model to the scaled training data
mlp_gd.fit(X_scaled, y)

# Print the best score and the best estimator for MLPClassifier
print("MLPClassifier Best Score:", mlp_gd.best_score_)
print("MLPClassifier Best Estimator:", mlp_gd.best_estimator_)

# ************************************************************************************************************************************************************



# Initialize the VotingClassifier with the best estimators from GridSearchCV
voting_clf = VotingClassifier(estimators=[
    ('best_svc', svc_gd.best_estimator_),
    ('best_rf', rf_gd.best_estimator_),
    ('best_knn', knn_gd.best_estimator_),
    ('best_lr', lr_gd.best_estimator_),
    ('best_mlp', mlp_gd.best_estimator_),

], voting='soft')

# Fit the VotingClassifier to the scaled training data
voting_clf.fit(X_scaled, y)

# Predict the test data with the VotingClassifier
y_pred = voting_clf.predict(X_test_scaled)

# Prepare the submission dataframe
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred
})

# Ensure the output directory exists
output_dir = '../data'
os.makedirs(output_dir, exist_ok=True)

# Save the submission file
submission.to_csv(f'{output_dir}/gender_submission.csv', index=False)
print("Submission saved successfully!")
print("y_pred", y_pred)


Fitting 10 folds for each of 32 candidates, totalling 320 fits
SVC Best Score: 0.8529463171036206
SVC Best Estimator: SVC(C=10, gamma='auto', probability=True)
Fitting 10 folds for each of 54 candidates, totalling 540 fits
RandomForestClassifier Best Score: 0.8540449438202249
RandomForestClassifier Best Estimator: RandomForestClassifier(max_depth=10, min_samples_leaf=2)
Fitting 10 folds for each of 48 candidates, totalling 480 fits
KNeighborsClassifier Best Score: 0.8316229712858926
KNeighborsClassifier Best Estimator: KNeighborsClassifier(n_neighbors=7, p=1)
Fitting 10 folds for each of 8 candidates, totalling 80 fits
LogisticRegression Best Score: 0.8316479400749065
LogisticRegression Best Estimator: LogisticRegression(C=0.1, solver='liblinear')
Fitting 2 folds for each of 4 candidates, totalling 8 fits
MLPClassifier Best Score: 0.8204438958028921
MLPClassifier Best Estimator: MLPClassifier(activation='logistic', alpha=0.05, hidden_layer_sizes=(90, 90),
              learning_rate='a