<a href="https://colab.research.google.com/github/ItsMacto/ML-Models/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [148]:
from pathlib import Path
import pandas as pd

train = pd.read_csv('./datasets/titanic_dataset/train.csv')
test = pd.read_csv('./datasets/titanic_dataset/test.csv')

In [149]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [150]:
train.set_index('PassengerId', inplace=True)
test.set_index('PassengerId', inplace=True)

In [151]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [152]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


plan:
- handle nulls
- creating reletive mediams
- view data
- pre process
- feature engineer
- split x and y
- model
- hyperr prams

The attributes have the following meaning:
* **PassengerId**: a unique identifier for each passenger
* **Survived**: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
* **Pclass**: passenger class 1, 2, 3
* **Name**, **Sex**, **Age**: self-explanatory
* **SibSp**: how many siblings & spouses of the passenger aboard the Titanic.
* **Parch**: how many children & parents of the passenger aboard the Titanic.
* **Ticket**: ticket id
* **Fare**: price paid (in pounds)
* **Cabin**: passenger's cabin number
* **Embarked**: where the passenger embarked the Titanic

In [153]:
train['Ticket'].value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

In [174]:
def RelativesOnboard(df):
    df['RelativesOnboard'] = df['SibSp'] + df['Parch']
    return df

def AgeGroup(df):
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 18, 60, float('inf')], labels=['Child', 'Adult', 'Senior'])
    return df

# testRelativesOnboard = RelativesOnboard(train)
# testAgeGroup = AgeGroup(train)

# print(testRelativesOnboard[['RelativesOnboard', 'Survived']].groupby(['RelativesOnboard']).mean())
# print(testAgeGroup[['AgeGroup', 'Survived']].groupby(['AgeGroup']).mean())
# print(testAgeGroup[['AgeGroup', 'Survived','Sex']].groupby(['AgeGroup','Sex']).mean())

In [155]:
train = RelativesOnboard(train)
train = AgeGroup(train)

test = RelativesOnboard(test)
test = AgeGroup(test)

In [156]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

import numpy as np

class GroupedMedianImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_cols, target_col):
        self.group_cols = group_cols
        self.target_col = target_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Use groupby and transform to align the indices correctly
        X[self.target_col] = X[self.target_col].fillna(
            X.groupby(self.group_cols)[self.target_col].transform('median')
        )
        return X

num_attribs = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare','RelativesOnboard']
cat_attribs = ['Embarked', 'Sex','AgeGroup']

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(sparse_output=False)),
])

preprocess_pipeline = Pipeline([
    ('age_imputer', GroupedMedianImputer(group_cols=['Sex'], target_col='Age')),
    ('column_transformer', ColumnTransformer([
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ]))
])



# num_attribs = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']
# cat_attribs = ['Embarked', 'Sex']

# num_pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler()),
# ])

# cat_pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('one_hot', OneHotEncoder(sparse_output=False)),
# ])

# preprocess_pipeline = Pipeline([
#     ('column_transformer', ColumnTransformer([
#         ('num', num_pipeline, num_attribs),
#         ('cat', cat_pipeline, cat_attribs),
#     ]))
# ])



In [157]:
#Split target and labels
X_train = preprocess_pipeline.fit_transform(train)
y_train = train['Survived']

In [158]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.80812734082397

0.817116104868914


Thats doing alright. Now lets try mutiple models with hyper pramater tuning

In [159]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

full_pipeline = Pipeline([
    ('preprocessor', preprocess_pipeline),
    ('classifier', None),
])

param_grid = [
    # Logistic Regression
    {
        'classifier': [LogisticRegression(max_iter=1000, random_state=42)],
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs'],
    },
    # Decision Tree
    {
        'classifier': [DecisionTreeClassifier(random_state=42)],
        'classifier__max_depth': [None, 5, 10],
        'classifier__min_samples_split': [2, 5, 10],
    },
    # Random Forest
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [100, 200, 500, 1000],
        'classifier__max_features': [3, 5, 10, 'sqrt', 'log2', None],
        'classifier__max_depth': [None, 5, 10],
    },

    # Support Vector Machine
    {
        'classifier': [SVC(random_state=42)],
        'classifier__C': [0.1, 1.0, 10.0, 100],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto'],
    },
]

# Set up GridSearchCV
grid_search = GridSearchCV(full_pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

In [160]:
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

grid_search.fit(X_train, y_train)

# Evaluate the results
print("Best parameters:")
print(grid_search.best_params_)
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Best model
best_model = grid_search.best_estimator_


# Convert cv_results_ to a pandas DataFrame for easier manipulation
results_df = pd.DataFrame(grid_search.cv_results_)

# Extract the classifier names from the 'param_classifier' parameter
results_df['classifier_name'] = results_df['param_classifier'].apply(lambda clf: clf.__class__.__name__)

# Group the results by classifier name and find the best mean test score for each
best_scores = results_df.groupby('classifier_name')['mean_test_score'].max()

# Print the best cross-validation accuracy for each classifier
print("Best cross-validation accuracy for each classifier:")
for classifier, score in best_scores.items():
    print(f"{classifier}: {score:.4f}")

Best parameters:
{'classifier': RandomForestClassifier(max_depth=10, max_features=10, n_estimators=500,
                       random_state=42), 'classifier__max_depth': 10, 'classifier__max_features': 10, 'classifier__n_estimators': 500}
Best cross-validation accuracy: 0.8385
Best cross-validation accuracy for each classifier:
DecisionTreeClassifier: 0.8093
LogisticRegression: 0.8002
RandomForestClassifier: 0.8385
SVC: 0.8294


Without feature enginering: (Best parameters:
{'classifier': RandomForestClassifier(max_depth=10, max_features=3, random_state=42), 'classifier__max_depth': 10, 'classifier__max_features': 3, 'classifier__n_estimators': 100}
Best cross-validation accuracy: 0.8350
Best cross-validation accuracy for each classifier:
DecisionTreeClassifier: 0.8048
LogisticRegression: 0.7991
RandomForestClassifier: 0.8350
SVC: 0.8261)

In [161]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   PassengerId       891 non-null    int64   
 1   Survived          891 non-null    int64   
 2   Pclass            891 non-null    int64   
 3   Name              891 non-null    object  
 4   Sex               891 non-null    object  
 5   Age               714 non-null    float64 
 6   SibSp             891 non-null    int64   
 7   Parch             891 non-null    int64   
 8   Ticket            891 non-null    object  
 9   Fare              891 non-null    float64 
 10  Cabin             204 non-null    object  
 11  Embarked          889 non-null    object  
 12  RelativesOnboard  891 non-null    int64   
 13  AgeGroup          714 non-null    category
dtypes: category(1), float64(2), int64(6), object(5)
memory usage: 91.6+ KB


In [162]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   PassengerId       418 non-null    int64   
 1   Pclass            418 non-null    int64   
 2   Name              418 non-null    object  
 3   Sex               418 non-null    object  
 4   Age               332 non-null    float64 
 5   SibSp             418 non-null    int64   
 6   Parch             418 non-null    int64   
 7   Ticket            418 non-null    object  
 8   Fare              417 non-null    float64 
 9   Cabin             91 non-null     object  
 10  Embarked          418 non-null    object  
 11  RelativesOnboard  418 non-null    int64   
 12  AgeGroup          332 non-null    category
dtypes: category(1), float64(2), int64(5), object(5)
memory usage: 39.9+ KB


In [164]:
predictions = best_model.predict(test)

output = pd.DataFrame({
    'PassengerId': test.index,
    'Survived': predictions.astype(int)
})
output.to_csv('submission.csv', index=False)