In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Fetch seaborn's example "titanic" dataset and print a per-column summary
# (dtype and non-null count) to see which columns need imputation.
df = sns.load_dataset("titanic")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [5]:
# Remove the 'alive' column. Rebinding `df` (instead of `inplace=True`) together
# with `errors="ignore"` keeps this cell re-runnable: executing it a second time
# no longer raises KeyError because the column is already gone.
df = df.drop(columns=["alive"], errors="ignore")

In [11]:
def assign_deck(row):
    """Return an imputed deck letter for an adult woman, or NaN otherwise.

    The deck is chosen purely from the row's ``survived`` flag and ``pclass``:

    ========  ======  ======  ======
    survived  pclass 1  pclass 2  pclass 3
    ========  ======  ======  ======
    1         A       D       F
    0         C       E       G
    ========  ======  ======  ======

    Parameters
    ----------
    row : mapping
        Must provide ``row["who"]``; ``row["survived"]`` and ``row["pclass"]``
        are only read when ``who`` is ``"woman"``.

    Returns
    -------
    str or float
        The deck letter, or ``np.nan`` when the row is not a woman or the
        survived/pclass combination is not in the table.

    NOTE(review): ``survived`` is used as the prediction target in the
    modelling cells, so deriving ``deck`` from it feeds label information
    into a feature -- confirm this is intended.
    """
    if row["who"] != "woman":
        return np.nan

    # Outer key: survived flag; inner key: passenger class.
    deck_lookup = {
        1: {1: "A", 2: "D", 3: "F"},
        0: {1: "C", 2: "E", 3: "G"},
    }
    decks_by_class = deck_lookup.get(row["survived"])
    if decks_by_class is None:
        return np.nan  # Default value if no condition is met
    return decks_by_class.get(row["pclass"], np.nan)


# Row-wise imputation: only rows whose deck is still missing receive the
# value proposed by assign_deck; existing deck values are left untouched.
df["deck"] = df["deck"].fillna(
    value=df.apply(assign_deck, axis="columns"),
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         383 non-null    category
 12  embark_town  889 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(4)
memory usage: 73.7+ KB


In [13]:
# Keep only passengers with a known age. The explicit `.copy()` makes the
# result an independent frame, so the column assignments in the following
# cells do not raise pandas' SettingWithCopyWarning. The original index labels
# are kept (no reset).
df = df[df['age'].notna()].copy()

In [15]:
# Fill the missing embarkation code / town (889 of 891 rows were non-null in
# the first df.info()). `assign` returns a new frame rather than writing into a
# filtered slice, which avoids SettingWithCopyWarning; column order is unchanged.
# NOTE(review): 'S' / 'Southampton' is presumably the most common port -- confirm.
df = df.assign(
    embarked=df['embarked'].fillna('S'),
    embark_town=df['embark_town'].fillna('Southampton'),
)

In [17]:
# Seeded generator used by assign_deck_for_men when no `rng` is passed, so the
# random deck imputation gives the same result on every Restart & Run All.
DECK_IMPUTATION_SEED = 42
_deck_rng = np.random.default_rng(DECK_IMPUTATION_SEED)


def assign_deck_for_men(row, rng=None):
    """Return an imputed deck letter for an adult man, or NaN otherwise.

    The deck is drawn from a distribution that depends on the row's
    ``survived`` flag and ``pclass``. Combinations with a single candidate
    deck are returned directly without consuming a random number.

    Parameters
    ----------
    row : mapping
        Must provide ``row["who"]``; ``row["survived"]`` and ``row["pclass"]``
        are only read when ``who`` is ``"man"``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness (anything with a ``choice(a, p=...)`` method).
        Defaults to a module-level generator seeded with
        ``DECK_IMPUTATION_SEED`` so results are reproducible.

    Returns
    -------
    str or float
        The deck letter, or ``np.nan`` when the row is not a man or the
        survived/pclass combination is not covered.

    NOTE(review): ``survived`` is used as the prediction target in the
    modelling cells, so deriving ``deck`` from it feeds label information
    into a feature -- confirm this is intended.
    """
    if row["who"] != "man":  # Only apply for men
        return np.nan

    # survived -> pclass -> (candidate decks, probabilities summing to 1).
    deck_options = {
        1: {
            1: (["C", "B", "E", "A", "D"], [0.28, 0.22, 0.18, 0.17, 0.15]),
            2: (["D"], [1.0]),
            3: (["E"], [1.0]),
        },
        0: {
            1: (["C", "B", "A", "D", "E"], [0.28, 0.26, 0.18, 0.16, 0.12]),
            2: (["D", "F"], [0.5, 0.5]),  # Equal probability
            3: (["E", "F"], [0.2, 0.8]),  # Based on count
        },
    }

    options_by_class = deck_options.get(row["survived"])
    if options_by_class is None:
        return np.nan
    option = options_by_class.get(row["pclass"])
    if option is None:
        return np.nan

    decks, weights = option
    if len(decks) == 1:
        return decks[0]

    generator = _deck_rng if rng is None else rng
    # Convert numpy's str_ scalar to a plain Python str.
    return str(generator.choice(decks, p=weights))


# Fill the decks that are still missing with the value proposed for men;
# rows that already have a deck keep it.
df["deck"] = df["deck"].fillna(
    value=df.apply(assign_deck_for_men, axis="columns"),
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     714 non-null    int64   
 1   pclass       714 non-null    int64   
 2   sex          714 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        714 non-null    int64   
 5   parch        714 non-null    int64   
 6   fare         714 non-null    float64 
 7   embarked     714 non-null    object  
 8   class        714 non-null    category
 9   who          714 non-null    object  
 10  adult_male   714 non-null    bool    
 11  deck         644 non-null    category
 12  embark_town  714 non-null    object  
 13  alone        714 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(4)
memory usage: 64.6+ KB


In [19]:
def assign_deck(row):
    """Return an imputed deck letter for a child, or NaN otherwise.

    This definition replaces the earlier ``assign_deck`` (which handled
    women) from this point of the notebook onwards.

    Mapping used (survived, pclass -> deck):
    survived: 1 -> B, 2 -> F, 3 -> E; not survived: 1 -> C, 3 -> G.
    There is no entry for a non-surviving child in pclass 2, so such a row
    yields NaN.

    Parameters
    ----------
    row : mapping
        Must provide ``row["who"]``; ``row["survived"]`` and ``row["pclass"]``
        are only read when ``who`` is ``"child"``.

    Returns
    -------
    str or float
        The deck letter, or ``np.nan`` when nothing matches.
    """
    if not row["who"] == "child":
        return np.nan

    outcome = row["survived"]
    if outcome == 1:
        deck_by_class = {1: "B", 2: "F", 3: "E"}
    elif outcome == 0:
        deck_by_class = {1: "C", 3: "G"}
    else:
        return np.nan

    # Default value if no condition is met
    return deck_by_class.get(row["pclass"], np.nan)

# Last imputation pass: fill the remaining missing decks with the value
# proposed for children, then re-check the non-null counts.
df["deck"] = df["deck"].fillna(
    value=df.apply(assign_deck, axis="columns"),
)
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 714 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     714 non-null    int64   
 1   pclass       714 non-null    int64   
 2   sex          714 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        714 non-null    int64   
 5   parch        714 non-null    int64   
 6   fare         714 non-null    float64 
 7   embarked     714 non-null    object  
 8   class        714 non-null    category
 9   who          714 non-null    object  
 10  adult_male   714 non-null    bool    
 11  deck         714 non-null    category
 12  embark_town  714 non-null    object  
 13  alone        714 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(4)
memory usage: 64.6+ KB


In [21]:
# Build the modelling frame from `df` in one chain (the former `df_new = df`
# alias was dead code: it was overwritten on the very next line).
#   * drop 'embark_town' and 'class'
#   * encode sex as male=0 / female=1
#   * one-hot encode 'who' and 'embarked', dropping the first level of each
#   * encode the boolean flags as integers
# NOTE(review): both boolean maps send True -> 0 and False -> 1, so
# `alone == 0` means the passenger WAS alone and `adult_male == 0` means the
# passenger IS an adult male -- confirm this inverted encoding is intended.
df_new = (
    df.drop(columns=['embark_town', 'class'])
    .assign(sex=lambda frame: frame['sex'].map({'male': 0, 'female': 1}))
    .pipe(pd.get_dummies, columns=['who', 'embarked'], drop_first=True)
    .assign(
        alone=lambda frame: frame['alone'].map({True: 0, False: 1}),
        adult_male=lambda frame: frame['adult_male'].map({True: 0, False: 1}),
    )
)
df_new.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,deck,alone,who_man,who_woman,embarked_Q,embarked_S
0,0,3,0,22.0,1,0,7.25,0,F,1,True,False,False,True
1,1,1,1,38.0,1,0,71.2833,1,C,1,False,True,False,False
2,1,3,1,26.0,0,0,7.925,1,F,0,False,True,False,True
3,1,1,1,35.0,1,0,53.1,1,C,1,False,True,False,True
4,0,3,0,35.0,0,0,8.05,0,E,0,True,False,False,True
6,0,1,0,54.0,0,0,51.8625,0,E,0,True,False,False,True
7,0,3,0,2.0,3,1,21.075,1,G,1,False,False,False,True
8,1,3,1,27.0,0,2,11.1333,1,F,1,False,True,False,True
9,1,2,1,14.0,1,0,30.0708,1,F,1,False,False,False,False
10,1,3,1,4.0,1,1,16.7,1,G,1,False,False,False,True


In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [29]:
from sklearn.preprocessing import OneHotEncoder

# The split must run BEFORE the deck encoding: the encoder below reads
# x_train / x_test. The two steps used to live in separate cells stored in the
# wrong order, which fails with NameError on Restart & Run All; they are merged
# here so the cell is self-contained and re-runnable (X is rebuilt from df_new
# every time).
X = df_new.drop('survived', axis=1)
y = df_new['survived']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode 'deck'. The encoder is fitted on the training split only and
# then reused for the test split; categories unseen during fit are encoded as
# all zeros (handle_unknown='ignore').
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_deck = encoder.fit_transform(x_train[['deck']])
x_train = x_train.drop(columns=['deck'])
x_train = np.hstack((x_train, encoded_deck))  # Add the encoded features

encoded_deck_test = encoder.transform(x_test[['deck']])
x_test = x_test.drop(columns=['deck'])
x_test = np.hstack((x_test, encoded_deck_test))  # Add the encoded features


In [35]:
# Standardise the features; the scaler is fitted on the training split only.
# NOTE: x_train / x_test are overwritten, so re-running this cell alone would
# scale already-scaled data -- re-run the split/encode cell first.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Fixed random_state so the forest (and therefore best_score_ / best_params_)
# is reproducible between runs.
trees = RandomForestClassifier(random_state=42)
params = {
    'n_estimators': [10, 20, 100, 200, 300, 500],
    'max_depth': [None, 7, 2, 5, 10],
    'min_samples_split': [2, 3, 4, 5, 10, 20],
}
# 6 * 5 * 6 = 180 candidates x 5 folds = 900 fits; n_jobs=-1 runs them in
# parallel on all available cores.
grid = GridSearchCV(trees, param_grid=params, scoring='accuracy', cv=5, n_jobs=-1)
grid.fit(x_train, y_train)
# Mean cross-validated accuracy of the best candidate (training split only).
grid.best_score_

0.8966895499618612

In [36]:
# Parameter combination from `params` that achieved the best mean
# cross-validated accuracy in the grid search above.
grid.best_params_

{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}

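# Yay, it's ~89% now (mean 5-fold cross-validation accuracy on the training split; the held-out test split is not scored in the cells above)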