In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv(r"Kaggle\titanic\train.csv")
testing_data = pd.read_csv(r"Kaggle\titanic\test.csv")

if 'Survived' not in testing_data.columns:
    testing_data['Survived'] = 0

In [3]:
df.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
def Preprocess(df_test, df_train=pd.read_csv(r"Kaggle\titanic\train.csv")):    
    df = pd.concat([df_train, df_test], axis=0)
    df = df.drop('Name', axis = 1)
    df = df.drop('Ticket', axis = 1)
    df['Cabin'] = df['Cabin'].fillna('X000')
    df['Embarked'] = df['Embarked'].fillna('X')
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Apply the split_cabin_multiple function to each row
    df['cabin_letter'] = df['Cabin'].str.extract(r'([a-zA-Z]+)', expand=False)
    df['cabin_number'] = df['Cabin'].str.extract(r'(\d+)', expand=False)

    # One-hot encode the "cabin_letter" column
    df_encoded = pd.get_dummies(df, columns=['cabin_letter'], prefix='cabin')
    df_encoded = pd.get_dummies(df_encoded, columns=['Embarked'], prefix='Embarked')
    df_encoded = pd.get_dummies(df_encoded, columns=['Sex'])
    # Drop the original "cabin" column
    df_encoded = df_encoded.drop(columns=['Cabin'])
    df_encoded = df_encoded.drop(columns=['cabin_X'])
    df_encoded = df_encoded.drop(columns=['Embarked_X'])
    df = df_encoded
    df['cabin_number'] = df['cabin_number'].fillna(0)
    df['cabin_number'] = pd.to_numeric(df['cabin_number'])

    #Feature Engineering
    df['Pclass_bin_Fare'] = df['Fare'] // df['Pclass'] 
    df['Pclass_bin_Sex'] =  df['Pclass'] - df['Sex_female']

    df_train = df[:len(df_train)]
    df_test = df[len(df_train):]

    df_test = df_test.drop(columns=['Survived'])

    return df_train, df_test

In [6]:
df, testing_data = Preprocess(testing_data)


In [7]:
df.isna().sum()


PassengerId        0
Survived           0
Pclass             0
Age                0
SibSp              0
Parch              0
Fare               0
cabin_number       0
cabin_A            0
cabin_B            0
cabin_C            0
cabin_D            0
cabin_E            0
cabin_F            0
cabin_G            0
cabin_T            0
Embarked_C         0
Embarked_Q         0
Embarked_S         0
Sex_female         0
Sex_male           0
Pclass_bin_Fare    0
Pclass_bin_Sex     0
dtype: int64

In [8]:
df.corr()['Survived']


PassengerId       -0.005007
Survived           1.000000
Pclass            -0.338481
Age               -0.070323
SibSp             -0.035322
Parch              0.081629
Fare               0.257307
cabin_number       0.229756
cabin_A            0.022287
cabin_B            0.175095
cabin_C            0.114652
cabin_D            0.150716
cabin_E            0.145321
cabin_F            0.057935
cabin_G            0.016040
cabin_T           -0.026456
Embarked_C         0.168240
Embarked_Q         0.003650
Embarked_S        -0.155660
Sex_female         0.543351
Sex_male          -0.543351
Pclass_bin_Fare    0.267823
Pclass_bin_Sex    -0.533994
Name: Survived, dtype: float64

In [10]:
X = df.drop(columns=['Survived'])
y = df['Survived']
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, y, test_size=0.2)

Train_Y = np.reshape(Train_Y.to_numpy(), (-1, 1))
Test_Y = np.reshape(Test_Y.to_numpy(), (-1, 1))

Train_X.shape, Train_Y.shape


((712, 22), (712, 1))

In [12]:
model = LogisticRegression()
model.fit(Train_X, Train_Y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
model_2 = XGBClassifier(enable_categorical = True)
model_2.fit(Train_X, Train_Y)

In [14]:
model_3 = DecisionTreeClassifier()
model_3.fit(Train_X, Train_Y)

In [15]:
model_4 = RandomForestClassifier()
model_4.fit(Train_X, Train_Y)

  model_4.fit(Train_X, Train_Y)


In [16]:
accuracy_score(Test_Y, model.predict(Test_X))

0.7821229050279329

In [17]:
accuracy_score(Test_Y, model_2.predict(Test_X))


0.7821229050279329

In [18]:
accuracy_score(Test_Y, model_3.predict(Test_X))


0.6871508379888268

In [19]:
accuracy_score(Test_Y, model_4.predict(Test_X))


0.8044692737430168

In [20]:
pred = model_4.predict(testing_data)

final = pd.DataFrame()
final['PassengerId'] = testing_data['PassengerId']
final['Survived'] = pred

# Write DataFrame to a CSV file without index
final.to_csv('output.csv', index=False)