In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
X_train = pd.read_csv("dataset/train.csv")
X_test = pd.read_csv("dataset/test.csv")

In [14]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [15]:
Id = X_test.iloc[:, 0]

y_train = X_train['Survived']

In [16]:
X_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [17]:
X_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [18]:
X_train = X_train.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])
X_test = X_test.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [19]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [20]:
X_test.isnull().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [21]:
letter_mapping = {'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G'}

def map_cabin_value(cabin):
    if pd.notna(cabin):
        for letter, value in letter_mapping.items():
            if letter in cabin:
                return value
    return 0
X_train['Cabin'] = X_train['Cabin'].apply(map_cabin_value)
X_test['Cabin'] = X_test['Cabin'].apply(map_cabin_value)

In [22]:
X_train['Embarked'] = X_train['Embarked'].fillna(X_train['Embarked'].mode()[0])
X_test['Fare'] = X_test['Fare'].fillna(X_train['Fare'].mode()[0])

X_train['Age'] = X_train['Age'].fillna(X_train['Age'].mean())
X_test['Age'] = X_test['Age'].fillna(X_train['Age'].mean())

In [23]:
X_train.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [24]:
X_test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Cabin     891 non-null    object 
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [26]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Cabin     418 non-null    object 
 7   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 26.3+ KB


In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

X_train[['Sex', 'Embarked', 'Cabin']] = X_train[['Sex', 'Embarked', 'Cabin']].astype(str)
X_test[['Sex', 'Embarked', 'Cabin']] = X_test[['Sex', 'Embarked', 'Cabin']].astype(str)

ct = ColumnTransformer(
    transformers=[('encoder', encoder, ['Sex', 'Embarked', 'Cabin'])],
    remainder='passthrough'
)

encoded_data = ct.fit_transform(X_train)

encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['Sex', 'Embarked', 'Cabin'])

remaining_columns = [col for col in X_train.columns if col not in ['Sex', 'Embarked', 'Cabin']]

final_column_names = list(encoded_column_names) + remaining_columns

encoded_dataset = pd.DataFrame(encoded_data, columns=final_column_names)

encoded_dataset.columns

X_train = encoded_dataset

In [28]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex_male    891 non-null    float64
 1   Embarked_Q  891 non-null    float64
 2   Embarked_S  891 non-null    float64
 3   Cabin_A     891 non-null    float64
 4   Cabin_B     891 non-null    float64
 5   Cabin_C     891 non-null    float64
 6   Cabin_D     891 non-null    float64
 7   Cabin_E     891 non-null    float64
 8   Cabin_F     891 non-null    float64
 9   Cabin_G     891 non-null    float64
 10  Pclass      891 non-null    float64
 11  Age         891 non-null    float64
 12  SibSp       891 non-null    float64
 13  Parch       891 non-null    float64
 14  Fare        891 non-null    float64
dtypes: float64(15)
memory usage: 104.5 KB


In [29]:
encoded_data = ct.transform(X_test)

encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['Sex', 'Embarked', 'Cabin'])

remaining_columns = [col for col in X_test.columns if col not in ['Sex', 'Embarked', 'Cabin']]

final_column_names = list(encoded_column_names) + remaining_columns

encoded_dataset = pd.DataFrame(encoded_data, columns=final_column_names)

encoded_dataset.columns

X_test = encoded_dataset

In [30]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Sex_male    418 non-null    float64
 1   Embarked_Q  418 non-null    float64
 2   Embarked_S  418 non-null    float64
 3   Cabin_A     418 non-null    float64
 4   Cabin_B     418 non-null    float64
 5   Cabin_C     418 non-null    float64
 6   Cabin_D     418 non-null    float64
 7   Cabin_E     418 non-null    float64
 8   Cabin_F     418 non-null    float64
 9   Cabin_G     418 non-null    float64
 10  Pclass      418 non-null    float64
 11  Age         418 non-null    float64
 12  SibSp       418 non-null    float64
 13  Parch       418 non-null    float64
 14  Fare        418 non-null    float64
dtypes: float64(15)
memory usage: 49.1 KB


In [31]:
X_train.describe()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.647587,0.08642,0.725028,0.016835,0.05275,0.066218,0.037037,0.037037,0.013468,0.004489,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.47799,0.281141,0.446751,0.128725,0.223659,0.248802,0.188959,0.188959,0.115332,0.06689,0.836071,13.002015,1.102743,0.806057,49.693429
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,22.0,0.0,0.0,7.9104
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,35.0,1.0,0.0,31.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [32]:
X_test.describe()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,0.636364,0.110048,0.645933,0.016746,0.043062,0.083732,0.0311,0.026316,0.014354,0.002392,2.26555,30.154603,0.447368,0.392344,35.561214
std,0.481622,0.313324,0.478803,0.128474,0.20324,0.277317,0.173797,0.160265,0.119088,0.048912,0.841838,12.636666,0.89676,0.981429,55.856789
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,23.0,0.0,0.0,7.8958
50%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,35.75,1.0,0.0,31.471875
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [33]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Random Forest

In [34]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [None]:
# Save predictions to submission format
submission = pd.DataFrame({
    'PassengerId': Id,
    'Survived': y_pred
    })

submission.to_csv('../submission/xgboost3.csv', index=False)