In [294]:
# IMPORTING THE DEPENDENCIES

In [292]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [232]:
# DATA COLLECTION AND PREPROCESSING

In [234]:

train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

In [236]:
train_df.shape

(891, 12)

In [238]:
test_df.shape

(418, 11)

In [240]:
train_miss_value = train_df.isnull().sum()
test_miss_value = test_df.isnull().sum()




In [242]:
print(train_miss_value)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [244]:
print(test_miss_value)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [246]:
# handling missing value on train data

# Fill missing values in 'Age' with the median
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)

# Create a binary feature for 'Cabin' missing values
train_df['HasCabin'] = train_df['Cabin'].notnull().astype(int)

# Fill missing values in 'Embarked' with the most frequent value
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)


In [248]:
print(train_df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
HasCabin         0
dtype: int64


In [250]:
train_df.drop('Cabin', axis=1, inplace=True)

In [252]:
print(train_df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
HasCabin       0
dtype: int64


In [254]:
train_df.head

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

In [256]:
print(train_df[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age']].head())

   PassengerId  Survived  Pclass     Sex   Age
0            1         0       3    male  22.0
1            2         1       1  female  38.0
2            3         1       3  female  26.0
3            4         1       1  female  35.0
4            5         0       3    male  35.0


In [258]:
# HANDLING MISSING VALUE ON TEST DATA

# Fill missing values in 'Age' and 'Fare'
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

# Create a binary feature for 'Cabin'
test_df['HasCabin'] = test_df['Cabin'].notnull().astype(int)

# Drop the original 'Cabin' column
# test_data.drop('Cabin', axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)


In [260]:
test_df.drop('Cabin', axis=1, inplace=True)

In [262]:
print(test_df.isnull().sum())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
HasCabin       0
dtype: int64


In [264]:
# Note, male = 0 , female = 1

train_df.loc[train_df['Sex'] == 'male', 'Sex',] = 0
train_df.loc[train_df['Sex'] == 'female', 'Sex',] = 1

X = train_df.drop(['PassengerId', 'Survived', 'Name', 'Embarked', 'Ticket'], axis=1)
Y = train_df['Survived']

In [266]:
test_df.loc[test_df['Sex'] == 'male', 'Sex',] = 0
test_df.loc[test_df['Sex'] == 'female', 'Sex',] = 1

X_test = test_df.drop(['PassengerId', 'Name', 'Embarked', 'Ticket'], axis=1)


In [268]:
print(Y)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [270]:
print(X)

     Pclass Sex   Age  SibSp  Parch     Fare  HasCabin
0         3   0  22.0      1      0   7.2500         0
1         1   1  38.0      1      0  71.2833         1
2         3   1  26.0      0      0   7.9250         0
3         1   1  35.0      1      0  53.1000         1
4         3   0  35.0      0      0   8.0500         0
..      ...  ..   ...    ...    ...      ...       ...
886       2   0  27.0      0      0  13.0000         0
887       1   1  19.0      0      0  30.0000         1
888       3   1  28.0      1      2  23.4500         0
889       1   0  26.0      0      0  30.0000         1
890       3   0  32.0      0      0   7.7500         0

[891 rows x 7 columns]


In [272]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,HasCabin
0,3,0,34.5,0,0,7.8292,0
1,3,1,47.0,1,0,7.0000,0
2,2,0,62.0,0,0,9.6875,0
3,3,0,27.0,0,0,8.6625,0
4,3,1,22.0,1,1,12.2875,0
...,...,...,...,...,...,...,...
413,3,0,27.0,0,0,8.0500,0
414,1,1,39.0,0,0,108.9000,1
415,3,0,38.5,0,0,7.2500,0
416,3,0,27.0,0,0,8.0500,0


In [278]:
# SPLITTING DATA INTO TEST AND TRAIN

In [280]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)


In [282]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)

In [284]:
# MODEL EVALUATION 

In [286]:
Y_pred = model.predict(X_val)
print('Validation Accuracy:', accuracy_score(Y_val, Y_pred))

Validation Accuracy: 0.7932960893854749


In [288]:
X_test.head

<bound method NDFrame.head of      Pclass Sex   Age  SibSp  Parch      Fare  HasCabin
0         3   0  34.5      0      0    7.8292         0
1         3   1  47.0      1      0    7.0000         0
2         2   0  62.0      0      0    9.6875         0
3         3   0  27.0      0      0    8.6625         0
4         3   1  22.0      1      1   12.2875         0
..      ...  ..   ...    ...    ...       ...       ...
413       3   0  27.0      0      0    8.0500         0
414       1   1  39.0      0      0  108.9000         1
415       3   0  38.5      0      0    7.2500         0
416       3   0  27.0      0      0    8.0500         0
417       3   0  27.0      1      1   22.3583         0

[418 rows x 7 columns]>

In [290]:
# BUILDING PREDICTIVE SYSTEM

In [218]:
test_predictions = model.predict(X_test)

In [220]:
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions
})
submission.to_csv('submission.csv', index=False)
