In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Read the Titanic training and test tables from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Show the first five training rows as a quick sanity check of the load.
print(train_data.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [3]:
# Summary statistics for the numeric columns.
print(train_data.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train_data.info()

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data colu

In [4]:
# Count the missing entries of every column once and reuse the result below.
null_counts = train_data.isnull().sum()
print(null_counts)

# Labels of the columns that contain at least one missing entry.
missing_value_columns = train_data.columns[null_counts > 0]

print("Columns with missing values:")
for column, n_missing in null_counts[missing_value_columns].items():
    print(f"{column}: {n_missing} missing values")


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Columns with missing values:
Age: 177 missing values
Cabin: 687 missing values
Embarked: 2 missing values


In [5]:
# Remove the Cabin column (687 of the 891 training rows have no value for it).
# Rebinding the result with errors='ignore' keeps this cell safe to re-run:
# the in-place version raises KeyError once Cabin has already been dropped.
train_data = train_data.drop(columns=['Cabin'], errors='ignore')

In [6]:
# Fill missing ages with the median age. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on the column selection:
# that chained in-place form triggers a FutureWarning in pandas 2.x and stops
# updating the frame once Copy-on-Write is enabled.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

# Fill the missing embarkation ports with the most frequent port. This is
# deterministic, so the notebook reproduces on every run; the previous unseeded
# np.random.choice over the unique ports gave different data on each execution
# and treated all ports as equally likely.
most_common_port = train_data['Embarked'].mode().iloc[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common_port)

print(train_data['Age'])
print(train_data['Embarked'])

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object


In [7]:
# Integer codes for the two categorical features used by the model.
# NOTE: Series.map yields NaN for any value missing from the mapping, so this
# cell must run exactly once on the raw string columns.
sex_codes = {'male': 0, 'female': 1}
port_codes = {'S': 0, 'C': 1, 'Q': 2}

train_data['Sex'] = train_data['Sex'].map(sex_codes)
train_data['Embarked'] = train_data['Embarked'].map(port_codes)

# Display both encoded columns.
for encoded_column in ('Sex', 'Embarked'):
    print(train_data[encoded_column])

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: int64
0      0
1      1
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    1
890    2
Name: Embarked, Length: 891, dtype: int64


In [8]:
# Predictor columns passed to the classifier.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

# Feature matrix and target vector taken from the cleaned training frame.
X = train_data.loc[:, features]
y = train_data.loc[:, 'Survived']


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# One seed shared by the split and the forest so the evaluation is repeatable.
RANDOM_STATE = 100

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit a 200-tree random forest on the training split and predict the hold-out rows.
model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')
# In the run recorded in this notebook the validation accuracy is about 82%.
# The model does somewhat better on non-survivors than on survivors.


print("Classification Report:")
print(classification_report(y_val, y_pred))
# How to read the recorded report (class 0 = did not survive, class 1 = survived):
# Precision: of the passengers predicted to survive, 80% actually survived; of those predicted not to survive, 83% did not survive.
# Recall: the model correctly identified 76% of the passengers who survived and 87% of those who did not survive.
# F1-score: the harmonic mean of precision and recall. It is 0.78 for survivors and 0.85 for non-survivors.
# Support: the number of actual instances in each class (104 did not survive, 75 survived).

print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_val, y_pred)
print(conf_matrix)
# Confusion matrix (rows = actual class, columns = predicted class), recorded run:
# True Negatives (90): passengers who did not survive and were correctly predicted as not survived.
# False Positives (14): passengers who did not survive but were incorrectly predicted as survived.
# False Negatives (18): passengers who survived but were incorrectly predicted as not survived.
# True Positives (57): passengers who survived and were correctly predicted as survived.



Validation Accuracy: 0.8212290502793296
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       104
           1       0.80      0.76      0.78        75

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Confusion Matrix:
[[90 14]
 [18 57]]
