# Load the dataset

In [27]:
import os
import pandas as pd

# Location of the Titanic CSV. The original machine-specific path stays as the
# default so existing runs behave the same; set the TITANIC_CSV environment
# variable to point at the file on any other machine.
DATA_PATH=os.environ.get("TITANIC_CSV","C:\\Users\\geeth\\Downloads\\Titanic-Dataset.csv")
df=pd.read_csv(DATA_PATH)
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(891, 12)

# Preprocessing the data

In [29]:
# Count the null entries in every column
missing_values=df.isna().sum()
missing_values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [30]:
#Encode categorical variables as integer codes
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance: the cells below call fit_transform on it once per
# column, so after each call it only reflects the column it was fitted on last.
le=LabelEncoder()

In [31]:
# Replace the "Sex" labels with integer codes
sex_codes=le.fit(df["Sex"]).transform(df["Sex"])
df["Sex"]=sex_codes

In [32]:
# Frequency of each embarkation port
embarked_counts=df["Embarked"].value_counts()
print(embarked_counts)

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [33]:
# Fill the missing "Embarked" entries with "S", the most frequent port in the counts printed above
df.loc[df["Embarked"].isna(),"Embarked"]="S"

In [34]:
# Replace the "Embarked" labels with integer codes
embarked_codes=le.fit_transform(df["Embarked"])
df["Embarked"]=embarked_codes

In [35]:
# Handling the missing values in the "Age" column:
# separate the rows with and without an age so a model can be fit on the known ones
age_is_null=df["Age"].isna()
age_missing=df.loc[age_is_null]
age_not_missing=df.loc[~age_is_null]

In [36]:
#Calculate the correlation matrix over the numeric columns only.
#The frame still holds string columns at this point; without numeric_only=True
#pandas 1.5 emits a FutureWarning here and pandas 2.x raises a ValueError.
correlation_matrix=df.corr(numeric_only=True)

#Rank the features by the absolute value of their correlation with "Age"
correlation_with_age=correlation_matrix["Age"].abs().sort_values(ascending=False)

#Exclude "Age" itself by label, so the result does not rely on it sorting first
selected_features=correlation_with_age.drop("Age")

selected_features

  correlation_matrix=df.corr()


Pclass         0.369226
SibSp          0.308247
Parch          0.189119
Fare           0.096067
Sex            0.093254
Survived       0.077221
PassengerId    0.036847
Embarked       0.030394
Name: Age, dtype: float64

In [37]:
# Predictors for the age model: the four columns with the largest
# absolute correlation with "Age" in the ranking shown above
features=[
    "Pclass",
    "SibSp",
    "Parch",
    "Fare",
]

In [38]:

# Fit a regressor for "Age" on the rows whose age is known
from sklearn.ensemble import RandomForestRegressor
age_train_X=age_not_missing[features]
age_train_y=age_not_missing["Age"]
# Seed pinned so the fitted forest is repeatable
age_model=RandomForestRegressor(n_estimators=100,random_state=42)
age_model.fit(age_train_X,age_train_y)

In [39]:
#Predict the missing ages and write them back into df.
#Without the write-back the predictions are never used and "Age" keeps its
#nulls when the notebook is run top to bottom on a fresh kernel.
predicted_ages=age_model.predict(age_missing[features])
# age_missing was sliced from df, so its index labels address the same rows in df
df.loc[age_missing.index,"Age"]=predicted_ages

In [15]:
#Drop the column "Cabin".
#Rebinding df with errors="ignore" (instead of inplace=True) lets this cell be
#re-run without raising KeyError once the column is already gone.
df=df.drop(columns=["Cabin"],errors="ignore")

In [16]:
# Feature scaling: standardise the two continuous columns
# NOTE(review): the scaler is fit on every row before the train/test split
# further down, so statistics of the test rows feed into the scaling — confirm
# this is acceptable.
from sklearn.preprocessing import StandardScaler
scaled_columns=["Age","Fare"]
scaler=StandardScaler()
df[scaled_columns]=scaler.fit_transform(df[scaled_columns])

# Display the preprocessed data

In [17]:
# Preview the first rows of the preprocessed frame
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,-0.562047,1,0,A/5 21171,-0.502445,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.613594,1,0,PC 17599,0.786845,0
2,3,1,3,"Heikkinen, Miss. Laina",0,-0.268137,0,0,STON/O2. 3101282,-0.488854,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.393162,1,0,113803,0.42073,2
4,5,0,3,"Allen, Mr. William Henry",1,0.393162,0,0,373450,-0.486337,2


In [18]:
# Dimensions after preprocessing as (rows, columns)
df.shape

(891, 11)

In [19]:
# Re-count the nulls per column now that preprocessing is done
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# Feature Selection

In [20]:
# Target: the survival label
y=df["Survived"]
# Predictors: every remaining column except the target and the three columns listed with it
X=df.drop(columns=["Survived","PassengerId","Name","Ticket"])

# Split the data

In [21]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
split_parts=train_test_split(X,y,test_size=0.2,random_state=42)
X_train,X_test,y_train,y_test=split_parts

# Train the Random Forest Model

In [22]:
from sklearn.ensemble import RandomForestClassifier
# Default hyper-parameters; only the seed is pinned
random_forest=RandomForestClassifier(random_state=42)
# Fit on the training split
random_forest.fit(X=X_train,y=y_train)

# Making the predictions

In [23]:
# Predicted survival labels for the held-out rows
y_pred=random_forest.predict(X_test)
# Show the full prediction array
y_pred

array([1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1], dtype=int64)

# Evaluate the model

In [24]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
def evaluate(y_test,y_pred):
    """Score predictions against the true labels.

    Parameters
    ----------
    y_test : true labels
    y_pred : predicted labels

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each obtained by calling the
        matching scikit-learn metric with ``(y_test, y_pred)``.
    """
    return (
        accuracy_score(y_test,y_pred),
        precision_score(y_test,y_pred),
        recall_score(y_test,y_pred),
        f1_score(y_test,y_pred),
    )

# Evaluate the Random Forest Model

In [25]:
# Tuple of (accuracy, precision, recall, f1) for the random forest predictions
random_forest_metrics=evaluate(y_test,y_pred)

# Display the results

In [26]:
# Labels in the order evaluate() returns its values: accuracy, precision, recall, f1.
# The precision label was previously misspelled "Precison:".
metric_labels=("Accuracy:","Precision:","Recall:","F1-Score:")
print("Random Forest Metrics:")
for label,value in zip(metric_labels,random_forest_metrics):
    print(label,value)

Random Forest Metrics:
Accuracy: 0.8268156424581006
Precison: 0.8028169014084507
Recall: 0.7702702702702703
F1-Score: 0.7862068965517242
