# Data Ingestion


In [73]:
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [74]:
data = pd.read_csv("C:\Certifications\Codsoft Intern\Task 1/tested.csv")

# Data Processing and Cleaning


In [75]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [76]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [77]:
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [78]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [79]:
data.shape

(418, 12)

In [80]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


# Handling Null values

## Age

In [81]:
mean_male = data.loc[data['Sex'] == 'male', 'Age'].mean()
mean_male

30.27273170731707

In [82]:
mean_female = data.loc[data["Sex"]=='female' , "Age"].mean()
mean_female

30.27236220472441

In [83]:
data['Age'].fillna(30.272, inplace=True) # males and females had same mean age so i filled the null values with mean

In [84]:
print(data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [87]:
data['Fare'].fillna(30.272, inplace=True)

In [88]:
print(data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


# Columns to Consider Dropping

In [89]:
data["Sex"]=data['Sex'].map({'male': 0, 'female': 1})

In [90]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


In [91]:
correlation_matrix = data.corr()
correlation_matrix

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.023245,-0.026751,-0.023245,-0.030874,0.003818,0.04308,0.008317
Survived,-0.023245,1.0,-0.108615,1.0,-1e-05,0.099943,0.15912,0.191554
Pclass,-0.026751,-0.108615,1.0,-0.108615,-0.440788,0.001087,0.018721,-0.576813
Sex,-0.023245,1.0,-0.108615,1.0,-1e-05,0.099943,0.15912,0.191554
Age,-0.030874,-1e-05,-0.440788,-1e-05,1.0,-0.079533,-0.045073,0.326249
SibSp,0.003818,0.099943,0.001087,0.099943,-0.079533,1.0,0.306895,0.171601
Parch,0.04308,0.15912,0.018721,0.15912,-0.045073,0.306895,1.0,0.230091
Fare,0.008317,0.191554,-0.576813,0.191554,0.326249,0.171601,0.230091,1.0


In [92]:
fig = px.imshow(correlation_matrix, color_continuous_scale='RdBu_r')
fig.show()

In [93]:
data.drop('Cabin', axis=1, inplace=True) # too many missing values


In [94]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,0,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,S
2,894,0,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,Q
3,895,0,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,S


# Model Building

In [95]:
X = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Model Evaluation

In [96]:
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:\n', confusion)
print('Classification Report:\n', classification)

Accuracy: 1.00
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In conclusion, the model performs exceptionally well, properly predicting both classes with a perfect accuracy of 1.00 and no false positives or false negatives. This may indicate that the model is exhibiting very high performance on your test set. It's crucial to make sure the model hasn't overfit the training set and that this performance level is constant. For a more comprehensive evaluation of the model's generalization performance, you might want to think about cross-validation and testing on a bigger and more varied dataset.

# Mode Testing 

## Testing for male with same values

In [100]:
my_passenger = pd.DataFrame({
    'Pclass': [1],        
    'Sex': ['0'],      # male == 0 and female == 1
    'Age': [35],          
    'SibSp': [1],       
    'Parch': [0],        
    'Fare': [50.0]        
})

prediction = model.predict(my_passenger)

if prediction[0] == 1:
    print("The passenger has survived.")
else:
    print("The passenger died.")

The passenger died.


## Testing for Female with same values

In [101]:
my_passenger = pd.DataFrame({
    'Pclass': [1],        
    'Sex': ['1'],      # male == 0 and female == 1
    'Age': [35],          
    'SibSp': [1],       
    'Parch': [0],        
    'Fare': [50.0]        
})

prediction = model.predict(my_passenger)

if prediction[0] == 1:
    print("The passenger has survived.")
else:
    print("The passenger died.")

The passenger has survived.
