In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

In [15]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/Ladder(EDA).csv")
df.drop(["PassengerId","Cabin"],axis=1,inplace=True)
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [17]:
df.Survived.value_counts()*100/891

Unnamed: 0_level_0,count
Survived,Unnamed: 1_level_1
0,61.616162
1,38.383838


In [18]:
numeric_features = df.select_dtypes(include=["int64","float64"]).columns.tolist()
categorical_features = df.select_dtypes(include=["object"]).columns.tolist()
numeric_features.remove("Survived")

In [19]:
X = df.drop("Survived",axis=1)
y = df["Survived"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [20]:
numeric_transformer = Pipeline([
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler",StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ("onehot",OneHotEncoder(handle_unknown="ignore"))
])
preprocessing = ColumnTransformer(transformers=([
    ("num",numeric_transformer,numeric_features),
    ("cat",categorical_transformer,categorical_features)
]))

In [21]:
models = ["LogisticRegression","RandomForestClassifier","KNeighborsClassifier","SVC","DecisionTreeClassifier"]

for i in models:
  if i == "LogisticRegression":
    model = Pipeline([
        ("preprocessing",preprocessing),
        ("model",LogisticRegression())
    ])
  elif i == "RandomForestClassifier":
    model = Pipeline([
        ("preprocessing",preprocessing),
        ("model",RandomForestClassifier())
    ])
  elif i == "KNeighborsClassifier":
    model = Pipeline([
        ("preprocessing",preprocessing),
        ("model",KNeighborsClassifier())
    ])
  elif i == "SVC":
    model = Pipeline([
        ("preprocessing",preprocessing),
        ("model",SVC())
    ])

  elif i == "DecisionTreeClassifier":
    model = Pipeline([
        ("preprocessing",preprocessing),
        ("model",DecisionTreeClassifier())
    ])

  model.fit(X_train,y_train)
  y_pred = model.predict(X_test)
  print(f"Model: {i}")
  print(f"Accuracy: {accuracy_score(y_test,y_pred)}")
  print(f"Confusion Matrix: \n{confusion_matrix(y_test,y_pred)}")
  print(f"Classification Report: \n{classification_report(y_test,y_pred)}")
  print("-"*50)

Model: LogisticRegression
Accuracy: 0.8156424581005587
Confusion Matrix: 
[[91 14]
 [19 55]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.81      0.82      0.81       179

--------------------------------------------------
Model: RandomForestClassifier
Accuracy: 0.8100558659217877
Confusion Matrix: 
[[93 12]
 [22 52]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       105
           1       0.81      0.70      0.75        74

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

--------------------------------------------------
Model:

In [22]:
# After Hyper parameter optimization of KNeighborsClaffifier
model = Pipeline([
    ("preprocessing",preprocessing),
    ("model",KNeighborsClassifier(metric='manhattan', n_neighbors= 11, p= 1, weights= 'distance'))
])

model.fit(X_train,y_train)
y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test,y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test,y_pred)}")
print(f"Classification Report: \n{classification_report(y_test,y_pred)}")

Accuracy: 0.8379888268156425
Confusion Matrix: 
[[95 10]
 [19 55]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.83      0.90      0.87       105
           1       0.85      0.74      0.79        74

    accuracy                           0.84       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.84      0.84      0.84       179

