In [None]:
import pandas as pd
import seaborn as sns

# Fetch seaborn's example "titanic" dataset and preview its first five rows.
titanic_data = sns.load_dataset(name="titanic")
titanic_data.head(5)

In [None]:

!pip install sweetviz
import sweetviz as sv

titanic_report = sv.analyze(titanic_data)
titanic_report.show_html("titanic_sweetviz_report.html")

In [None]:
from IPython.display import IFrame

# Write the report file again so the frame below embeds a current copy.
titanic_report.show_html("titanic_sweetviz_report.html")

# Last expression of the cell: render the saved HTML inline (1000 x 600).
IFrame("titanic_sweetviz_report.html", 1000, 600)

In [None]:
# Descriptive summary statistics of the frame.
titanic_data.describe()

In [None]:
# Number of missing values in each column (isna is the same method as isnull).
titanic_data.isna().sum()

In [None]:
import pandas as pd

# Impute missing ages with the mean of the observed ages.
age_mean = titanic_data["age"].mean()

# Assign the filled Series back to the frame. Calling fillna(..., inplace=True) on
# titanic_data["age"] is chained assignment: pandas warns about it, and under
# copy-on-write it only modifies a temporary, leaving the frame unchanged.
titanic_data["age"] = titanic_data["age"].fillna(age_mean)

# Sanity check: the remaining number of missing ages should be 0.
titanic_data["age"].isnull().sum()


In [None]:
# Remove the "deck" column from the frame in place.
titanic_data.drop(columns=["deck"], inplace=True)

In [None]:
# Inspect the current column labels.
titanic_data.columns

In [None]:
# Drop, in place, every COLUMN that still contains at least one missing value.
# NOTE(review): axis=1 removes whole columns, not rows — confirm this is intended.
titanic_data.dropna(axis="columns", inplace=True)

In [None]:
# Re-count missing values per column after the clean-up.
titanic_data.isna().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode "sex" into dense indicator columns, one per category.
onehot_encoder = OneHotEncoder(sparse_output=False)
new_data = onehot_encoder.fit_transform(titanic_data[["sex"]])
feature_names = onehot_encoder.get_feature_names_out(["sex"])

# Reuse the source frame's index so the axis=1 concat below lines up row for row.
# A default RangeIndex would misalign (and introduce NaN-filled rows) whenever
# titanic_data's index is not exactly 0..n-1, e.g. after rows have been removed.
new_df = pd.DataFrame(new_data, columns=feature_names, index=titanic_data.index)

# Replace the original string column with its encoded columns.
titanic_data = pd.concat([titanic_data.drop(columns=["sex"]), new_df], axis=1)

print(titanic_data.head())


In [None]:
# Remove the "alive" and "class" columns from the frame in place.
titanic_data.drop(columns=["alive", "class"], inplace=True)



In [None]:
# Survival counts, with bars split by the adult_male flag.
sns.countplot(data=titanic_data, x="survived", hue="adult_male")

In [None]:
# Survival counts, with bars split by the alone flag.
sns.countplot(data=titanic_data, x="survived", hue="alone")

In [None]:
# Survival counts, with bars split by passenger class.
sns.countplot(data=titanic_data, x="survived", hue="pclass")

In [None]:
# Inspect the column labels available for modelling.
titanic_data.columns

## Training the Logistic Regression model

In [None]:
# Keep only the target and the three predictors used by the logistic model.
Logistic_data = titanic_data.loc[:, ["survived", "pclass", "age", "sex_male"]]
Logistic_data

In [None]:
from sklearn.linear_model import LogisticRegression
# Classifier created with scikit-learn's default settings; not fitted in this cell.
log_reg_cls = LogisticRegression()


In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation between the target and the selected predictors.
correlation = Logistic_data.corr()

# Annotated heatmap of the correlation matrix, values shown to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation, cmap='coolwarm', fmt='.2f', annot=True)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Target vector and feature matrix for the logistic model.
y = Logistic_data.loc[:, 'survived']
X = Logistic_data.loc[:, ['pclass', 'age', 'sex_male']]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=40, test_size=0.25
)
x_train

In [None]:
# Fit the logistic regression on the training split.
log_reg_cls.fit(X=x_train, y=y_train)

In [None]:
# Predicted labels for the held-out split.
y_pred = log_reg_cls.predict(X=x_test)

In [None]:
from sklearn.metrics import classification_report

# classification_report takes (y_true, y_pred). With the arguments swapped, the
# per-class precision and recall figures are exchanged, so ground truth goes first.
print(classification_report(y_test, y_pred))

In [None]:
# Training-split report, for comparison with the test-split report. The predictions
# must be made on x_train and compared with y_train, ground truth first
# (classification_report takes y_true, y_pred).
y_train_pred = log_reg_cls.predict(x_train)
print(classification_report(y_train, y_train_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Pair every penalty with a solver that implements it. The estimator's default
# solver (lbfgs) supports only l2 or no penalty, so a flat grid over all four
# penalties makes the l1/elasticnet candidates fail to fit. Recent scikit-learn
# releases also reject the string 'none'; "no penalty" is spelled None.
c_values = [0.01, 0.1, 1, 10, 100]
param_grid = [
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': c_values},
    # C has no effect without regularisation, so it is not searched here.
    {'solver': ['lbfgs'], 'penalty': [None]},
    {'solver': ['liblinear'], 'penalty': ['l1'], 'C': c_values},
    # elasticnet is only offered by saga and requires l1_ratio. saga converges
    # slowly on unscaled features, hence the larger iteration budget.
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5],
     'C': c_values, 'max_iter': [5000]},
]

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(log_reg_cls, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)



In [None]:

# Estimator carrying the winning parameter combination from the search.
best_model = grid_search.best_estimator_

# Score that estimator on the held-out split; the text report is printed later.
y_pred = best_model.predict(X=x_test)
cls_rep = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Show the parameter combination selected by the grid search.
print(f"Best Hyperparameters: {grid_search.best_params_}")


In [None]:
# Show the classification report stored in cls_rep.
print(f"classification: {cls_rep}")

## Naive Bayes model

In [None]:
# Display the prepared frame before building the next model.
titanic_data

In [None]:
# Target plus the three predictors used for the Naive Bayes model.
naive_data = titanic_data.loc[:, ["survived", "pclass", "age", "sex_male"]]

In [None]:
# Pearson correlation matrix (the pandas default) of the selected columns.
correlation = naive_data.corr(method="pearson")
correlation

In [None]:
# Annotated heatmap of the correlation matrix, values shown to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation, cmap='coolwarm', fmt='.2f', annot=True)
plt.title('Correlation Matrix')
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB

# Target is "survived"; every other column of naive_data is a feature.
y = naive_data["survived"]
X = naive_data.drop(columns="survived")

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Create the Gaussian Naive Bayes classifier and fit it on the training split
# (fit returns the estimator itself).
gnb = GaussianNB().fit(X_train, y_train)

# Predict the held-out split and print the per-class metrics.
y_pred = gnb.predict(X_test)

cls_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(cls_rep)

## KNN

In [None]:
# Target plus the three predictors used for the KNN model.
knn_data = titanic_data.loc[:, ["survived", "pclass", "age", "sex_male"]]


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Target is "survived"; every other column of knn_data is a feature.
y = knn_data["survived"]
X = knn_data.drop(columns="survived")

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# 3-nearest-neighbours classifier, fitted on the training split
# (fit returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Labels predicted for the held-out split.
y_pred = knn.predict(X=X_test)

In [None]:
# Per-class precision/recall/F1 for the KNN predictions.
cls_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(cls_rep)

In [None]:
# Candidate neighbourhood sizes to compare.
param_grid = {'n_neighbors': [2, 3]}

# 10-fold cross-validated search over the grid, scored on accuracy.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)
grid_search.fit(X_train, y_train)

In [None]:
# Estimator selected by the grid search; displayed as the cell output.
best_model = grid_search.best_estimator_
best_model


In [None]:
# Evaluate the selected KNN estimator on the held-out split.
y_pred = best_model.predict(X=X_test)
cls_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(cls_rep)


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# decision_data is another name for the same frame object, not a copy.
decision_data = titanic_data

# Features: everything except the target and the "who" column.
y = decision_data['survived']
X = decision_data.drop(columns=['survived', "who"])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
# Baseline decision tree (fixed seed, otherwise default settings), fitted on the
# training split; fit returns the estimator itself.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Report per-class metrics on the held-out split.
y_pred = dt_classifier.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Tree-size controls to search over.
param_grid = {
    'max_depth': [None, 3, 5, 8, 11],
    'min_samples_split': [5, 8, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 8-fold cross-validated search scored on accuracy; fitting happens in the next cell.
grid_search = GridSearchCV(
    dt_classifier,
    param_grid,
    scoring='accuracy',
    cv=8,
)


In [None]:
# Run the search on the training split (fit returns the search object itself)
# and keep the selected estimator.
best_model = grid_search.fit(X_train, y_train).best_estimator_


In [None]:
# Held-out predictions from the selected tree.
y_pred = best_model.predict(X=X_test)

# Show which parameter combination won the search.
print(f"Best Hyperparameters: {grid_search.best_params_}")



In [None]:
# Per-class metrics for the selected tree on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))


In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Plot the decision tree
plt.figure(figsize=(15, 10))
# Pass the feature names as a plain list: some scikit-learn releases validate
# feature_names as a list and reject a pandas Index.
plot_tree(
    best_model,
    filled=True,
    feature_names=list(X.columns),
    class_names=['Not Survived', 'Survived'],
)
plt.title("Decision Tree Visualization")
plt.show()


## Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# svm_data is another name for the same frame object, not a copy.
svm_data = titanic_data

# Features: everything except the target and the "who" column.
y = svm_data['survived']
X = svm_data.drop(columns=['survived', "who"])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training split only, then apply the same
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Polynomial-kernel SVM fitted on the scaled training data
# (fit returns the estimator itself).
svm_classifier = SVC(kernel='poly', random_state=42).fit(X_train_scaled, y_train)

y_pred = svm_classifier.predict(X_test_scaled)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Search over regularisation strength, kernel type and polynomial degree.
param_grid = {
    'C': [1, 10, 100],
    'kernel': ['poly', "sigmoid"],
    'degree': [2, 3, 4],
}

# 5-fold cross-validated search on the scaled training data, scored on accuracy.
grid_search = GridSearchCV(
    svm_classifier,
    param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Drop the target and the string-valued "who" column, as the decision-tree and SVM
# cells do. scikit-learn estimators need numeric input, and leaving "who" in the
# feature matrix makes fit() raise a ValueError.
X = decision_data.drop(['survived', "who"], axis=1)
y = decision_data['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline forest with default hyperparameters and a fixed seed.
rf_classifier = RandomForestClassifier(random_state=42)

rf_classifier.fit(X_train, y_train)

# Report per-class metrics on the held-out split.
y_pred = rf_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Forest size, tree-size controls and per-split feature sampling to search over.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# 7-fold cross-validated search scored on accuracy; fitting happens in the next cell.
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='accuracy',
    cv=7,
)



In [None]:
# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Keep the selected estimator and show the winning parameter combination.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


In [None]:
# GridSearchCV refits best_estimator_ on the whole training split by default
# (refit=True), so best_model is already trained and a second fit is redundant.
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))