# Import necessary libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [2]:
# Read the passenger table from the CSV that sits next to the notebook.
DATA_PATH = 'titanic.csv'
df = pd.read_csv(DATA_PATH)

# Data Preprocessing
# Fill missing 'Age' values with the column median

In [3]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['Age'] selection: the in-place form acts on an intermediate object, is
# deprecated in pandas 2.x, and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Fill missing 'Embarked' values with the mode

In [4]:
# Assign the filled column back rather than using fillna(inplace=True) on the
# df['Embarked'] selection, which is deprecated in pandas 2.x and has no effect
# on df under Copy-on-Write. mode() returns a Series; [0] takes its first entry.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop the 'Cabin' column

In [5]:
# Rebind df instead of dropping in place, and tolerate an already-missing
# column so that re-running this cell does not raise KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

# Verify missing values are handled

In [6]:
# Per-column count of missing entries that remain after the steps above.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Encoding Categorical Variables
# One-hot encode 'Sex' and 'Embarked' columns

In [7]:
# Replace each listed categorical column with indicator columns; drop_first
# omits one level per column so the indicators are not redundant.
categorical_cols = ['Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
print(df.head())

# Feature Scaling
# Initialize the scaler

In [8]:
# Zero-mean / unit-variance scaler, spelled out with its default settings.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

# Scale 'Age' and 'Fare'

In [9]:
# Standardise the two continuous columns and write them back into df.
# NOTE(review): this fit uses every row of df, i.e. it runs before the
# train/test split further down the notebook.
numeric_cols = ['Age', 'Fare']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print(df[numeric_cols].head())

# Model Building
# Define feature matrix and target vector

In [10]:
# Target is the 'Survived' column; the feature matrix is everything else
# except the identifier / free-text columns listed here.
non_feature_cols = ['PassengerId', 'Name', 'Lname', 'Ticket', 'Survived']
y = df['Survived']
X = df.drop(columns=non_feature_cols)

# Split data into training and testing sets

In [11]:
# Hold out 20% of the rows; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-standardise 'Age' and 'Fare' with statistics from the training rows only,
# so the held-out rows do not influence the scaling. Standardisation is an
# affine map, so this yields the same values as fitting on the raw training
# data even when these columns were already standardised on the full dataset.
split_scaler = StandardScaler()
X_train, X_test = X_train.copy(), X_test.copy()  # own the data before assigning
X_train[['Age', 'Fare']] = split_scaler.fit_transform(X_train[['Age', 'Fare']])
X_test[['Age', 'Fare']] = split_scaler.transform(X_test[['Age', 'Fare']])

# Initialize and train the model

In [12]:
# Raise the iteration cap above the default of 100 so the solver can run to
# convergence; a ConvergenceWarning would not be shown because warnings are
# silenced globally at the top of the notebook.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions

In [13]:
# Hard class labels for the held-out rows.
y_pred = model.predict(X=X_test)

# Evaluate the model

In [14]:
# Three views of the same held-out predictions: per-class text report,
# count matrix (rows = actual, columns = predicted), and overall accuracy.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [15]:
# Emit the three summaries in one call, one per line group.
print(
    f"Accuracy: {accuracy}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

# Import necessary libraries for visualization

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score

# Plot Confusion Matrix

In [17]:
# Annotated heatmap of the confusion matrix, drawn on an explicit Axes.
class_labels = ['Not Survived', 'Survived']
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Plot ROC Curve

In [18]:
# Scores for the ROC analysis: take column 1 of predict_proba, i.e. the
# probability of the second entry in model.classes_.
class_proba = model.predict_proba(X_test)
y_prob = class_proba[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

In [19]:
# ROC curve with the chance diagonal for reference, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom above TPR = 1
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

# Feature Importance

In [20]:
# Pair each feature column with its fitted coefficient (row 0 of coef_) and
# order the table from the largest coefficient to the smallest.
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_[0]})
    .sort_values(by='Coefficient', ascending=False)
)

In [21]:
# Horizontal bars of the coefficient table, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=feature_importance, x='Coefficient', y='Feature', ax=ax)
ax.set_title('Feature Importance')
plt.show()