# Reading Dataset

In [141]:
import pandas as pd

In [142]:
# Location of the Titanic passenger CSV that the rest of the notebook works on.
DATA_PATH = '/content/Titanic-Dataset.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Show the loaded DataFrame; as the last expression it becomes the cell output.
data

# Understanding the Dataset

In [None]:
# Dataset shape as a (rows, columns) tuple
data.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
data.describe()

In [None]:
# List every column name in the dataset
print(list(data.columns))

In [None]:
# Column names, non-null counts and data types (info() prints its report itself)
data.info()

In [None]:
# Count rows that are exact repeats of an earlier row
print(data.duplicated(keep='first').sum())

In [None]:
# Number of distinct values in each column
data.nunique()

# Data Cleaning

In [None]:
# Is there at least one missing value anywhere in the dataset?
print(data.isna().to_numpy().any())

In [None]:
# Total number of missing cells: per-column counts added together
missing_per_column = data.isna().sum()
print(missing_per_column.sum())

In [None]:
# Missing-value count for each column
data.isna().sum()

In [153]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Heatmap of the boolean missing-value mask (one cell per row/column pair)
ax = sns.heatmap(data.isna())
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Fill missing Age with the median age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data['Age']: that chained in-place call acts on a
# temporary column object, triggers a FutureWarning on pandas 2.x and leaves
# `data` unchanged when Copy-on-Write is active.
data['Age'] = data['Age'].fillna(data['Age'].median())

In [None]:
# Fill missing Embarked with the most frequent value (first entry of mode()).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data['Embarked']: that chained in-place call acts on
# a temporary column object, triggers a FutureWarning on pandas 2.x and leaves
# `data` unchanged when Copy-on-Write is active.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

In [157]:
# Cabin has too many missing values to be useful, so remove the column in place
data.drop('Cabin', axis=1, inplace=True)

# Univariate Analysis

In [None]:
# Survival counts (0 = did not survive, 1 = survived), drawn in slot 1 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 1)
sns.countplot(data=data, x='Survived', ax=ax)
ax.set_title('Survival Count')

In [None]:
# Passenger counts per ticket class, drawn in slot 2 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 2)
sns.countplot(data=data, x='Pclass', ax=ax)
ax.set_title('Passenger Class Distribution')

In [None]:
# Passenger counts per gender, drawn in slot 3 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 3)
sns.countplot(data=data, x='Sex', ax=ax)
ax.set_title('Gender Distribution')

In [None]:
# Age histogram (30 bins) with a KDE overlay, drawn in slot 4 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 4)
sns.histplot(data=data, x='Age', kde=True, bins=30, ax=ax)
ax.set_title('Age Distribution')

In [None]:
# Fare histogram (30 bins) with a KDE overlay, drawn in slot 5 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 5)
sns.histplot(data=data, x='Fare', kde=True, bins=30, ax=ax)
ax.set_title('Fare Distribution')

In [None]:
# Passenger counts per embarkation port, drawn in slot 6 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 6)
sns.countplot(data=data, x='Embarked', ax=ax)
ax.set_title('Embarkation Port')

# Bivariate Analysis

In [None]:
# Survival split within each gender, drawn in slot 1 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 1)
sns.countplot(data=data, x='Sex', hue='Survived', ax=ax)
ax.set_title('Survival by Gender')

In [None]:
# Survival split within each ticket class, drawn in slot 2 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 2)
sns.countplot(data=data, x='Pclass', hue='Survived', ax=ax)
ax.set_title('Survival by Class')

In [None]:
# Survival split within each embarkation port, drawn in slot 3 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 3)
sns.countplot(data=data, x='Embarked', hue='Survived', ax=ax)
ax.set_title('Survival by Embarkation Port')

In [None]:
# Age box plot for each survival outcome, drawn in slot 4 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 4)
sns.boxplot(data=data, x='Survived', y='Age', ax=ax)
ax.set_title('Age Distribution by Survival')

In [None]:
# Fare box plot for each survival outcome, drawn in slot 5 of a 2x3 grid.
# NOTE(review): if each cell renders its own figure, the other grid slots stay empty — confirm intended.
ax = plt.subplot(2, 3, 5)
sns.boxplot(data=data, x='Survived', y='Fare', ax=ax)
ax.set_title('Fare Distribution by Survival')

In [None]:
# Pairwise correlation between the target and the numeric features, shown as an annotated heatmap
numeric_cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
corr = data[numeric_cols].corr()
ax = sns.heatmap(corr, annot=True, cmap='YlGnBu')
ax.set_title('Correlation Heatmap')

# Data Preprocessing

In [170]:
# Encode categorical variables (one-hot encoding).
# Identifier / free-text columns are removed first: get_dummies() turns every
# object column into one dummy column per distinct value, so near-unique text
# columns would add a very large number of features that only identify
# individual passengers, and a row id carries no predictive meaning.
# NOTE(review): assumes the standard Titanic schema where PassengerId, Name and
# Ticket are such columns — confirm against the CSV. errors='ignore' keeps the
# cell working if any of them is absent.
id_like_cols = ['PassengerId', 'Name', 'Ticket']
features = data.drop(columns=id_like_cols, errors='ignore')
encode = pd.get_dummies(features, drop_first=False)  # keep all categories

In [171]:
# "Survived" is treated as the target; every other encoded column is a feature
y = encode["Survived"]
X = encode.drop(columns="Survived")

In [172]:
from sklearn.model_selection import train_test_split

In [173]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [174]:
from sklearn.preprocessing import StandardScaler

In [175]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Training

In [176]:
from sklearn.linear_model import LogisticRegression

In [177]:
# Fit a logistic regression classifier on the training split
# (iteration cap raised to 1000)
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

# Prediction & Accuracy

In [178]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

In [179]:
# Predicted class labels for the held-out test split
y_pred = model.predict(X=X_test)

In [None]:
# Side-by-side table of the true test labels and the model's predictions
data_pred = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
data_pred

In [181]:
# Test-set accuracy expressed as a percentage
accuracy = 100 * accuracy_score(y_test, y_pred)

In [None]:
# Print the accuracy followed by the per-class precision / recall / F1 report
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

In [None]:
# Confusion matrix as an annotated heatmap (integer counts in each cell)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# ROC curve and AUC, computed from the second column of predict_proba
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Model curve plus the dashed diagonal as a visual baseline
plt.figure(figsize=(6, 4))
ax = plt.gca()
ax.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
plt.show()