# Exploratory Data Analysis: Titanic Dataset

## 1. Data Loading and Initial Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the passenger table from the fixed upload location used by this notebook.
df = pd.read_csv("/home/ubuntu/upload/titanic.csv")

# Preview the first rows as a Markdown table (index column omitted).
print("### First 5 Rows of the Dataset")
print(df.head().to_markdown(index=False))

# The leading "\n" escape puts a blank line before the heading; the string
# must stay on one physical line because a raw line break inside a
# non-triple-quoted literal is a syntax error.
print("\n### Data Information")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()

# Summary statistics of the numeric columns, rendered as a Markdown table.
print("\n### Descriptive Statistics")
print(df.describe().to_markdown())

## 2. Data Quality Assessment

### Missing Values

In [None]:
# Count the missing entries in each column and render the counts as Markdown.
missing_per_column = df.isna().sum()
print(missing_per_column.to_markdown())

### Data Types

In [None]:
# Render the dtype of every column as a Markdown table.
dtype_table = df.dtypes.to_markdown()
print(dtype_table)

### Unique Values in Categorical Columns

In [None]:
# List the distinct values of every object-dtype column (the columns this
# notebook treats as categorical).
object_columns = df.select_dtypes(include=["object"]).columns
for column in object_columns:
    distinct_values = df[column].unique()
    print(f"\nUnique values for {column}:")
    print(distinct_values)

## 3. Distribution Analysis and Visualizations

### Histograms of Age and Fare

In [None]:
# Side-by-side histograms (with KDE overlay) of Age and Fare.
plt.figure(figsize=(12, 5))

# Each entry: (values to plot, panel title). Age has its missing entries
# dropped before plotting; Fare is passed through unchanged.
histogram_panels = [
    (df["Age"].dropna(), "Distribution of Age"),
    (df["Fare"], "Distribution of Fare"),
]
for panel_index, (values, heading) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, panel_index)
    sns.histplot(values, kde=True)
    plt.title(heading)

plt.tight_layout()
plt.show()

### Box Plots of Age and Fare

In [None]:
# Side-by-side vertical box plots of Age and Fare.
plt.figure(figsize=(12, 5))

# Each entry: (column to plot on the y axis, panel title).
boxplot_panels = [
    ("Age", "Box Plot of Age"),
    ("Fare", "Box Plot of Fare"),
]
for panel_index, (feature, heading) in enumerate(boxplot_panels, start=1):
    plt.subplot(1, 2, panel_index)
    sns.boxplot(y=df[feature])
    plt.title(heading)

plt.tight_layout()
plt.show()

### Survival Count

In [None]:
# Bar chart of the number of rows for each value of the "Survived" column.
plt.figure(figsize=(10, 6))
# countplot draws on the current axes and returns it, so the title can be
# set on the returned object.
survival_axes = sns.countplot(data=df, x="Survived")
survival_axes.set_title("Survival Count")
plt.show()

### Categorical Feature Counts

In [None]:
# Three side-by-side count plots: passengers per Sex, Pclass and Embarked.
plt.figure(figsize=(18, 5))

# Each entry: (column placed on the x axis, panel title).
count_panels = [
    ("Sex", "Count of Passengers by Sex"),
    ("Pclass", "Count of Passengers by Pclass"),
    ("Embarked", "Count of Passengers by Embarked Port"),
]
for panel_index, (feature, heading) in enumerate(count_panels, start=1):
    plt.subplot(1, 3, panel_index)
    sns.countplot(x=feature, data=df)
    plt.title(heading)

plt.tight_layout()
plt.show()

## 4. Outlier Detection and Analysis

In [None]:
# Report, for Age and Fare, the rows lying outside the 1.5 * IQR fences.
for col in ("Age", "Fare"):
    values = df[col]

    # Both quartiles in one call; unpacking yields the 25% and 75% values.
    Q1, Q3 = values.quantile([0.25, 0.75])
    IQR = Q3 - Q1

    # Fences sit 1.5 interquartile ranges beyond each quartile.
    whisker = 1.5 * IQR
    lower_bound = Q1 - whisker
    upper_bound = Q3 + whisker

    # Missing values compare False on both sides, so they are never flagged.
    beyond_fences = values.lt(lower_bound) | values.gt(upper_bound)
    outliers = df[beyond_fences]

    print(f"\nOutliers in {col}:")
    print(outliers[["PassengerId", col]].to_markdown(index=False))

## 5. Correlation and Relationship Analysis

### Correlation Matrix

In [None]:
# Annotated heatmap of pairwise correlations between the int64/float64 columns.
plt.figure(figsize=(10, 8))
numeric_columns = df.select_dtypes(include=["int64", "float64"])
correlations = numeric_columns.corr()
sns.heatmap(correlations, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

### Survival Rate by Categorical Features

In [None]:
# Three side-by-side bar plots of "Survived" aggregated per category
# (seaborn's barplot uses its default estimator) for Sex, Pclass and Embarked.
plt.figure(figsize=(18, 5))

# Each entry: (grouping column on the x axis, panel title).
rate_panels = [
    ("Sex", "Survival Rate by Sex"),
    ("Pclass", "Survival Rate by Pclass"),
    ("Embarked", "Survival Rate by Embarked Port"),
]
for panel_index, (feature, heading) in enumerate(rate_panels, start=1):
    plt.subplot(1, 3, panel_index)
    sns.barplot(x=feature, y="Survived", data=df)
    plt.title(heading)

plt.tight_layout()
plt.show()

## 6. Advanced Analysis and Insights

### Age Distribution by Sex and Survival

In [None]:
# Split violin plot of Age per Sex, with the two halves keyed by "Survived".
plt.figure(figsize=(10, 6))
# violinplot draws on the current axes and returns it, so the title can be
# set on the returned object.
sex_violin_axes = sns.violinplot(
    data=df,
    x="Sex",
    y="Age",
    hue="Survived",
    split=True,
)
sex_violin_axes.set_title("Age Distribution by Sex and Survival")
plt.show()

### Age Distribution by Pclass and Survival

In [None]:
# Split violin plot of Age per Pclass, with the two halves keyed by "Survived".
plt.figure(figsize=(10, 6))
# violinplot draws on the current axes and returns it, so the title can be
# set on the returned object.
pclass_violin_axes = sns.violinplot(
    data=df,
    x="Pclass",
    y="Age",
    hue="Survived",
    split=True,
)
pclass_violin_axes.set_title("Age Distribution by Pclass and Survival")
plt.show()