In [1]:
# 1. Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults shared by every figure below: white-grid theme, 10x6 canvas
sns.set_style(style='whitegrid')
plt.rc('figure', figsize=(10, 6))

# 2. Loading the dataset
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='dataset/titanic.csv')

# Preview the first 5 rows (last expression of the cell, so it is rendered)
df.head(n=5)


In [2]:
# 3. Basic Information about Data
# A notebook cell only renders its *last* bare expression, so every summary
# is shown explicitly; otherwise all but the final one would be discarded.
# `display` is the IPython built-in available in every notebook kernel.
df.info()  # Structure, null values (prints directly, returns None)
display(df.describe())  # Statistical summary of numerical columns
display(df['Survived'].value_counts())  # Check survival distribution
display(df['Pclass'].value_counts())  # Check passenger class distribution
display(df['Sex'].value_counts())  # Check gender distribution


In [3]:
# 4. Handling Missing Values
# Compute the null mask once; it feeds both the per-column count and the heatmap.
missing_mask = df.isnull()

# Count missing values in each column. Shown explicitly because a bare
# expression in the middle of a cell is never rendered.
display(missing_mask.sum())

# Visualizing missing data (taken *before* any cleaning below)
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

# Handling missing values.
# Built as one chain that returns a new frame instead of mutating in place:
#   * `df['Age'].fillna(..., inplace=True)` is a chained in-place call that
#     warns on pandas 2.x and silently does nothing under Copy-on-Write.
#   * `errors='ignore'` keeps the cell re-runnable once 'Cabin' is already gone.
# Step order matches the original: the Age median is taken over all rows,
# before the rows with a missing 'Embarked' are removed.
df = (
    df.drop(columns='Cabin', errors='ignore')  # Drop 'Cabin' due to too many missing
      .assign(Age=lambda frame: frame['Age'].fillna(frame['Age'].median()))  # Fill missing 'Age' with median
      .dropna(subset=['Embarked'])  # Drop rows with missing 'Embarked'
)


In [4]:
# 5. Univariate Analysis
# One histogram per numeric column, all drawn on a single large figure
df.hist(color='steelblue', figsize=(20, 15), bins=20)
plt.gcf().suptitle('Distribution of Numerical Features')
plt.show()

# Side-by-side boxplots of Age and Fare to expose outliers
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df.loc[:, ['Age', 'Fare']], ax=box_ax)
box_ax.set_title('Boxplot for Age and Fare')
plt.show()


In [5]:
# 6. Bivariate Analysis
# Each seaborn call returns the Axes it drew on; the title is set through it.

# How the survival counts split across genders
gender_ax = sns.countplot(data=df, x='Survived', hue='Sex', palette='Set2')
gender_ax.set_title('Survival Count based on Gender')
plt.show()

# How the survival counts split across passenger classes
class_ax = sns.countplot(data=df, x='Survived', hue='Pclass', palette='Set1')
class_ax.set_title('Survival Count based on Pclass')
plt.show()

# Fare for each survival outcome, drawn as a bar plot
fare_ax = sns.barplot(data=df, x='Survived', y='Fare', palette='Blues')
fare_ax.set_title('Fare vs Survival')
plt.show()


In [6]:
# 7. Multivariate Analysis
# Checking correlations between numerical features.
# Select the numeric columns explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops non-numeric columns on its own and raises a ValueError when
# the frame still contains text columns (e.g. 'Sex'). `select_dtypes` works
# on old and new pandas versions alike.
correlation = df.select_dtypes(include='number').corr()

# Visualizing correlations using heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Pairplot for selected features, coloured by survival outcome
sns.pairplot(df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']], hue='Survived', palette='husl')
plt.suptitle('Pairplot Analysis', y=1.02)  # y > 1 lifts the title above the grid
plt.show()


In [7]:
# 8. Extra Visuals
# `value_counts()` orders its result by descending frequency, not by category
# value, so slice labels and colours must be derived from the result's index.
# Hard-coding them in an assumed order attaches them to the wrong slices
# whenever the frequency order differs from that assumption.

# Pie Chart: Survival Distribution
survival_counts = df['Survived'].value_counts()
survival_names = {0: 'Not Survived', 1: 'Survived'}
survival_colors = {0: 'red', 1: 'green'}
labels = [survival_names[outcome] for outcome in survival_counts.index]
plt.pie(
    survival_counts,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=[survival_colors[outcome] for outcome in survival_counts.index],
)
plt.title('Survival Distribution')
plt.show()

# Pie Chart: Gender Distribution
gender_counts = df['Sex'].value_counts()
gender_colors = {'female': 'blue', 'male': 'orange'}
gender_labels = [str(sex).capitalize() for sex in gender_counts.index]
plt.pie(
    gender_counts,
    labels=gender_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=[gender_colors[sex] for sex in gender_counts.index],
)
plt.title('Gender Distribution')
plt.show()


# 9. Key Observations:
- **Gender**: Females had a higher survival rate compared to males.
- **Passenger Class**: Passengers in 1st class had higher survival rates than those in 3rd class.
- **Age**: Younger passengers had a better chance of survival.
- **Fare**: Passengers who paid higher fares had a higher survival rate.
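- **Correlations**: `Survived` is positively correlated with `Fare` and negatively correlated with `Pclass` (a lower class number means a higher class); `Fare` and `Pclass` are themselves negatively correlated.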

---
# 10. Summary of Findings:
- **Key Insight**: Gender, class, and fare were strong indicators of survival.
- The project illustrated how missing data can be handled and how visualizing relationships between features can drive insights.
