In [None]:
import pandas as pd

# Read the three input CSV files (paths are relative to the working directory),
# in the same order: train, test, gender submission.
train_df, test_df, gender_submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

# Preview the first rows of every frame under its own heading.
for heading, frame in (
    ("Train DataFrame:", train_df),
    ("\nTest DataFrame:", test_df),
    ("\nGender Submission DataFrame:", gender_submission_df),
):
    print(heading)
    print(frame.head())


In [None]:
# Report the number of missing values per column before any imputation.
print("Missing values in Train DataFrame:")
print(train_df.isnull().sum())

print("\nMissing values in Test DataFrame:")
print(test_df.isnull().sum())

# Fill missing 'Age' values with each frame's own median.
# The result is assigned back to the column instead of calling
# `df['Age'].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, raises a FutureWarning on pandas 2.x and leaves the DataFrame
# unchanged under Copy-on-Write (the default from pandas 3.0).
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())

# Fill missing 'Embarked' values in the train frame with the most frequent value.
# `.mode()` returns a Series; `[0]` picks its first entry.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of passenger ages in the train frame with a KDE curve overlaid.
age_ax = sns.histplot(data=train_df, x='Age', kde=True)
age_ax.set(
    title='Age Distribution in Train Dataset',
    xlabel='Age',
    ylabel='Frequency',
)
plt.show()


In [None]:
# Histogram of ticket fares in the train frame with a KDE curve overlaid.
fare_ax = sns.histplot(data=train_df, x='Fare', kde=True)
fare_ax.set(
    title='Fare Distribution in Train Dataset',
    xlabel='Fare',
    ylabel='Frequency',
)
plt.show()


In [None]:
# Plot survival rate by 'Sex'.
# barplot shows the mean of 'Survived' for each distinct 'Sex' value.
sns.barplot(x='Sex', y='Survived', data=train_df)
plt.title('Survival Rate by Sex')
# 'Sex' is plotted straight from train_df and is not numerically encoded in
# the cells shown here, so the tick labels are the raw category values.
# A "0: Male, 1: Female" legend would not match them; use a plain label.
plt.xlabel('Sex')
plt.ylabel('Survival Rate')
plt.show()


In [None]:
# Bar chart of the mean of 'Survived' for each 'Pclass' value in the train frame.
pclass_ax = sns.barplot(data=train_df, x='Pclass', y='Survived')
pclass_ax.set(
    title='Survival Rate by Pclass',
    xlabel='Pclass',
    ylabel='Survival Rate',
)
plt.show()


In [None]:
# Print a heading followed by the dtype of every column in the train frame.
print("Data types of columns:", train_df.dtypes, sep="\n")

In [None]:
# Columns excluded from the correlation analysis; any that are absent from
# the frame are skipped rather than raising (errors='ignore').
non_numeric_cols = ['Name', 'Ticket', 'Cabin', 'PassengerId']
train_df_cleaned = train_df.drop(columns=non_numeric_cols, errors='ignore')

# Encode 'Embarked' as integer codes when the column is present.
# Values outside the mapping become NaN (behavior of Series.map with a dict).
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
if 'Embarked' in train_df_cleaned.columns:
    train_df_cleaned['Embarked'] = train_df_cleaned['Embarked'].map(embarked_codes)

# Keep only numeric columns; any remaining non-numeric column is left out
# of the correlation matrix.
numeric_cols = train_df_cleaned.select_dtypes(include=['number']).columns

# Pairwise correlation between the numeric columns.
numeric_view = train_df_cleaned[numeric_cols]
corr = numeric_view.corr()

# Draw the annotated heatmap on a 10x8 inch figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Scatter of 'Age' against 'Fare', with point colour taken from 'Survived'.
scatter_ax = sns.scatterplot(
    data=train_df,
    x='Age',
    y='Fare',
    hue='Survived',
    palette='coolwarm',
)
scatter_ax.set(
    title='Age vs. Fare by Survival Status',
    xlabel='Age',
    ylabel='Fare',
)
plt.show()
