In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the training and test splits from the working directory in one pass.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Preview the first rows of the training split.
train_df.head()


In [None]:
# Count the missing entries in every column of the training split.
train_df.isna().sum()


In [None]:
# Impute missing values in the training split.
#
# The filled Series is assigned back to the column instead of calling
# `train_df[col].fillna(..., inplace=True)`. That chained form acts on an
# intermediate Series: it is deprecated in pandas 2.x and, with Copy-on-Write
# (the default from pandas 3.0), it never updates `train_df` at all.

# Age - fill with median
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

# Embarked - fill with most common
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Drop Cabin (too many missing values).
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train_df = train_df.drop(columns='Cabin', errors='ignore')

# Verify
train_df.isnull().sum()


In [None]:
# Two count plots, each on its own Axes. Without an explicit `ax`, both
# seaborn calls draw on the same current Axes, so the second chart is painted
# over the first and its title replaces the first title.
fig, (ax_overall, ax_by_sex) = plt.subplots(1, 2, figsize=(12, 4))

# Overall survival counts
sns.countplot(x='Survived', data=train_df, ax=ax_overall)
ax_overall.set_title("Survived Count (0 = No, 1 = Yes)")

# Survival counts split by the Sex column
sns.countplot(x='Survived', hue='Sex', data=train_df, ax=ax_by_sex)
ax_by_sex.set_title("Survival by Gender")

fig.tight_layout()
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Make a copy first, so `train_df` itself is not modified by the encoding below
df = train_df.copy()

# One encoder per column. A LabelEncoder only remembers the classes from its
# most recent fit, so sharing a single instance across columns throws away the
# Sex mapping as soon as Embarked is fitted.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()

# LabelEncoder assigns codes in sorted class order.
df['Sex'] = sex_encoder.fit_transform(df['Sex'])                 # male=1, female=0
df['Embarked'] = embarked_encoder.fit_transform(df['Embarked'])  # S=2, C=0, Q=1
df['Pclass'] = df['Pclass'].astype(int)

# `le` stays bound to the Embarked encoder, which is the state the previous
# shared-encoder code left it in, so any existing reference to `le` sees the
# same fitted object as before.
le = embarked_encoder

# Drop unused columns
df = df.drop(columns=['Name', 'Ticket', 'PassengerId'])


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Single seed shared by the split and the forest so that a fresh
# Restart & Run All reproduces the same accuracy and confusion matrix.
RANDOM_STATE = 42

# Split data: every column except the target is a feature
X = df.drop("Survived", axis=1)
y = df["Survived"]

# Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train model (seeded; an unseeded forest gives a different score on each run)
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_val)

# Accuracy
print("Accuracy:", accuracy_score(y_val, y_pred))

# Confusion matrix: rows are the true labels, columns the predicted labels.
# fmt="d" prints the counts as plain integers; the default ".2g" format shows
# any count of 100 or more in scientific notation (e.g. 1e+02).
ax = sns.heatmap(confusion_matrix(y_val, y_pred), annot=True, fmt="d")
ax.set(xlabel="Predicted", ylabel="Actual", title="Validation confusion matrix")
plt.show()


In [None]:
# Preprocess test data with the statistics and category codes of the training data.
# The work is done on a derived frame (`test_clean`) rather than by mutating
# `test_df` in place, so `test_df` stays raw and this cell can be re-run.
# Chained `test_df[col].fillna(..., inplace=True)` is also avoided: it is
# deprecated in pandas 2.x and has no effect under Copy-on-Write.
embarked_fill = train_df['Embarked'].mode()[0]
test_clean = (
    test_df
    .drop(columns=['Cabin'], errors='ignore')
    .assign(
        Age=lambda frame: frame['Age'].fillna(train_df['Age'].median()),
        Fare=lambda frame: frame['Fare'].fillna(train_df['Fare'].median()),
        Embarked=lambda frame: frame['Embarked'].fillna(embarked_fill),
    )
)

# Encode each categorical column with its own encoder, fitted on the matching
# training column. The shared `le` cannot be used for this: it was last fitted
# on Embarked, so `le.transform(test_df['Sex'])` raises ValueError on labels it
# has never seen. LabelEncoder assigns codes in sorted class order, so fitting
# on the same training values reproduces the codes the model was trained on.
for column in ('Sex', 'Embarked'):
    train_values = train_df[column].fillna(train_df[column].mode()[0])
    column_encoder = LabelEncoder().fit(train_values)
    test_clean[column] = column_encoder.transform(test_clean[column])

# Select exactly the model's features, in the order used for training.
# This also leaves out Name, Ticket and PassengerId.
X_test = test_clean[list(X_train.columns)]
assert not X_test.isnull().any().any(), "test features still contain missing values"

# Predict
test_pred = model.predict(X_test)

# Submission CSV: one row per test passenger with the predicted label
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_pred
})

submission.to_csv('submission.csv', index=False)
print("submission.csv ready!")
