In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training data from the CSV in the current working directory
df = pd.read_csv("train.csv")

# Peek at the first rows to confirm the columns parsed as expected
first_rows = df.head()
print(first_rows)


In [None]:
# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()


In [None]:
# Summary statistics for the frame's columns, printed as plain text
summary_stats = df.describe()
print(summary_stats)


In [None]:
# Overall balance of the target column.
# Titled like the other plot cells so the figure stands alone when skimmed.
ax = sns.countplot(x='Survived', data=df)
ax.set_title("Survival Count")
plt.show()

In [None]:
# Survival counts, with one bar per value of Sex within each outcome
ax = sns.countplot(data=df, x='Survived', hue='Sex')
ax.set_title("Survival by Sex")
plt.show()


In [None]:
# Survival counts, with one bar per passenger class within each outcome
ax = sns.countplot(data=df, x='Survived', hue='Pclass')
ax.set_title("Survival by Passenger Class")
plt.show()


In [None]:
# Ages of passengers with Survived == 1, ignoring rows where Age is missing
survivor_ages = df.loc[df['Survived'] == 1, 'Age'].dropna()

# kde=True overlays a kernel density estimate on the 30-bin histogram
ax = sns.histplot(survivor_ages, bins=30, color='green', kde=True)
ax.set_title("Age Distribution of Survivors")
plt.show()

In [None]:

# Fill missing values by assigning the result back to the column.
# df['col'].fillna(..., inplace=True) is chained assignment: it emits a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
# Since mode() returns a Series, [0] selects the first (and usually only) most frequent value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Encode Sex as 0/1 only while it is still text; mapping the already-encoded
# integers a second time (cell re-run) would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})


In [None]:
# One-hot encode the categorical `Embarked` column; drop_first=True drops the
# first category's indicator so the remaining columns are not redundant.
# Guarded so that re-running this cell after `Embarked` has already been
# replaced by its dummy columns does not raise a KeyError.
if 'Embarked' in df.columns:
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

## Understanding the one-hot encoding step

```python
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)
```

This line is doing **one-hot encoding** of the categorical column `Embarked`. Here’s what each part means:

---

### **1️⃣ `pd.get_dummies(df, columns=['Embarked'])`**

* `pd.get_dummies()` converts **categorical variables** into **numeric dummy/indicator variables** (0 or 1; pandas ≥ 2.0 stores them as `False`/`True` booleans unless `dtype=int` is passed), because most ML models in Scikit-Learn **cannot work with strings**.
* Example: Suppose `Embarked` has values: `['C', 'Q', 'S']`.
  After `get_dummies`, it becomes three new columns:

| Embarked_C | Embarked_Q | Embarked_S |
| ---------- | ---------- | ---------- |
| 1          | 0          | 0          |
| 0          | 1          | 0          |
| 0          | 0          | 1          |

---

### **2️⃣ `drop_first=True`**

* This **drops the first dummy column** to avoid **multicollinearity**.
* Multicollinearity is when one column can be perfectly predicted from the others — it can confuse models like **linear regression**.
* After `drop_first=True`, the first column (`Embarked_C`) is dropped, and the table becomes:

| Embarked_Q | Embarked_S |
| ---------- | ---------- |
| 0          | 0          |
| 1          | 0          |
| 0          | 1          |

* **Interpretation:**

  * If both `Embarked_Q` and `Embarked_S` are 0 → it must be `Embarked_C`.
  * This keeps the information intact but avoids redundant columns.

---

### **3️⃣ `df =`**

* This **assigns the new DataFrame** with the encoded columns back to `df`.

---

✅ **Summary:**
This line converts the categorical `Embarked` column into numeric dummy variables for ML, while avoiding redundancy by dropping the first category.

---

The next cell previews one row of the encoded DataFrame to confirm that the new `Embarked_Q` and `Embarked_S` columns are present.

In [None]:
# Show a single row to check the columns produced by the encoding steps
df.head(n=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features & target: the model is fit on these numeric columns and predicts `Survived`
feature_cols = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked_Q','Embarked_S']
X = df[feature_cols]
y = df['Survived']

# Train/test split: hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# confusion_matrix puts true labels on rows and predicted labels on columns;
# label the axes so the heatmap can be read without knowing that convention.
ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Pair each importance score from the fitted model with its feature column name
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)

# Sort ascending so the most important feature ends up as the top bar
ax = feat_importances.sort_values().plot.barh()
ax.set_title("Feature Importance")
plt.show()
