In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
# Silences every warning category for the rest of the session, including
# library deprecation and convergence warnings; narrow this filter if those
# diagnostics matter.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw passenger table that the exploration and training cells work on.
df = pd.read_csv(filepath_or_buffer="../data/Titanic-Dataset.csv")

In [None]:
def wrangle(filepath):
    """Read a passenger CSV and return a numeric, model-ready feature frame.

    Steps, in order:
      * drop the ``Survived`` target if the file contains it (features only);
      * encode ``Sex`` as a single 0/1 column named ``gender`` placed first;
      * drop ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``;
      * fill missing ``Age`` with the file's median age, then cast to int
        (fractional ages are truncated);
      * fill missing ``Fare`` with the file's mean fare;
      * one-hot encode ``Embarked`` with the first category dropped.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The transformed feature frame.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    """
    df = pd.read_csv(filepath)

    # Keep features only so the same function serves labelled and unlabelled files.
    if "Survived" in df.columns:
        df = df.drop(columns="Survived")

    # drop_first=True leaves a single indicator column (renamed to "gender" below).
    sda = pd.get_dummies(df["Sex"], drop_first=True).astype("int64")
    df = pd.concat([sda, df], axis=1)

    df = df.drop(columns="Sex").rename(columns={"male": "gender"})
    df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

    # Assign the result back: `df["Age"].fillna(..., inplace=True)` acts on a
    # temporary under pandas Copy-on-Write and would leave the NaNs in place,
    # making the int cast fail.
    df["Age"] = df["Age"].fillna(df["Age"].median()).astype(int)

    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

    # NOTE(review): dummy columns depend on the categories present in this file;
    # callers should align columns with the training frame before predicting.
    df = pd.get_dummies(df, columns=["Embarked"], drop_first=True, dtype=int)

    return df


In [None]:
def split_data(df, target_column):
    """Separate a frame into its feature columns and its target column.

    Prints the shapes of both pieces, then returns ``(features, target)``
    where ``features`` is ``df`` without ``target_column`` and ``target`` is
    ``df[target_column]``.
    """
    target = df[target_column]
    features = df.drop(columns=target_column)
    print(features.shape, target.shape)
    return features, target

In [None]:
def splitting_train_test(x, y, test_size=0.2, random_state=42):
    """Shuffle and split features and target into train and test partitions.

    Parameters
    ----------
    x, y : array-like
        Features and target with the same number of rows.
    test_size : float or int, default 0.2
        Passed straight to ``train_test_split``.
    random_state : int or None, default 42
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, shuffle=True
    )
    return x_train, x_test, y_train, y_test

In [None]:
# Display the loaded frame as this cell's output.
df

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# How many rows fall in each value of the target column.
df["Survived"].value_counts()

In [None]:
# Bar chart of the target's class counts, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=df, x="Survived", ax=ax)

In [None]:
# Summary (count, unique, top, freq) of the object-dtype columns.
df.describe(include="object")

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

### Data transformations

In [None]:
# Turn Sex into integer indicator column(s); drop_first=True removes the first
# category so the remaining column is not redundant.
sda = pd.get_dummies(df["Sex"], drop_first=True).astype("int64")
sda

In [None]:
# Put the indicator column(s) in front of the existing columns.
df = pd.concat(objs=[sda, df], axis="columns")

In [None]:
# Remove the original text column and rename the "male" indicator to "gender".
# Chained reassignment instead of inplace=True keeps the step a single
# expression and avoids mutating the displayed frame in place.
df = df.drop(columns="Sex").rename(columns={"male": "gender"})

In [None]:
# Identifier-like columns are dropped before modelling; reassign rather than
# mutate with inplace=True.
df = df.drop(columns=["PassengerId", "Name", "Ticket"])

### Data cleaning

In [None]:
# Distribution of Age before imputation.
fig, ax = plt.subplots()
sns.histplot(x="Age", data=df, ax=ax)
plt.show()

In [None]:
# Impute missing ages with the median. Assign the result back to the column:
# `df["Age"].fillna(..., inplace=True)` is a chained in-place call that pandas
# deprecates and that has no effect on `df` under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())

In [None]:
# Cast Age to integers; any fractional ages are truncated toward zero.
df["Age"]= df["Age"].astype(int)

In [None]:
# Density estimate of Age after imputation and integer cast.
fig, ax = plt.subplots()
sns.kdeplot(x="Age", data=df, ax=ax)
plt.show()

In [None]:
# Cabin is removed from the feature set; reassign rather than mutate with
# inplace=True.
df = df.drop(columns="Cabin")

In [None]:
# Visual check of which cells are still missing.
ax = sns.heatmap(df.isna())
ax.set_title('Check Missing data')

In [None]:
# Impute missing fares with the column mean, mirroring what wrangle() does for
# the prediction data. Age was already imputed and cast to int in the cells
# above, so filling Age again here was a no-op.
df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

In [None]:
# Replace Embarked with integer indicator columns. The first category is
# dropped (drop_first=True) to avoid the dummy-variable trap.
df = pd.get_dummies(data=df, columns=["Embarked"], dtype=int, drop_first=True)
df.head()

### Split features & target

In [None]:
# Separate the "Survived" target from the features, then partition both into
# train and test sets. X is reused later to align the prediction columns.
X, y = split_data(df, "Survived")
x_train, x_test, y_train, y_test = splitting_train_test(X, y)

In [None]:
# Baseline classifier. The features are unscaled, so allow more than the default
# 100 solver iterations; a ConvergenceWarning would otherwise go unnoticed
# because all warnings are filtered in the import cell.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)

In [None]:
# Test-set accuracy in percent. Arguments follow the (y_true, y_pred) order of
# accuracy_score, matching the other evaluation cells.
accuracy = accuracy_score(y_test, y_pred) * 100
accuracy

In [None]:
# Training-set accuracy in percent, for comparison with the test score.
train_predictions = log_reg.predict(x_train)
accuracy_train = accuracy_score(y_train, train_predictions) * 100
accuracy_train

In [None]:
# Random forest baseline with a fixed seed so the score is reproducible.
# (RandomForestClassifier is imported in the notebook's import cell.)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)
rf_pred = rf_model.predict(x_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_pred) * 100:.2f}%")

In [None]:
# Gradient boosting baseline with a fixed seed so the score is reproducible.
# (GradientBoostingClassifier is imported in the notebook's import cell.)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)
gb_model.fit(x_train, y_train)
gb_pred = gb_model.predict(x_test)
print(f"Gradient Boost Accuracy: {accuracy_score(y_test, gb_pred) * 100:.2f}%")

In [None]:
# Hyper-parameter grid for the random forest (3 * 4 * 3 = 36 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# 5-fold cross-validated search scored on accuracy.
# (GridSearchCV is imported in the notebook's import cell.)
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

# Search on the training partition only, so the test set stays unseen.
grid_search.fit(x_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best combination.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_ * 100:.2f}%")

In [None]:
# Score the estimator selected by the grid search on the held-out test set.
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(x_test)
tuned_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Test Set Accuracy with Tuned Model: {tuned_accuracy * 100:.2f}%")

In [None]:
# Rank the tuned forest's feature importances, largest first.
feature_scores = (
    pd.Series(best_rf_model.feature_importances_, index=x_train.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart of the ranking, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_scores, y=feature_scores.index, ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
plt.show()

In [None]:
# wrangle() requires the CSV path; calling it with no argument raises TypeError.
# Note that wrangle() drops "Survived", so this frame holds features only.
df = wrangle("../data/Titanic-Dataset.csv")

In [None]:
# Missing-value count per column of the frame returned by wrangle().
df.isna().sum()

In [None]:
# Display the wrangled frame.
df

In [None]:
# Build the prediction features and force them into the training column layout:
# columns absent from this file are created and filled with 0, extras are dropped.
X_test_kaggle = wrangle("../data/tested.csv").reindex(columns=X.columns, fill_value=0)

preds = best_rf_model.predict(X_test_kaggle)

In [None]:
# Predicted labels for the rows of X_test_kaggle.
preds

In [None]:
# Proportion of predictions falling in each class.
pd.Series(preds).value_counts(normalize=True)
