In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [None]:
# Load the dataset
df = pd.read_csv("tested.csv")  # Adjust the filename if necessary

# Display dataset information.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.head())


In [None]:
# Check missing values
print(df.isnull().sum())

# Fill missing age values with median age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df["Age"] selection: that chained in-place form
# is deprecated in pandas and leaves df untouched under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())

# Fill missing embarked values with the most common port
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# Fill missing fare values with the median fare. Fare is scaled and used as a
# model feature further down, so gaps must not survive this cell.
# This is a no-op when the column has no missing values.
df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# Drop the 'Cabin' column due to too many missing values.
# errors="ignore" keeps the cell re-runnable after the column is already gone.
df = df.drop(columns=["Cabin"], errors="ignore")


In [None]:
# Integer-encode the categorical columns, keeping one fitted encoder per column
# so the same mapping can be applied to new inputs later.
# Note: running this cell again on the already-encoded frame would refit the
# encoders on integer codes instead of the original labels.
categorical_cols = ["Sex", "Embarked"]
label_encoders = {column: LabelEncoder() for column in categorical_cols}

for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])


In [None]:
# Standardize the fare feature; the fitted scaler is kept for inference.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split, so test rows contribute to the learned mean/std - confirm this is
# acceptable for the evaluation below.
fare_frame = df[["Fare"]]
scaler = StandardScaler()
scaler.fit(fare_frame)
df["Fare"] = scaler.transform(fare_frame)


In [None]:
# Extract title from names: the text between the first comma and the next period,
# e.g. "Surname, Mr. Given Names" -> "Mr".
df["Title"] = df["Name"].apply(lambda name: name.split(",")[1].split(".")[0].strip())

# Group rare titles
rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"]
df["Title"] = df["Title"].replace(rare_titles, "Rare")

# Encode titles.
# The fitted encoder is stored next to the Sex/Embarked encoders instead of
# being thrown away, so the title <-> integer code mapping stays available for
# inference and is persisted together with the other encoders.
title_encoder = LabelEncoder()
df["Title"] = title_encoder.fit_transform(df["Title"])
label_encoders["Title"] = title_encoder

# Family size: siblings/spouses + parents/children + the passenger themselves
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1


In [None]:
# Separate the target from the features; identifier-like columns are excluded
# from the feature matrix.
target_col = "Survived"
non_feature_cols = ["Name", "Ticket", "PassengerId"]

y = df[target_col]  # Target variable
X = df.drop(columns=[target_col] + non_feature_cols)  # Features

# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Random forest baseline: 100 trees, fixed seed for reproducibility
rf_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)


In [None]:
# Gradient-boosted trees with a binary logistic objective: 100 rounds, fixed seed
xgb_params = {"objective": "binary:logistic", "n_estimators": 100, "random_state": 42}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)


In [None]:
# Predict on the held-out test set with both fitted models
y_pred_rf, y_pred_xgb = (
    fitted.predict(X_test) for fitted in (rf_model, xgb_model)
)

# Evaluation Metrics
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy, classification report and confusion matrix for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        model_name: Name shown in the printed header.

    Returns:
        None. All results are written to stdout.
    """
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)

    print(f"\n🔹 {model_name} Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", matrix)

# Report test-set metrics for each model, random forest first
for name, predictions in (("Random Forest", y_pred_rf), ("XGBoost", y_pred_xgb)):
    evaluate_model(y_test, predictions, name)


In [None]:
# Rank the random forest's feature importances and draw them as a bar chart
importance_by_feature = pd.Series(rf_model.feature_importances_, index=X.columns)
ranked_importance = importance_by_feature.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
ranked_importance.plot(kind="bar", ax=ax)
ax.set_title("Feature Importance in Titanic Survival Prediction")
plt.show()


In [None]:
def _as_estimator_input(estimator, rows):
    """Shape `rows` the way `estimator` was fitted: named columns if it recorded any, else an array."""
    feature_names = getattr(estimator, "feature_names_in_", None)
    if feature_names is None:
        return np.array(rows)
    return pd.DataFrame(rows, columns=list(feature_names))


def predict_survival(model, pclass, sex, age, sibsp, parch, fare, embarked, title, family_size):
    """Predict the survival status of a single passenger.

    Categorical inputs are encoded with the module-level `label_encoders` and
    the fare is standardized with the module-level `scaler`, so the row gets
    the same preprocessing as the training frame.

    Args:
        model: Fitted classifier exposing `predict`.
        pclass: Passenger class.
        sex: Raw sex label, encoded with `label_encoders["Sex"]`.
        age: Passenger age.
        sibsp: Number of siblings/spouses aboard.
        parch: Number of parents/children aboard.
        fare: Unscaled fare.
        embarked: Raw port label, encoded with `label_encoders["Embarked"]`.
        title: Either the integer title code (passed through unchanged) or a
            title string, which is encoded with `label_encoders["Title"]`.
        family_size: Family size feature.

    Returns:
        "Survived" if the model predicts class 1, otherwise "Not Survived".

    Raises:
        ValueError: If `title` is a string but no "Title" encoder is stored.
            The encoders themselves raise for labels they were not fitted on.
    """
    # Convert categorical inputs using the stored label encoders
    sex_encoded = label_encoders["Sex"].transform([sex])[0]
    embarked_encoded = label_encoders["Embarked"].transform([embarked])[0]

    # Integer title codes keep working as before; strings need a stored encoder.
    if isinstance(title, str):
        title_encoder = label_encoders.get("Title")
        if title_encoder is None:
            raise ValueError(
                "title was given as a string but no 'Title' encoder is stored in "
                "label_encoders; pass the integer title code instead"
            )
        title = title_encoder.transform([title])[0]

    # Standardize fare. The input mirrors how the scaler was fitted so that
    # estimators fitted on named columns do not warn about missing feature names.
    fare_scaled = np.ravel(scaler.transform(_as_estimator_input(scaler, [[fare]])))[0]

    # Build the single-row input in the training column order; attaching the
    # model's own feature names lets it validate the layout.
    row = [[pclass, sex_encoded, age, sibsp, parch, fare_scaled, embarked_encoded, title, family_size]]
    input_data = _as_estimator_input(model, row)

    # Make prediction
    predicted_survival = model.predict(input_data)[0]
    return "Survived" if predicted_survival == 1 else "Not Survived"

# Example: one passenger scored with the random forest
# (title is passed as its integer code)
predicted_status = predict_survival(
    rf_model,
    pclass=3,
    sex="male",
    age=25,
    sibsp=0,
    parch=0,
    fare=50,
    embarked="S",
    title=2,
    family_size=1,
)
print(f"\nPredicted Survival Status: {predicted_status}")


In [None]:
# Persist the fitted models and preprocessing objects, one joblib file each
artifacts = {
    "random_forest_model.pkl": rf_model,
    "xgboost_model.pkl": xgb_model,
    "label_encoders.pkl": label_encoders,
    "scaler.pkl": scaler,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
