In [2]:
import pandas as pd
import re
def clean_and_engineer_features(data):
    """Return a cleaned, feature-engineered copy of a passenger DataFrame.

    The input frame is not modified; all work happens on a copy.

    Steps performed:
      * Missing "Age" and "Fare" values are filled with the column median;
        missing "Embarked" values are filled with the most frequent value.
      * "Cabin" is dropped if present.
      * "Sex" is mapped to integers ("male" -> 1, "female" -> 0); any other
        value becomes NaN.
      * "Embarked" is one-hot encoded with the first level dropped.
      * "FamilySize" is added as SibSp + Parch + 1.
      * "Title" is taken from "Name" (the first run of letters that is
        preceded by a space and followed by a period); anything other than
        Mr / Miss / Mrs / Master, including a missing name, becomes "Other".
      * "Name", "Ticket" and "PassengerId" are dropped if present.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Age", "Embarked", "Fare", "Sex", "SibSp",
        "Parch" and "Name".

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    data = data.copy()

    # Impute missing values: median for numeric columns, mode for the port.
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Embarked"] = data["Embarked"].fillna(data["Embarked"].mode()[0])
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())

    data = data.drop(columns=["Cabin"], errors='ignore')

    data["Sex"] = data["Sex"].map({"male": 1, "female": 0})
    data = pd.get_dummies(data, columns=["Embarked"], drop_first=True)

    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1

    # Vectorised title extraction: one regex pass per row, and a missing name
    # yields NaN here (later folded into "Other") instead of raising.
    titles = data["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    common_titles = ["Mr", "Miss", "Mrs", "Master"]
    # Keep the common titles; rare titles, non-matches and NaN become "Other".
    data["Title"] = titles.where(titles.isin(common_titles), "Other")

    data = data.drop(columns=["Name", "Ticket", "PassengerId"], errors='ignore')
    return data


In [3]:
import scipy.stats as stats
# Load the passenger CSV and pass it straight through the cleaning function
# defined earlier in the notebook; "Sex" is numeric afterwards
# (male -> 1, female -> 0).
titanic_file_path = "titanic_dataset.csv"
df = pd.read_csv(titanic_file_path).pipe(clean_and_engineer_features)

# Significance threshold used for the decision printed at the end.
alpha = 0.05

# Chi-square test of independence on the Sex x Survived contingency table.
table = pd.crosstab(df["Sex"], df["Survived"])
chi2, p_value, dof, expected = stats.chi2_contingency(table)

print("Chi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2:.4f}")
print(f"P-value: {p_value:.4f}")
# The association is reported as significant when the p-value is below alpha.
print(
    "Conclusion: Gender is a statistically significant feature."
    if p_value < alpha
    else "Conclusion: Gender is NOT a statistically significant feature."
)


Chi-Square Test Results:
Chi-Square Statistic: 413.6897
P-value: 0.0000
Conclusion: Gender is a statistically significant feature.
