## Titanic Statistics and Hypothesis Analysis

In [1]:
import pandas as pd # importing pandas library 
import numpy as np  # importing numpy library
from scipy import stats

# Loading the  dataset ( train.xlxs is the titanic dataset)
df = pd.read_excel("train.xlsx")

# Keeping only the columns we need for analysis
cols = ["Survived", "Pclass", "Sex", "Age"]
df = df[cols].copy()

# checking the length and "survival" coloumn to be unique
n = len(df)
assert set(df["Survived"].dropna().unique()) <= {0, 1}, "Survived must be binary 0/1"
print(f"Total passengers (rows): {n}")

Total passengers (rows): 891


In [2]:
print(df.info()) # getting information  of the dataset
print(df.isnull().sum()) # checking for null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 28.0+ KB
None
Survived      0
Pclass        0
Sex           0
Age         177
dtype: int64


In [3]:
# creating a probability function
def prob(series_bool):
    return series_bool.mean()

# Marginal probabilities
p_survive = prob(df["Survived"] == 1)
p_not_survive = 1 - p_survive

p_male = prob(df["Sex"] == "male")
p_female = 1 - p_male

p_c1 = prob(df["Pclass"] == 1)
p_c2 = prob(df["Pclass"] == 2)
p_c3 = prob(df["Pclass"] == 3)

print("\n--- Marginal probabilities ---")
print(f"P(Survive=1) = {p_survive:.4f}")
print(f"P(Survive=0) = {p_not_survive:.4f}")
print(f"P(Male)      = {p_male:.4f}")
print(f"P(Female)    = {p_female:.4f}")
print(f"P(Pclass=1)  = {p_c1:.4f}")
print(f"P(Pclass=2)  = {p_c2:.4f}")
print(f"P(Pclass=3)  = {p_c3:.4f}")

# Conditional probabilities: survival by sex
p_survive_given_male = prob((df["Survived"] == 1) & (df["Sex"] == "male")) / p_male
p_survive_given_female = prob((df["Survived"] == 1) & (df["Sex"] == "female")) / p_female

# Conditional probabilities: survival by class
p_survive_given_c1 = prob((df["Survived"] == 1) & (df["Pclass"] == 1)) / p_c1
p_survive_given_c2 = prob((df["Survived"] == 1) & (df["Pclass"] == 2)) / p_c2
p_survive_given_c3 = prob((df["Survived"] == 1) & (df["Pclass"] == 3)) / p_c3

print("\n--- Conditional probabilities ---")
print(f"P(Survive=1 | Male)      = {p_survive_given_male:.4f}")
print(f"P(Survive=1 | Female)    = {p_survive_given_female:.4f}")
print(f"P(Survive=1 | Pclass=1)  = {p_survive_given_c1:.4f}")
print(f"P(Survive=1 | Pclass=2)  = {p_survive_given_c2:.4f}")
print(f"P(Survive=1 | Pclass=3)  = {p_survive_given_c3:.4f}")

# Joint probabilities
p_male_and_survive = prob((df["Sex"] == "male") & (df["Survived"] == 1))
p_female_and_survive = prob((df["Sex"] == "female") & (df["Survived"] == 1))
p_c1_and_survive = prob((df["Pclass"] == 1) & (df["Survived"] == 1))
p_c2_and_survive = prob((df["Pclass"] == 2) & (df["Survived"] == 1))
p_c3_and_survive = prob((df["Pclass"] == 3) & (df["Survived"] == 1))

print("\n--- Joint probabilities ---")
print(f"P(Male ∩ Survive=1)     = {p_male_and_survive:.4f}")
print(f"P(Female ∩ Survive=1)   = {p_female_and_survive:.4f}")
print(f"P(Pclass=1 ∩ Survive=1) = {p_c1_and_survive:.4f}")
print(f"P(Pclass=2 ∩ Survive=1) = {p_c2_and_survive:.4f}")
print(f"P(Pclass=3 ∩ Survive=1) = {p_c3_and_survive:.4f}")


--- Marginal probabilities ---
P(Survive=1) = 0.3838
P(Survive=0) = 0.6162
P(Male)      = 0.6476
P(Female)    = 0.3524
P(Pclass=1)  = 0.2424
P(Pclass=2)  = 0.2065
P(Pclass=3)  = 0.5511

--- Conditional probabilities ---
P(Survive=1 | Male)      = 0.1889
P(Survive=1 | Female)    = 0.7420
P(Survive=1 | Pclass=1)  = 0.6296
P(Survive=1 | Pclass=2)  = 0.4728
P(Survive=1 | Pclass=3)  = 0.2424

--- Joint probabilities ---
P(Male ∩ Survive=1)     = 0.1223
P(Female ∩ Survive=1)   = 0.2615
P(Pclass=1 ∩ Survive=1) = 0.1526
P(Pclass=2 ∩ Survive=1) = 0.0976
P(Pclass=3 ∩ Survive=1) = 0.1336


In [4]:
# Defining adult and child using Age; removing missing Age for these calculations
age_df = df.dropna(subset=["Age"]).copy()
age_df["is_adult"] = (age_df["Age"] >= 18)

# Conditional survival by adult/child
p_survive_given_adult = prob((age_df["Survived"] == 1) & (age_df["is_adult"])) / prob(age_df["is_adult"])
p_survive_given_child = prob((age_df["Survived"] == 1) & (~age_df["is_adult"])) / prob(~age_df["is_adult"])

print("\n--- Adult vs Child survival  ---")
print(f"P(Survive=1 | Adult) = {p_survive_given_adult:.4f}")
print(f"P(Survive=1 | Child) = {p_survive_given_child:.4f}")

# Independence check: Survived vs Pclass via comparison and chi-square
print("\n--- Independence check: Survived vs Pclass ---")
print(f"Overall P(Survive=1): {p_survive:.4f}")
print(f"P(Survive=1 | C1): {p_survive_given_c1:.4f}, |C2: {p_survive_given_c2:.4f}, |C3: {p_survive_given_c3:.4f}")
# If these differ materially from the overall rate, they are not independent.

# Chi-square test of independence between Survived and Pclass
ct_class = pd.crosstab(df["Survived"], df["Pclass"])
chi2, p_chi2, dof, expected = stats.chi2_contingency(ct_class)
print("\nChi-square test Survived vs Pclass")
print(ct_class)
print(f"chi2={chi2:.4f}, dof={dof}, p-value={p_chi2:.6g}")


--- Adult vs Child survival  ---
P(Survive=1 | Adult) = 0.3810
P(Survive=1 | Child) = 0.5398

--- Independence check: Survived vs Pclass ---
Overall P(Survive=1): 0.3838
P(Survive=1 | C1): 0.6296, |C2: 0.4728, |C3: 0.2424

Chi-square test Survived vs Pclass
Pclass      1   2    3
Survived              
0          80  97  372
1         136  87  119
chi2=102.8890, dof=2, p-value=4.54925e-23


In [5]:
# Contingency table: Survived by Sex
ct_sex = pd.crosstab(df["Sex"], df["Survived"])
chi2_sex, p_sex, dof_sex, exp_sex = stats.chi2_contingency(ct_sex)

print("\n--- Hypothesis 1: Male vs Female survival rate equality ---")
print(ct_sex)
print(f"Chi-square test: chi2={chi2_sex:.4f}, dof={dof_sex}, p-value={p_sex:.6g}")
if p_sex < 0.05:
    print("Conclusion: Reject H0 at α=0.05 — survival rate differs by sex.")
else:
    print("Conclusion: Fail to reject H0 at α=0.05 — no evidence of difference by sex.")


--- Hypothesis 1: Male vs Female survival rate equality ---
Survived    0    1
Sex               
female     81  233
male      468  109
Chi-square test: chi2=260.7170, dof=1, p-value=1.19736e-58
Conclusion: Reject H0 at α=0.05 — survival rate differs by sex.


In [6]:
# Build counts for proportion test (1st vs 3rd class)
c1 = df[df["Pclass"] == 1]
c3 = df[df["Pclass"] == 3]
x1 = c1["Survived"].sum()  # successes in group 1
n1 = len(c1)
x3 = c3["Survived"].sum()  # successes in group 3
n3 = len(c3)

# Two-proportion z (manually)
p1 = x1 / n1
p3 = x3 / n3
p_pool = (x1 + x3) / (n1 + n3)
se = np.sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n3)) # Standard error : Measures the variability in the difference between survivor rates.
z = (p1 - p3) / se  # measures how many standard errors p1 is above p3.
# One-sided p-value for H1: p1 > p3
p_one_sided = 1 - stats.norm.cdf(z) # the probability of getting a result at least as extreme as observed, assuming first-class survival is not greater.

print("\n--- Hypothesis 2: First class survival higher than third class ---")
print(f"Class 1: x={x1}, n={n1}, p̂1={p1:.4f} | Class 3: x={x3}, n={n3}, p̂3={p3:.4f}")
print(f"z={z:.4f}, one-sided p-value={p_one_sided:.6g}")
if p_one_sided < 0.05:
    print("Conclusion: Reject H0 at α=0.05 — First-class survival rate is higher than third-class.")
else:
    print("Conclusion: Fail to reject H0 at α=0.05 — insufficient evidence that first-class > third-class.")


--- Hypothesis 2: First class survival higher than third class ---
Class 1: x=136, n=216, p̂1=0.6296 | Class 3: x=119, n=491, p̂3=0.2424
z=9.8775, one-sided p-value=0
Conclusion: Reject H0 at α=0.05 — First-class survival rate is higher than third-class.


- Survival rate differs sharply by sex: females have a much higher P(\text{Survive}=1) than males.
- Survival rate differs by class: first > second > third. This also means survival and class are not independent.
- Children often have a higher survival rate than adults.
- Hypothesis tests will produce very small p-values, leading to:
- Rejecting “male survival equals female survival.”
- Concluding “first-class survival is higher than third-class.
