In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, chisquare


# Load the Titanic passenger data. Without the file nothing below can run,
# so report the problem and stop the script.
try:
    titanic_df = pd.read_csv('/content/Titanic-Dataset.csv')
except FileNotFoundError:
    print("Error: The file 'Titanic-Dataset.csv' was not found at '/content/'. Please upload the file or provide the correct path.")

    # `exit()` is a helper injected by the `site` module for interactive
    # shells and terminates with a success status. Raising SystemExit works
    # in every interpreter and signals the failure with a non-zero code.
    raise SystemExit(1)



# Keep only passengers with both Age and Embarked recorded; every analysis
# below works on this filtered frame.
titanic_df = titanic_df.dropna(subset=['Age', 'Embarked'])


# Simple random sampling: draw up to 100 ages without replacement.
# The size is capped at the number of available rows because asking for more
# rows than exist (without replacement) raises ValueError. With no rows at
# all the sample is empty and its mean is NaN, which the report handles.
srs_size = min(100, len(titanic_df))
simple_random_sample = titanic_df['Age'].sample(n=srs_size, random_state=42)
mean_srs = simple_random_sample.mean()


# Stratified sampling: split ages by the Survived value and draw the same
# number of ages from every stratum.
strata = titanic_df.groupby('Survived')['Age']

# Per-stratum sample size is at most 50, limited by the smallest stratum so
# that sampling without replacement cannot fail. Group sizes are computed once.
stratum_sizes = strata.size()
min_samples_per_stratum = stratum_sizes.min() if not stratum_sizes.empty else 0
sample_size_stratified = min(50, min_samples_per_stratum)

if sample_size_stratified > 0:
    # Each stratum is sampled with the same seed so the result is reproducible.
    stratified_sample = strata.apply(lambda x: x.sample(n=sample_size_stratified, random_state=42))
    mean_stratified = stratified_sample.mean()
else:
    # Explicit dtype: an empty Series built without one triggers a pandas
    # deprecation warning on 1.x and silently becomes object dtype on 2.x.
    stratified_sample = pd.Series(dtype='float64')
    mean_stratified = np.nan



# Cluster sampling: treat every distinct Pclass value as a cluster, pick two
# of them at random (seeded through NumPy's global generator) and keep all
# passengers that belong to the picked clusters.
np.random.seed(42)
clusters = titanic_df['Pclass'].unique()
chosen_clusters = np.random.choice(clusters, size=2, replace=False)

# Boolean mask of the rows whose class was picked.
in_chosen_cluster = titanic_df['Pclass'].isin(chosen_clusters)
cluster_sample = titanic_df.loc[in_chosen_cluster]
mean_cluster = cluster_sample.loc[:, 'Age'].mean()




# Bootstrap estimate of a 95% confidence interval for the mean age:
# resample 100 ages with replacement 1000 times and take the 2.5th and
# 97.5th percentiles of the resample means.
bootstrap_means = []


if not titanic_df.empty:
    # Seed ONE generator before the loop and let every draw advance it.
    # Passing the integer seed on each iteration would recreate the same
    # generator every time, making all 1000 resamples identical and
    # collapsing the interval to a single point.
    bootstrap_rng = np.random.RandomState(42)
    for _ in range(1000):
        bootstrap_sample = titanic_df['Age'].sample(n=100, replace=True, random_state=bootstrap_rng)
        bootstrap_means.append(bootstrap_sample.mean())

    lower_bound = np.percentile(bootstrap_means, 2.5)
    upper_bound = np.percentile(bootstrap_means, 97.5)
else:
    # No data: the report prints a "Not calculated" message for NaN bounds.
    lower_bound = np.nan
    upper_bound = np.nan



# Chi-square test of independence between passenger class and survival.
# Rows of the table are Pclass values, columns are Survived values.
contingency_table = pd.crosstab(
    index=titanic_df['Pclass'],
    columns=titanic_df['Survived'],
)
chi2_stat, p_val, dof, expected = chi2_contingency(observed=contingency_table)


# Chi-square goodness-of-fit test: compare the observed number of passengers
# per embarkation port against a hypothesised 50% / 30% / 20% split over
# the ports S, C and Q.
hypothetical_proportions = {'S': 0.5, 'C': 0.3, 'Q': 0.2}
port_order = ('S', 'C', 'Q')

# Observed counts in the fixed port order; a port that never occurs counts 0.
embarked_counts = titanic_df['Embarked'].value_counts()
f_obs = np.array([embarked_counts.get(port, 0) for port in port_order])

# Expected counts: the hypothesised share of the observed total, same order.
total_observed = f_obs.sum()
f_exp = np.array([total_observed * hypothetical_proportions.get(port, 0) for port in port_order])

# The test is only run when there is data and the expected counts add up to
# the observed total.
can_run_gof = total_observed > 0 and np.isclose(f_exp.sum(), total_observed)
if not can_run_gof:
    print("Warning: Cannot perform Chi-square GOF test. Check observed and expected frequencies.")
    chi2_stat_gof = np.nan
    p_val_gof = np.nan
else:
    chi2_stat_gof, p_val_gof = chisquare(f_obs, f_exp=f_exp)



# Report the sampling estimates. Each value falls back to an explanatory
# message when it could not be computed (i.e. it is NaN).
if pd.isna(mean_srs):
    print("Simple Random Sampling Mean Age: Not calculated (no data)")
else:
    print(f"Simple Random Sampling Mean Age: {mean_srs:.2f}")

if pd.isna(mean_stratified):
    print("Stratified Sampling Mean Age: Not calculated (not enough samples in strata)")
else:
    print(f"Stratified Sampling Mean Age: {mean_stratified:.2f}")

if pd.isna(mean_cluster):
    print("Cluster Sampling Mean Age: Not calculated (not enough samples in clusters)")
else:
    print(f"Cluster Sampling Mean Age: {mean_cluster:.2f}")

if pd.isna(lower_bound):
    print("95% Confidence Interval for Mean Age (Bootstrapping): Not calculated (no data)")
else:
    print(f"95% Confidence Interval for Mean Age (Bootstrapping): ({lower_bound:.2f}, {upper_bound:.2f})")


# Report the chi-square test of independence.
print("\nChi-square Test of Independence (Pclass vs Survived):")
print(f"Chi2 Stat: {chi2_stat:.2f}, p-value: {p_val:.4f}")

# Report the goodness-of-fit test, or note that it was skipped.
print("\nChi-square Goodness-of-Fit Test (Embarked Distribution):")
if pd.isna(chi2_stat_gof):
    print("Chi-square Goodness-of-Fit Test (Embarked Distribution): Not calculated (invalid frequencies)")
else:
    print(f"Chi2 Stat: {chi2_stat_gof:.2f}, p-value: {p_val_gof:.4f}")

Simple Random Sampling Mean Age: 29.70
Stratified Sampling Mean Age: 28.03
Cluster Sampling Mean Age: 29.57
95% Confidence Interval for Mean Age (Bootstrapping): (30.00, 30.00)

Chi-square Test of Independence (Pclass vs Survived):
Chi2 Stat: 91.08, p-value: 0.0000

Chi-square Goodness-of-Fit Test (Embarked Distribution):
Chi2 Stat: 234.75, p-value: 0.0000
