In [None]:
import pandas as pd

# Fetch the raw Titanic passenger table from its GitHub-hosted CSV
# (this needs network access).
titanic = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Location of the raw Titanic passenger CSV; reading it needs network access.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(filepath_or_buffer=url)

# Sanity check: report the table dimensions, then preview the first five rows.
print("Rows, Columns:", titanic.shape)
display(titanic.head(n=5))

def data_quality_report(df):
    """Summarize column quality and numeric distributions of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to profile. It is not modified.

    Returns
    -------
    report : pandas.DataFrame
        One row per column of ``df`` with ``dtype`` (as text), ``n_missing``,
        ``pct_missing`` (percentage, rounded to one decimal) and ``n_unique``
        (distinct non-missing values).
    numeric_stats : pandas.DataFrame
        One row per numeric column with count, mean, std, min, quartiles and
        max. Empty (but with those columns) when ``df`` has no numeric columns.
    """
    stat_cols = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

    missing_mask = df.isna()
    report = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "n_missing": missing_mask.sum(),
        # Scale to a percentage before rounding so the stored value is a clean
        # one-decimal number rather than e.g. 19.900000000000002.
        "pct_missing": (missing_mask.mean() * 100).round(1),
        "n_unique": df.nunique(dropna=True),
    })

    # Basic stats for numeric columns; describe() raises on a frame without
    # columns, so return an empty table in that case instead.
    numeric = df.select_dtypes(include=['number'])
    if numeric.shape[1] == 0:
        numeric_stats = pd.DataFrame(columns=stat_cols, dtype=float)
    else:
        numeric_stats = numeric.describe().T[stat_cols]
    return report, numeric_stats

# Profile the raw table: per-column quality metrics plus numeric statistics.
dq_report, numeric_stats = data_quality_report(titanic)
print("Data Quality Summary (top):")
display(dq_report)
print("Numeric summary:")
display(numeric_stats)


# Clean a copy so the raw `titanic` frame stays available for comparison.
df = titanic.copy()

# Trim stray whitespace from the column labels.
df.columns = [label.strip() for label in df.columns]


def _strip_if_str(value):
    """Return `value` without surrounding whitespace if it is a string, else unchanged."""
    if isinstance(value, str):
        return value.strip()
    return value


# Text columns: cast to object, pass missing entries through `where(..., None)`,
# then trim every string value.
for text_col in ['Name','Sex','Ticket','Cabin','Embarked']:
    as_object = df[text_col].astype(object)
    df[text_col] = as_object.where(df[text_col].notna(), None)
    df[text_col] = df[text_col].apply(_strip_if_str)

# PassengerId must identify each row uniquely; it is kept for traceability.
assert df['PassengerId'].is_unique

# Show how many values are missing in the columns that are filled in later.
print("Missing counts:\n", df[['Age','Cabin','Embarked']].isna().sum())

# Age is imputed (median per Title) further down, once Title has been derived.


# Titles that keep their own category.
_COMMON_TITLES = ('Mr', 'Mrs', 'Miss', 'Master')
# Infrequent honorifics pooled into the single 'Rare' category.
_RARE_TITLES = ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
                'Sir', 'Jonkheer', 'Dona')
# Right-inclusive age bin edges and the label of each bin.
_AGE_BINS = [-1, 5, 12, 18, 35, 60, 120]
_AGE_LABELS = ['Baby', 'Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']


def _extract_title(name):
    """Return the text between the first comma and the next period of `name`, or "None"."""
    if not isinstance(name, str):
        return "None"
    parts = name.split(',')
    if len(parts) < 2:
        return "None"
    title = parts[1].split('.')[0].strip()
    # A title written with a leading article (e.g. "the Countess") would never
    # match the 'Countess' entry of the rare list, so drop the article.
    if title.lower().startswith('the '):
        title = title[4:].strip()
    return title


def _map_title(title):
    """Collapse a raw title into Mr/Mrs/Miss/Master, 'Rare' or 'Other'."""
    if title in _COMMON_TITLES:
        return title
    if title in _RARE_TITLES:
        return 'Rare'
    return 'Other'


def _extract_deck(cabin):
    """Return the first character of the cabin value, or NaN when it is missing."""
    if pd.isna(cabin):
        return np.nan
    return str(cabin)[0]


def add_features(df):
    """Return a copy of `df` with engineered passenger features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Name, SibSp, Parch, Cabin, Fare and Age.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with these extra columns:

        - Title: Mr/Mrs/Miss/Master, 'Rare' for the listed rare honorifics,
          'Other' for anything else (including unparsable or missing names).
        - FamilySize: SibSp + Parch + 1.
        - IsAlone: 1 when FamilySize equals 1, else 0.
        - Deck: first character of Cabin, NaN when Cabin is missing.
        - FarePerPerson: Fare divided by FamilySize.
        - AgeGroup: ordered categorical with categories
          Unknown, Baby, Child, Teen, YoungAdult, Adult, Senior; 'Unknown'
          marks ages that are missing or fall outside (-1, 120].
    """
    df = df.copy()

    df['Title'] = df['Name'].map(_extract_title).map(_map_title)

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    df['Deck'] = df['Cabin'].map(_extract_deck)

    # FamilySize is at least 1 whenever SibSp and Parch are non-negative,
    # so the division needs no zero guard for such data.
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Bin the known ages, then label everything pd.cut could not place
    # (missing or out-of-range ages) as 'Unknown'.
    age_group = pd.cut(df['Age'], bins=_AGE_BINS, labels=_AGE_LABELS)
    age_group = age_group.cat.set_categories(['Unknown'] + _AGE_LABELS, ordered=True)
    df['AgeGroup'] = age_group.fillna('Unknown')

    return df

# Derive the engineered columns and preview a few of them.
df = add_features(df)
display(df[['Name','Title','FamilySize','IsAlone','Deck','Fare','FarePerPerson']].head())

# Embarked: fill the gaps with the most frequent value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Age: use the median age per Title (Mr/Miss/Mrs/Master/Rare/Other), falling
# back to the overall median when a Title has no known ages at all.
age_med_by_title = df.groupby('Title')['Age'].median()
# Computed once, before any age is filled in, so every row uses the same fallback.
overall_age_median = df['Age'].median()

def impute_age(row):
    """Return the row's Age, or a median-based replacement when it is missing.

    `row` must provide 'Age' and 'Title'. A missing Age is replaced by the
    median Age of the row's Title; if that median is unavailable, the overall
    median Age is used instead.
    """
    if not pd.isna(row['Age']):
        return row['Age']
    title_median = age_med_by_title.get(row['Title'], np.nan)
    if pd.isna(title_median):
        return overall_age_median
    return title_median

df['Age'] = df.apply(impute_age, axis=1)

# Re-bin AgeGroup now that the missing ages have been filled in.
df['AgeGroup'] = pd.cut(df['Age'], bins=[-1,5,12,18,35,60,120],
                       labels=['Baby','Child','Teen','YoungAdult','Adult','Senior'])

# Fare: fill gaps with the median fare of the passenger's class.
# transform() returns a result aligned to df's own index; assigning the result
# of groupby(...).apply(...) here failed with
# "TypeError: incompatible index of inserted column with frame index".
fare_median_by_class = df.groupby('Pclass')['Fare'].transform('median')
df['Fare'] = df['Fare'].fillna(fare_median_by_class)

# FarePerPerson was derived before Fare was imputed; recompute it so rows whose
# Fare was just filled in do not keep a missing per-person fare.
df['FarePerPerson'] = df['Fare'] / df['FamilySize']

# Deck: mark passengers without a known cabin with the placeholder 'U'.
df['Deck'] = df['Deck'].fillna('U')

# Encoding for modeling/analysis

# Columns treated as categorical / numeric by the encoding step.
cat_cols = ['Sex','Embarked','Title','Deck','Pclass','AgeGroup']
num_cols = ['Age','FarePerPerson','SibSp','Parch','FamilySize']

# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# in 1.2 and removed the old name in 1.4, where `sparse=False` raises TypeError.
# Note: this encoder is only constructed here; the dummy columns below are
# produced by pd.get_dummies.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Cast to str first so numeric categories such as Pclass get dummy columns too.
encoded = pd.get_dummies(df[cat_cols].astype(str), drop_first=False)

# Original columns followed by their dummy columns, aligned on a fresh 0..n-1 index.
df_encoded = pd.concat([df.reset_index(drop=True), encoded.reset_index(drop=True)], axis=1)

# NOTE(review): absolute path; to_csv needs the /mnt/data directory to exist
# already - confirm this matches the environment the notebook runs in.
clean_path = "/mnt/data/titanic_cleaned.csv"

# Columns written to the cleaned CSV (the un-encoded, imputed features).
model_ready_cols = ['PassengerId','Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked',
                    'Title','FamilySize','IsAlone','Deck','FarePerPerson','AgeGroup']
df[model_ready_cols].to_csv(clean_path, index=False)
print("Saved cleaned dataset to:", clean_path)


def survival_rate_table(frame, by):
    """Count passengers and survivors per group and add the survival rate.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'Survived' column and the column named by `by`.
    by : str
        Column to group on.

    Returns
    -------
    pandas.DataFrame
        Indexed by the groups of `by`, with columns 'count' (rows),
        'sum' (sum of Survived) and 'survival_rate' (sum / count, 3 decimals).
    """
    # observed=False keeps every category of a categorical grouper in the
    # table and avoids depending on pandas' default for that option.
    table = frame.groupby(by, observed=False)['Survived'].agg(['count','sum'])
    table['survival_rate'] = (table['sum'] / table['count']).round(3)
    return table


def plot_survival_rate(table, title):
    """Draw the 'survival_rate' column of `table` as a bar chart with the given title."""
    fig, ax = plt.subplots(figsize=(6,4))
    table['survival_rate'].plot(kind='bar', ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Survival rate')
    fig.tight_layout()
    plt.show()


# Survival rates by Sex
survival_by_sex = survival_rate_table(df, 'Sex')
print("\nSurvival by sex:\n")
display(survival_by_sex)

# Survival rates by Pclass
survival_by_pclass = survival_rate_table(df, 'Pclass')
print("\nSurvival by Pclass:\n")
display(survival_by_pclass)

# Survival by AgeGroup
survival_by_agegroup = survival_rate_table(df, 'AgeGroup')
print("\nSurvival by AgeGroup:\n")
display(survival_by_agegroup)

# Survival by Fare bins: quartiles of the per-person fare
# (a missing fare is treated as 0 before binning).
df['FareBin'] = pd.qcut(df['FarePerPerson'].fillna(0)+1e-6, 4, labels=['Low','Med','High','VeryHigh'])
survival_by_fare = survival_rate_table(df, 'FareBin')
print("\nSurvival by FareBin:\n")
display(survival_by_fare)

# survival distribution across Sex and Pclass (matplotlib only)
plot_survival_rate(survival_by_sex, 'Survival rate by Sex')
plot_survival_rate(survival_by_pclass, 'Survival rate by Pclass')


# Free-text checklist of bias/ethics caveats for this dataset. It is printed
# verbatim below; nothing in it is computed from `df`.
bias_report = """
Potential biases and ethical considerations to document and check:

1) Historical/societal bias:
   - The dataset reflects 1912 social norms: e.g., "women and children first" rescue practices,
     class-based differences (Pclass correlates with wealth/priority), and nationality/ethnicity differences.
   - Using the model's predictions for modern decisions or to infer moral claims is inappropriate.

2) Missingness bias:
   - Age and Cabin are Missing Not At Random (MNAR) in many cases: older or lower-class passengers
     may have sparser cabin records, which can bias any analysis relying on Cabin.

3) Survivorship and sample bias:
   - The dataset includes only passengers on Titanic; it is not representative of general population.

4) Label bias:
   - 'Survived' is a factual label, but any downstream use (e.g., automated triage simulation) should
     consider ethical implications, fairness, and historical context.

Checks to perform:
- Compare survival rate across protected-like attributes (Sex, AgeGroup, Pclass).
- Test whether imputations materially change group-level survival rates (sensitivity analysis).
- If building predictive models, run fairness metrics (demographic parity, equalized odds) across Sex and Pclass.
"""

# Emit the checklist as part of the notebook output.
print(bias_report)


# Re-measure missingness per column now that the imputation steps have run.
is_missing = df.isna()
missing_after = pd.DataFrame({
    'n_missing_after': is_missing.sum(),
    'pct_missing_after': is_missing.mean().round(3)*100
})
print("Missingness after imputation (top rows):")
# Keep only columns that still have gaps, worst first, at most ten of them.
still_missing = missing_after[missing_after['n_missing_after'] > 0]
display(still_missing.sort_values('n_missing_after', ascending=False).head(10))


# Closing summary of the artefacts produced above.
print("\nPipeline complete. Key outputs:")
print("- Cleaned file saved to:", clean_path)
print("- Data quality report (dq_report) available; numeric stats available.")
print("- Tables: survival_by_sex, survival_by_pclass, survival_by_agegroup, survival_by_fare.")


Rows, Columns: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data Quality Summary (top):


Unnamed: 0,dtype,n_missing,pct_missing,n_unique
PassengerId,int64,0,0.0,891
Survived,int64,0,0.0,2
Pclass,int64,0,0.0,3
Name,object,0,0.0,891
Sex,object,0,0.0,2
Age,float64,177,19.9,88
SibSp,int64,0,0.0,7
Parch,int64,0,0.0,7
Ticket,object,0,0.0,681
Fare,float64,0,0.0,248


Numeric summary:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


Missing counts:
 Age         177
Cabin       687
Embarked      2
dtype: int64


Unnamed: 0,Name,Title,FamilySize,IsAlone,Deck,Fare,FarePerPerson
0,"Braund, Mr. Owen Harris",Mr,2,0,,7.25,3.625
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs,2,0,C,71.2833,35.64165
2,"Heikkinen, Miss. Laina",Miss,1,1,,7.925,7.925
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs,2,0,C,53.1,26.55
4,"Allen, Mr. William Henry",Mr,1,1,,8.05,8.05


TypeError: incompatible index of inserted column with frame index