In [33]:
import pandas as pd
import os

# Integer codes assigned to each categorical column that gets encoded.
_CATEGORY_CODES = {
    "Sex": {"male": 0, "female": 1},
    "Embarked": {"S": 0, "C": 1, "Q": 2},
}


def _fill_missing(df, columns):
    """Fill NaNs in ``columns`` in place: median for numeric, mode otherwise."""
    for col in columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            fill_value = df[col].median()
        else:
            # Checking "is numeric" rather than ``dtype == "object"`` keeps
            # text columns stored with pandas' string dtypes out of the
            # median branch, where they would raise.
            modes = df[col].mode()
            if modes.empty:
                # Column holds nothing but NaN, so there is no mode to use.
                continue
            fill_value = modes.iloc[0]
        df[col] = df[col].fillna(fill_value)


def _encode_categoricals(df):
    """Replace the labels of known categorical columns with integer codes."""
    for col, codes in _CATEGORY_CODES.items():
        if col not in df.columns:
            continue
        encoded = df[col].map(codes)
        # ``map`` turns any label missing from ``codes`` into NaN; report it
        # instead of silently re-introducing missing values.
        unmapped = int((encoded.isna() & df[col].notna()).sum())
        if unmapped:
            print("Warning:", unmapped, "unrecognised value(s) in", col,
                  "were encoded as NaN")
        df[col] = encoded


def preprocess_data(
    raw_path: str = "data/raw/titanic.csv",
    processed_dir: str = "data/processed",
) -> None:
    """Inspect, clean and encode the raw dataset, then save it as CSV.

    Steps, in order:
      1. Load ``raw_path`` and print a preview, summary statistics, shape
         and column info.
      2. Fill missing values: numeric columns with their median, all other
         columns with their most frequent value.
      3. Encode ``Sex`` and ``Embarked`` (when present) as integers.
      4. Write the result to ``<processed_dir>/processed.csv`` without the
         index, creating ``processed_dir`` if needed.

    Args:
        raw_path: Path of the CSV file to load.
        processed_dir: Directory that receives ``processed.csv``.

    Returns:
        None. The processed data is written to disk.

    Raises:
        FileNotFoundError: If ``raw_path`` does not exist.
    """
    df = pd.read_csv(raw_path)

    print("Dataset Preview")
    print(df.head())

    print("\nSTARTING PREPROCESSING AND INSPECTION...")

    # Basic dataset inspection
    print("\nDataset Description")
    print(df.describe())

    print("\nDataset Shape")
    print(df.shape)

    print("\nDataset Info")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    df.info()

    # -------------------------------
    # Handling Missing Values
    # -------------------------------
    print("\nChecking missing values per column")
    missing_counts = df.isna().sum()
    print(missing_counts)

    # Names of the columns that contain at least one missing value
    null_cols = missing_counts[missing_counts != 0].index.tolist()
    print("\nColumns with missing values:", null_cols)

    _fill_missing(df, null_cols)

    print("\nMissing values handled successfully")
    print(df.isna().sum())

    # -------------------------------
    # Encoding Categorical Variables
    # (Only basic encoding, NOT feature engineering)
    # -------------------------------
    print("\nEncoding categorical columns")
    _encode_categoricals(df)
    print("Categorical encoding completed")

    # -------------------------------
    # Save processed dataset
    # -------------------------------
    os.makedirs(processed_dir, exist_ok=True)

    processed_file = os.path.join(processed_dir, "processed.csv")
    df.to_csv(processed_file, index=False)

    print("\nPreprocessed data saved to:", processed_file)

# Run preprocessing only when this file is executed directly; importing it as
# a module leaves the call to the importer.
# NOTE(review): this looks like an exported notebook cell ("In [33]:"), where
# __name__ is also "__main__", so the call runs every time the cell executes.
if __name__ == "__main__":
    preprocess_data()


Dataset Preview
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   N

['Age', 'Cabin', 'Embarked']
