# Data Cleaning
Cleaning missing values and preparing data for modeling.


In [1]:
import pandas as pd
import streamlit as st


## Load Raw Data
Load the raw training data and preview the first rows before cleaning.


In [4]:
# Page heading rendered at the top of the Streamlit app
st.title("Data Cleaning")

# Read the raw training set from disk and show its first rows
raw_csv_path = "../data/train.csv"
data = pd.read_csv(raw_csv_path)
raw_preview = data.head()
st.write("Raw Data Sample", raw_preview)




## Handling Missing Values
Missing values in numerical columns are filled with the column mean; missing values in categorical columns are filled with the column mode (its most frequent value).


In [5]:
# Split the columns by dtype so each group gets an appropriate fill strategy
numeric_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Fill numeric columns with the per-column mean.
# (A column that is entirely missing has a NaN mean and is left unchanged.)
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Fill categorical columns with the most frequent value
for col in categorical_cols:
    modes = data[col].mode()
    # mode() ignores NaN, so a column with no observed values yields an empty
    # Series; indexing it with [0] would raise. There is nothing sensible to
    # impute in that case, so leave the column as is.
    if modes.empty:
        continue
    # .iloc[0] selects the first mode by position (ties are returned sorted)
    data[col] = data[col].fillna(modes.iloc[0])

st.write("Cleaned Data Sample", data.head())




## Save Cleaned Data
The cleaned dataset is saved for further processing.

In [6]:
# Persist the cleaned frame without the index column, then confirm in the app
cleaned_csv_path = "../data/final_cleaned_train.csv"
data.to_csv(cleaned_csv_path, index=False)
st.write("Data cleaning completed and saved!")


