In [None]:
# -----------------------------
#   IMPORT LIBRARIES
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import os
import random

# -----------------------------
#  RANDOM SEED
# -----------------------------
# One fixed seed is applied to both global generators used in this script:
# Python's built-in `random` module and NumPy's legacy global RNG.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# -----------------------------
#  LOAD DATASET
# -----------------------------
# Read the raw CSV (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer="sample_dataset.csv")

# Confirm the load and preview the first five rows.
print("Dataset Loaded!", df.head(), sep="\n")

# -----------------------------
# Identify numeric and categorical feature columns
# -----------------------------
# NOTE(review): "Survived" is assumed to be the prediction label of this
# dataset -- confirm. Selecting columns purely by dtype swept it into the
# numeric features, so it was mean-imputed and standardized like a feature.
# It is held out here and re-attached, unchanged, after preprocessing.
TARGET_COL = "Survived"
target_cols = [TARGET_COL] if TARGET_COL in df.columns else []
feature_df = df.drop(columns=target_cols)

# "number" covers every integer/float width (int32, float32, nullable ints, ...)
# instead of only int64/float64. pandas also counts timedelta as a number, so
# it is excluded explicitly because it cannot be mean-imputed or scaled.
numeric_cols = feature_df.select_dtypes(
    include="number", exclude=["timedelta"]
).columns.tolist()
# Plain string columns plus pandas "category" columns.
categorical_cols = feature_df.select_dtypes(
    include=["object", "category"]
).columns.tolist()

print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)

# -----------------------------
# Numeric Pipeline
# -----------------------------
# Fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# -----------------------------
# Categorical Pipeline
# -----------------------------
# Fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# -----------------------------
# Column Transformer
# -----------------------------
# Columns not listed in either group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# -----------------------------
# Full Pipeline
# -----------------------------
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# -----------------------------
# Apply Pipeline
# -----------------------------
# Fit on the features only, so the fitted pipeline does not depend on the
# label column being present when it is reused later.
cleaned_data = pipeline.fit_transform(feature_df)

# -----------------------------
# Convert to DataFrame
# -----------------------------
# The transformer may return a SciPy sparse matrix (one-hot output); densify
# it for the DataFrame while leaving `cleaned_data` itself untouched.
dense_data = cleaned_data.toarray() if hasattr(cleaned_data, "toarray") else cleaned_data

# Label the output columns (e.g. "num__Age", "cat__Gender_male") so the
# exported CSV is self-describing. Older scikit-learn releases do not provide
# get_feature_names_out on every step; fall back to positional column labels.
try:
    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
except AttributeError:
    feature_names = None

cleaned_df = pd.DataFrame(dense_data, columns=feature_names, index=df.index)

# Re-attach the untouched label as the last column.
for col in target_cols:
    cleaned_df[col] = df[col]

# -----------------------------
# Export Cleaned Dataset and Save Pipeline
# -----------------------------
# Make sure the output folder exists, then write the cleaned table without
# the DataFrame index column.
os.makedirs("data", exist_ok=True)
cleaned_df.to_csv(path_or_buf="data/cleaned_dataset.csv", index=False)

# Persist the fitted preprocessing pipeline (written to the working
# directory, not to data/).
joblib.dump(value=pipeline, filename="preprocessing_pipeline.pkl")

print(
    "Cleaned dataset saved to data/cleaned_dataset.csv",
    "Pipeline saved to preprocessing_pipeline.pkl",
    "Model by Mahnoor khan swati",
    sep="\n",
)


Dataset Loaded!
   Age  Gender    Fare Embarked  Survived
0   52    male  197.17        C         0
1   15    male  164.09        Q         1
2   72  female   30.19        C         1
3   61    male   48.79        S         0
4   21  female  225.65        S         0
Numeric Columns: ['Age', 'Fare', 'Survived']
Categorical Columns: ['Gender', 'Embarked']
Cleaned dataset saved to data/cleaned_dataset.csv
Pipeline saved to preprocessing_pipeline.pkl
Model by Mahnoor khan swati
