In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('your_dataset.csv')  # Replace with your actual dataset

# Display initial information
print("Initial Data Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Per-column count of missing values, to show where imputation will matter.
print("\nMissing Values:\n", df.isnull().sum())

# Separate features and target variable
X = df.drop('target_column', axis=1)  # Replace 'target_column' with your target variable
y = df['target_column']

# Identify numerical and categorical columns.
# The ColumnTransformer built from these lists only receives the columns named
# here, and by default it drops everything else, so the selectors must be
# broad enough not to lose features silently:
#   - 'number' matches every numeric dtype (int32, float32, unsigned, nullable
#     Int64/Float64, ...) rather than only int64/float64; booleans are not
#     included.
#   - 'string' additionally matches pandas' dedicated string dtype, which
#     'object' alone does not cover.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category', 'string']).columns

# Numerical features: fill missing values with the column median, then
# standardize with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numeric_steps)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns to its matching transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes)

# Full pipeline: preprocessing first, with room for an estimator afterwards.
pipeline_steps = [
    ('preprocessor', preprocessor),
    # Add your model here, e.g., ('classifier', LogisticRegression())
]
model_pipeline = Pipeline(pipeline_steps)

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data only, so the imputation and scaling
# statistics are learned without seeing the test rows.
model_pipeline.fit(X_train, y_train)

# Transform the test data with the fitted preprocessing step.
# The step is addressed by name instead of calling model_pipeline.transform():
# a Pipeline only exposes transform() when its last step has one, so the
# pipeline-level call would stop working once a classifier/regressor is
# appended. With the current single-step pipeline the result is the same.
X_test_transformed = model_pipeline.named_steps['preprocessor'].transform(X_test)

# Save the preprocessed data if needed
# NOTE(review): the transformed output may be a SciPy sparse matrix (see
# ColumnTransformer's `sparse_threshold`); if so, call .toarray() on it before
# wrapping it in a DataFrame - confirm against your data.
# pd.DataFrame(X_test_transformed).to_csv('preprocessed_test_data.csv', index=False)