# Data Preprocessing for Regression

## Import libraries

In [1]:
from sklearn.datasets import load_diabetes, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

## Load the dataset

In [2]:
# Pick which regression dataset the rest of the notebook works on.
# Supported values: "diabetes", "california_housing"
selected_dataset = "diabetes"  # Edit this value to switch datasets

# Reject unsupported names up front, before any loader is called.
if selected_dataset not in ("diabetes", "california_housing"):
    raise ValueError("Invalid dataset selected. Choose 'diabetes' or 'california_housing'.")

# as_frame=True is requested so the result exposes pandas objects.
if selected_dataset == "diabetes":
    data = load_diabetes(as_frame=True)
else:
    data = fetch_california_housing(as_frame=True)

# Feature table and regression target used by the following cells.
X, y = data.data, data.target

## Handle missing values

In [None]:
# Report whether any feature cell is missing; when some are, fill each
# gap with the mean of its own column (X is modified in place).
has_missing = X.isna().to_numpy().any()
if not has_missing:
    print("No missing values found.")
else:
    print("Found missing values. Imputing with mean...")
    X.fillna(X.mean(), inplace=True)

## Data overview

In [None]:
# Print a quick summary of the feature table: first rows, column info,
# and the overall shape.
print("Dataset Overview:")
display(X.head())
print("_"*120)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit an extra
# "None" line after the report.
X.info()
print("_"*120)
print(f"\nNumber of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")

## Correlation heatmap

In [None]:
# Draw the pairwise feature correlations as an annotated heatmap.
corr_matrix = X.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

## Standardize dataset

In [9]:
# Fit a StandardScaler on the feature table, then apply it to the same
# table to obtain the standardized features.
# NOTE(review): the scaler is fitted on every row of X; if a train/test
# split is performed later, the scaling statistics will include test
# rows — confirm this is intended.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## Save data

In [None]:
# Bundle the standardized features, the unscaled feature table and the
# target into one dictionary and pickle it to the working directory.
preprocessed_data = dict(X_scaled=X_scaled, X_raw=X, y=y)

with open("preprocessed_data.pkl", "wb") as pkl_file:
    pickle.dump(preprocessed_data, pkl_file)

print("Preprocessed data saved.")