# Data Preprocessing and AI Model Selection

This notebook demonstrates preprocessing and model selection on two datasets:
- **Regression**: California Housing dataset with introduced missing values
- **Classification**: Titanic dataset

Key tasks covered:
- Data Cleaning
- Handling Missing Values
- Encoding Categorical Variables
- Feature Selection
- Model Selection and Evaluation
- Hyperparameter Tuning using `GridSearchCV` and `RandomizedSearchCV`


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

## Regression Task: California Housing Dataset with Missing Values

In [2]:
from sklearn.utils import shuffle

data = fetch_california_housing(as_frame=True)
df_california = data.frame
df_california = shuffle(df_california, random_state=42)

# Introduce missing values (10% in each column)
for col in df_california.columns:
    df_california.loc[df_california.sample(frac=0.1, random_state=42).index, col] = np.nan

print(df_california.isnull().sum())

MedInc         2064
HouseAge       2064
AveRooms       2064
AveBedrms      2064
Population     2064
AveOccup       2064
Latitude       2064
Longitude      2064
MedHouseVal    2064
dtype: int64


In [3]:
X = df_california.drop("MedHouseVal", axis=1)
y = df_california["MedHouseVal"]

imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
rf = RandomForestRegressor(random_state=42)
grid = GridSearchCV(rf, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train_scaled, y_train)

print("Best Params:", grid.best_params_)
print("Best R2 Score:", grid.best_score_)

Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best R2 Score: 0.8038929029979307


In [5]:
models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'RandomForest': RandomForestRegressor(**grid.best_params_)
}

for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    print(f"{name} R2 Avg: {scores.mean():.4f}")

LinearRegression R2 Avg: 0.6023
SVR R2 Avg: 0.7292
RandomForest R2 Avg: 0.8028


## Classification Task: Titanic Dataset

In [6]:
df_titanic = sns.load_dataset('titanic')
df_titanic.drop(['deck', 'embark_town', 'alive', 'who', 'class'], axis=1, inplace=True)
df_titanic['age'].fillna(df_titanic['age'].median(), inplace=True)
df_titanic['embarked'].fillna(df_titanic['embarked'].mode()[0], inplace=True)
df_titanic.dropna(inplace=True)

df_titanic = pd.get_dummies(df_titanic, columns=['sex', 'embarked'], drop_first=True)
X = df_titanic.drop('survived', axis=1)
y = df_titanic['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}
lr = LogisticRegression(max_iter=1000)
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='accuracy')
grid_lr.fit(X_train_scaled, y_train)

print("Best Params (LR):", grid_lr.best_params_)
print("Best CV Accuracy (LR):", grid_lr.best_score_)

Best Params (LR): {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV Accuracy (LR): 0.8201910765291046


In [8]:
param_dist_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 4, 6],
    'bootstrap': [True, False]
}
rf = RandomForestClassifier()
rand_rf = RandomizedSearchCV(rf, param_distributions=param_dist_rf, n_iter=10, scoring='accuracy', cv=5, random_state=42)
rand_rf.fit(X_train_scaled, y_train)

print("Best Params (RF):", rand_rf.best_params_)
print("Best CV Accuracy (RF):", rand_rf.best_score_)

Best Params (RF): {'n_estimators': 150, 'min_samples_split': 4, 'max_depth': 5, 'bootstrap': True}
Best CV Accuracy (RF): 0.8328277356446371


In [9]:
models = {
    'BestLogistic': grid_lr.best_estimator_,
    'BestRandomForest': rand_rf.best_estimator_,
    'SVC': SVC()
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print(f"\n{name} Evaluation:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


BestLogistic Evaluation:
Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.81       179
weighted avg       0.81      0.82      0.81       179


BestRandomForest Evaluation:
Accuracy: 0.8100558659217877
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


SVC Evaluation:
Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.72      0.76        74

    accuracy          