# Automatidata - Fixed with Cross-Validation and Data Leakage Prevention

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")


## 1. Create Sample DataFrame (Mocked Example Similar to Notebook)

In [2]:

data = {
    'feature_numeric_1': [1.0, 2.1, 3.2, np.nan, 4.5, 5.1, 6.7, 7.3, 8.1, 9.0],
    'feature_numeric_2': [5.1, 3.3, 4.4, 2.2, 6.6, 7.7, np.nan, 8.8, 9.9, 1.0],
    'feature_categorical': ['A', 'B', 'B', 'C', 'A', 'C', 'C', np.nan, 'A', 'B'],
    'target': [10.0, 15.2, 14.3, 12.5, 18.9, 22.1, 21.0, 24.4, 25.0, 23.9]
}
df = pd.DataFrame(data)
df.head()


Unnamed: 0,feature_numeric_1,feature_numeric_2,feature_categorical,target
0,1.0,5.1,A,10.0
1,2.1,3.3,B,15.2
2,3.2,4.4,B,14.3
3,,2.2,C,12.5
4,4.5,6.6,A,18.9


## 2. Split Off Test Set Early (To Avoid Data Leakage)

In [3]:

train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_val_df.shape, test_df.shape


((8, 4), (2, 4))

## 3. Define Preprocessing Pipelines

In [4]:

numeric_features = ['feature_numeric_1', 'feature_numeric_2']
categorical_features = ['feature_categorical']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


## 4. Cross-Validation Training with Linear Regression

In [5]:

X = train_val_df.drop(columns=['target'])
y = train_val_df['target']

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    mse_scores.append(mse)
    print(f"Fold {fold+1} MSE: {mse:.2f}")

print("\nAll MSE Scores:", mse_scores)
print("Average MSE:", np.mean(mse_scores))


Fold 1 MSE: 0.74
Fold 2 MSE: 29.06
Fold 3 MSE: 12.69
Fold 4 MSE: 33.34
Fold 5 MSE: 20.56

All MSE Scores: [0.7423173375190871, 29.05799520494115, 12.686314337699164, 33.339929843936716, 20.557920096455785]
Average MSE: 19.27689536411038


In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Create sample data
X, y = make_regression(n_samples=100, n_features=10, noise=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = LinearRegression()

# Set up grid search (just for demonstration — not many parameters here)
param_grid = {
    'fit_intercept': [True, False],
    'normalize': [True, False]  # Note: deprecated in newer sklearn versions
}

# Apply GridSearchCV
grid = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid.best_params_)
print("Best CV Score:", -grid.best_score_)


ValueError: Invalid parameter 'normalize' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].