## **Exercise 1** 

Create functions that implement $R^2$, RMSE, and MAE in Python.

**Solution**

In [34]:
# numpy must be imported before the sample arrays are built, otherwise this
# cell fails with NameError on a fresh kernel.
import numpy as np

# Sample data
y_true = np.array([3, -0.5, 2, 7])
y_pred = np.array([2.5, 0.0, 2, 8])

def r2_score(y_true, y_pred):
    """Return the coefficient of determination, R^2 = 1 - SS_res / SS_tot.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        1.0 for a perfect fit; lower is worse and the value can be negative.
        When ``y_true`` is constant the ratio is undefined, so 1.0 is returned
        for a perfect prediction and 0.0 otherwise.
    """
    # Convert to float arrays so plain lists work and integer dtypes
    # (especially unsigned ones) cannot wrap around on subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)

    # Constant (non-empty) target: avoid dividing by zero and return a finite score.
    if y_true.size > 0 and ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Square root of the mean of the squared residuals (same units as the target).
    """
    # Float conversion lets plain lists through and stops unsigned/small
    # integer dtypes from wrapping around when the residual is formed.
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Mean of the absolute residuals (same units as the target).
    """
    # Float conversion lets plain lists through and stops unsigned integer
    # dtypes from wrapping around (e.g. uint8 0 - 2 would otherwise be 254).
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(residuals))


# Expected output for the sample data:
# R² Score: 0.9486
# rmse Score: 0.6124
# MAE: 0.5000


# Emit one line per metric, each formatted to four decimals.
print(
    f"R² Score: {r2_score(y_true, y_pred):.4f}",
    f"rmse Score: {rmse(y_true, y_pred):.4f}",
    f"MAE: {mae(y_true, y_pred):.4f}",
    sep="\n",
)


R² Score: 0.9486
rmse Score: 0.6124
MAE: 0.5000


## **Exercise 2**

In this exercise, you will work with a dataset containing information about houses.  
Your task is to predict the house price based on various features, including some that require preprocessing. You will try different combinations of pre-processing steps and compare the results.

Dataset features:
1. 'area': House area in square feet (numeric)
2. 'bedrooms': Number of bedrooms (numeric)
3. 'age': Age of the house in years (numeric)
4. 'neighborhood': Categorical feature with missing values (categorical)
5. 'distance_to_city_center': Distance to city center in miles (numeric, exponentially distributed)

Target variable:
- 'price_10K': House price in thousands of dollars (this is the name of the target column in the DataFrame)

Tasks:
1. Load and explore the dataset
2. Preprocess the data:
   - Handle missing values in 'neighborhood'
   - Encode 'neighborhood' using both OrdinalEncoder and OneHotEncoder
   - Create a log-scaled version of 'distance_to_city_center'
3. Create four different analyses:
   - No log scaling, OrdinalEncoder
   - No log scaling, OneHotEncoder
   - With log scaling, OrdinalEncoder
   - With log scaling, OneHotEncoder
4. Use scikit-learn's `cross_validate` and pass in custom scorers (built with `make_scorer`) that use the routines defined in Exercise 1.
5. Compare the results.  Which performs best?

In [14]:
import numpy as np
import pandas as pd

# Generate synthetic data
np.random.seed(42)
n_samples = 1000

area = np.random.uniform(1000, 5000, n_samples)
bedrooms = np.random.randint(1, 6, n_samples)
age = np.random.uniform(1, 50, n_samples)
# Cast to an object array: np.random.choice returns a fixed-width string array
# ('<U1'), and assigning np.nan into that would store the text 'n' instead of a
# real missing value. The cast does not consume any random numbers, so the
# generated data is otherwise unchanged.
neighborhood = np.random.choice(['A', 'B', 'C', 'D', 'E'], n_samples).astype(object)
distance_to_city_center = np.random.exponential(scale=5, size=n_samples)

# Introduce missing values in neighborhood (100 distinct rows)
neighborhood[np.random.choice(n_samples, 100, replace=False)] = np.nan

# Create target variable with log-scaled distance and neighborhood effect.
# Rows with a missing neighborhood match none of the comparisons below and
# therefore receive no neighborhood offset.
price = (
    10 * np.log(area) +
    5 * bedrooms -
    2 * age +
    np.where(neighborhood == 'A', 50, 0) +
    np.where(neighborhood == 'C', 30, 0) +
    np.where(neighborhood == 'E', 10, 0) +
    np.where(neighborhood == 'D', -20, 0) +
    np.where(neighborhood == 'B', -40, 0) -
    20 * np.log(distance_to_city_center + 1) +
    np.random.normal(0, 10, n_samples)
)

# Create DataFrame
df = pd.DataFrame({
    'area': area,
    'bedrooms': bedrooms,
    'age': age,
    'neighborhood': neighborhood,
    'distance_to_city_center': distance_to_city_center,
    'price_10K': price
})


In [None]:
# Show the generated DataFrame as this cell's output.
df

**Solution**

In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer



# Separate the target column from the feature columns.
y = df['price_10K']
X = df.drop(columns=['price_10K'])
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Define preprocessing pipelines
def test(X, y, encoder='ordinal'):
    """Cross-validate a linear regression on the housing data and print the scores.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the columns 'area', 'bedrooms', 'age',
        'distance_to_city_center' and 'neighborhood'.
    y : array-like
        Target values aligned with the rows of ``X``.
    encoder : str, default 'ordinal'
        'ordinal' selects ``OrdinalEncoder`` for 'neighborhood'; any other
        value selects ``OneHotEncoder(drop='first', sparse_output=False)``.

    Returns
    -------
    dict
        The dictionary produced by ``cross_validate`` (per-fold arrays), so
        callers can compare runs instead of only reading the printed summary.
    """
    numeric_features = ['area', 'bedrooms', 'age', 'distance_to_city_center']

    numeric_transformer = SimpleImputer(strategy='median')

    # Missing neighborhoods become their own 'missing' category before encoding.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OrdinalEncoder() if encoder == 'ordinal' else OneHotEncoder(drop='first', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, ['neighborhood'])
        ])

    # The metric functions already have the (y_true, y_pred) signature that
    # make_scorer expects, so they are passed directly.
    # NOTE: make_scorer defaults to greater_is_better=True, so RMSE and MAE are
    # reported as positive errors. That is fine for reporting; pass
    # greater_is_better=False before using them for model selection.
    scoring = {
        'R2': make_scorer(r2_score),
        'RMSE': make_scorer(rmse),
        'MAE': make_scorer(mae)
    }

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', LinearRegression())])

    scores = cross_validate(pipeline, X, y, cv=5, scoring=scoring)
    for key, values in scores.items():
        print(f"{key}: {values.mean():.3f} (+/- {values.std() * 2:.3f})")

    return scores


# Create and evaluate models
results = {}

for log_scale in [False, True]:
    for encoder in ['ordinal', 'onehot']:
        # Work on a copy so the original feature frame is never modified.
        X_variant = X.copy()

        if log_scale:
            # Select the column as a Series (single brackets) before transforming.
            X_variant['distance_to_city_center'] = np.log1p(X_variant['distance_to_city_center'])

        # Evaluate this configuration and keep its scores for comparison.
        label = f"{'Log' if log_scale else 'No Log'} - {encoder.capitalize()}"
        print(label)
        results[label] = test(X_variant, y, encoder)

# One row per configuration with the mean of each cross-validated test metric.
summary = pd.DataFrame({
    label: {name: values.mean() for name, values in scores.items() if name.startswith('test_')}
    for label, scores in results.items()
}).T
summary
        