In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input mount.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Shell escape: print the version of the `python` executable found on PATH.
!python --version

Python 3.10.12


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Show every column of a wide frame on one unwrapped line when it is displayed.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

In [None]:
# Read the two competition CSVs from the Kaggle input mount.
# `df` is the labelled frame that is split into train/evaluation parts below;
# `test_df` is loaded here but not referenced again in the cells shown.
df = pd.read_csv("/kaggle/input/house-prices/train.csv")
test_df = pd.read_csv("/kaggle/input/house-prices/test.csv")

In [None]:
# Preview the first rows of the labelled frame.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labelled rows for evaluation (seeded for reproducibility).
# Features and target are separated *before* the split and passed together, so
# both halves come back already aligned and nothing has to be dropped in place
# afterwards. With the same random_state the selected rows are the same as when
# the whole frame is split and "SalePrice" is removed later.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["SalePrice"]),
    df["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Untouched copy of the training features, kept aside for reference.
X_train_saved = X_train.copy()

# Column Dropper, ModeImputer and Scaler classes for feature engineering

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class ModeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with the per-column mode learned during ``fit``.

    ``fit`` stores the first row of ``X.mode()`` in ``self.modes`` (a Series
    indexed by column name); ``transform`` uses it to fill each column of a
    copy of its input.
    """

    def fit(self, X, y=None):
        """Remember the first mode of every column of ``X``; ``y`` is ignored."""
        # mode() can return several rows when values tie; only row 0 is kept.
        self.modes = X.mode().iloc[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose missing entries are replaced by the stored modes."""
        filled = X.copy()
        for name in filled.columns:
            fill_value = self.modes[name]
            filled[name] = filled[name].fillna(fill_value)
        return filled

In [None]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns.

    Parameters
    ----------
    columns_to_drop : list
        Column labels handed to ``DataFrame.drop(columns=...)``.
    """

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step fits into a pipeline."""
        return self

    def transform(self, X):
        """Return a new frame without ``columns_to_drop``; ``X`` itself is not modified."""
        trimmed = X.drop(columns=self.columns_to_drop)
        return trimmed

In [None]:
from sklearn.preprocessing import StandardScaler

class Scaler(BaseEstimator, TransformerMixin):
    """Apply ``StandardScaler`` to a chosen subset of columns only.

    Parameters
    ----------
    columns_to_scale : list
        Columns that are standardised; every other column is returned unchanged.
    """

    def __init__(self, columns_to_scale):
        self.columns_to_scale = columns_to_scale

    def fit(self, X, y=None):
        """Fit a fresh ``StandardScaler`` on the selected columns; ``y`` is ignored."""
        selected = X[self.columns_to_scale]
        self.scaler = StandardScaler()
        self.scaler.fit(selected)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns replaced by their scaled values."""
        scaled = X.copy()
        scaled[self.columns_to_scale] = self.scaler.transform(scaled[self.columns_to_scale])
        return scaled

In [None]:
# Rank the numeric training features by sample skewness, largest first.
numeric_feats = X_train.select_dtypes(include=[np.number])
skewed_feats = numeric_feats.apply(pd.Series.skew).sort_values(ascending=False)

# Features with skewness above 1 are the ones handed to SkewedFeatsHandler later on.
is_heavily_skewed = skewed_feats > 1
skewed_feats_cols = skewed_feats[is_heavily_skewed].index

In [None]:
from sklearn.preprocessing import StandardScaler

class SkewedFeatsHandler(BaseEstimator, TransformerMixin):
    """Replace the given columns by ``log1p`` of their values.

    Parameters
    ----------
    skewed_cols : list-like
        Columns to transform; all other columns are passed through unchanged.
    """

    def __init__(self, skewed_cols):
        self.skewed_cols = skewed_cols

    def fit(self, X, y=None):
        """Stateless step: nothing is learned from ``X`` or ``y``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which ``skewed_cols`` hold ``np.log1p`` of the input."""
        logged = X.copy()
        columns = self.skewed_cols
        logged[columns] = np.log1p(logged[columns])
        return logged

In [None]:
# Names of all object-dtype (string-like) columns in the training features.
cat_cols = X_train.select_dtypes(include='object').columns.tolist()
cat_cols

In [None]:
# Number of distinct values per categorical column of the training split
# (Series indexed by column name); used below to choose an encoding per column.
cnts = X_train[cat_cols].nunique()
cnts

In [None]:
# Cardinality cut-off: categorical columns with more than `threshold` distinct
# values are target-encoded, the remaining ones are one-hot encoded.
threshold = 3

# Use the named constant in both filters so the two lists can never drift apart
# from the documented cut-off (it used to be defined but not referenced).
target_encoded_cols = list(cnts[cnts > threshold].index)
one_hot_cols = list(cnts[cnts <= threshold].index)
target_encoded_cols

In [None]:
# Show the low-cardinality columns that will be one-hot encoded.
one_hot_cols

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

class CategoricalPreprocessor(BaseEstimator, TransformerMixin):
    """Encode categorical columns: mean-target encoding for some, one-hot for others.

    Parameters
    ----------
    target_encoded_cols : list
        Columns whose categories are replaced by the mean of ``y`` observed for
        that category during ``fit``. Categories that were not seen in ``fit``
        receive the overall mean of ``y``.
    one_hot_columns : list
        Columns expanded by ``OneHotEncoder(drop='first', handle_unknown='ignore')``.

    Attributes set by ``fit``
    -------------------------
    encodings : dict
        ``{column: {category: mean_target, '_default_': overall_mean}}``.
    encoder : ColumnTransformer
        Fitted transformer that one-hot encodes ``one_hot_columns`` and passes
        every other column through.
    """

    def __init__(self, target_encoded_cols, one_hot_columns):
        self.target_encoded_cols = target_encoded_cols
        self.one_hot_columns = one_hot_columns

    def fit(self, X, y):
        """Learn the per-category target means and fit the one-hot encoder.

        ``y`` is paired with the rows of ``X`` by position, so it may be a
        Series (any index) or an array of the same length as ``X``.
        """
        # Put the target on X's index so the group-by below pairs rows
        # positionally and no helper column has to be written into a copy of X.
        target = pd.Series(np.asarray(y), index=X.index)
        global_mean = target.mean()

        self.encodings = {}
        for col in self.target_encoded_cols:
            encoding_dict = target.groupby(X[col]).mean().to_dict()
            # Fallback used by transform() for categories absent from the fit data.
            encoding_dict['_default_'] = global_mean
            self.encodings[col] = encoding_dict

        encoder = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), self.one_hot_columns)
            ],
            remainder='passthrough',
            verbose_feature_names_out=False
        )
        # Request DataFrame output: the row index and the dtype of every
        # passthrough column are kept, instead of everything being funnelled
        # through one object-dtype NumPy array and re-wrapped with a fresh index.
        encoder.set_output(transform="pandas")
        encoder.fit(X)

        self.encoder = encoder

        return self

    def transform(self, X):
        """Return a new DataFrame with both encodings applied.

        The result keeps the index of ``X``; one-hot columns come first,
        followed by the passthrough columns (target-encoded ones as floats).
        """
        print("Preprocessing One Hot Columns")
        X_transformed = self.encoder.transform(X)

        print("***")
        # NOTE: the message says "WOE" but what is applied is the mean-target
        # encoding learned in fit(); the text is kept as emitted before.
        print("Preprocessing WOE Columns")
        for col in self.target_encoded_cols:
            mapping = self.encodings[col]
            # Vectorised lookup; anything without a learned mean (unseen
            # category or missing value) falls back to the overall mean.
            X_transformed[col] = X_transformed[col].map(mapping).fillna(mapping['_default_'])

        return X_transformed

# RFE

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,Ridge

# Baseline pipeline: the steps run in the order listed, ending in the regressor.
pipeline = Pipeline([
    ('ColumnDropper', ColumnDropper(["Id"])),  # remove the "Id" column
    ('NAHandling', ModeImputer()),  # fill missing values with per-column modes
    ('SkewedFeatsScaler', SkewedFeatsHandler(skewed_feats_cols)),  # log1p on the skewed numeric columns
    ('HandlingCategoricalValues', CategoricalPreprocessor(target_encoded_cols, one_hot_cols)),  # target-mean + one-hot encoding
    ('Scaler', Scaler(target_encoded_cols)),  # standardise only the target-encoded columns
    ('Model', LinearRegression())
])

In [None]:
# Fit every preprocessing step and the final estimator on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# In-sample predictions, used below to report the training error.
y_train_pred = pipeline.predict(X_train)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

def evaluate(y_true, y_pred, dataset="Set"):
    """Print MAE, MAPE, MSE, RMSE and R² for one set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted targets, passed straight to the sklearn metrics.
    dataset : str
        Label used in the heading line of the report.

    Nothing is returned; the report is written to stdout.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** 0.5
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    report_lines = [
        f"{dataset} Performance:",
        f"MAE: {mae:.2f}",
        f"MAPE: {mape:.2f}",
        f"MSE: {mse:.2f}",
        f"RMSE: {rmse:.2f}",
        f"R² Score: {r2:.2f}\n",
    ]
    # One write for the whole report; the trailing "\n" keeps the blank line after it.
    print("\n".join(report_lines))


evaluate(y_train, y_train_pred, "Train")

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# 5-fold shuffled cross-validation on the training split (seeded).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer yields negated MSE per fold; flip the sign before taking the root.
neg_mse_per_fold = cross_val_score(
    pipeline,
    X_train,
    y_train,
    cv=kf,
    scoring='neg_mean_squared_error',
)
rmse_scores = np.sqrt(-neg_mse_per_fold)

print("RMSE for each fold:", rmse_scores)
print(f"Average RMSE: {np.mean(rmse_scores):.4f}")

In [None]:
# Score the held-out split. The label is passed explicitly so the report is
# headed "Test" (matching the "Train" report) instead of evaluate()'s generic
# default heading "Set".
y_test_pred = pipeline.predict(X_test)
evaluate(y_test, y_test_pred, "Test")

In [None]:
!pip install mlflow dagshub --quiet

In [None]:
import mlflow
import dagshub
# Initialise DagsHub for the given repository with its MLflow integration enabled.
# NOTE(review): presumably this points MLflow tracking at the DagsHub repo and may
# prompt for credentials — confirm against the dagshub documentation.
dagshub.init(repo_owner='Luka-Surmanidze', repo_name='MachineLearning', mlflow=True)

In [None]:
# MLflow experiment / run names for the LinearRegression experiment.
# NOTE(review): "LiearRegression" looks like a typo; it is left untouched because
# the string identifies the experiment in the tracking server — confirm before renaming.
experiment = "LiearRegression, second experiment for hw1"
run_name="Linear RegressionV2"

In [None]:
# Select the experiment named above as the target for subsequent runs.
mlflow.set_experiment(experiment)

In [None]:

# Close any run left active by an earlier, interrupted execution of this cell.
mlflow.end_run()

# Rebuild the pipeline so the logged model is fitted from scratch for this run.
pipeline = Pipeline([
    ('ColumnDropper', ColumnDropper(["Id"])),
    ('NAHandling', ModeImputer()),
    ('SkewedFeatsScaler', SkewedFeatsHandler(skewed_feats_cols)),
    ('HandlingCategoricalValues', CategoricalPreprocessor(target_encoded_cols, one_hot_cols)),
    ('Scaler', Scaler(target_encoded_cols)),
    ('Model', LinearRegression())
])

# The context manager closes the run on exit, including when a step below
# raises, so a failure no longer leaves a dangling active run behind.
with mlflow.start_run(run_name=run_name):
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("skewed_features_count", len(skewed_feats_cols))
    # Log the constant that defines the encoding split rather than a repeated literal.
    mlflow.log_param("one_hot_threshold", threshold)
    mlflow.log_param("encoding_method", "OneHotEncoder/TargetEncoding")

    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    data_sets = [[y_train, y_train_pred, "train"], [y_test, y_test_pred, "test"]]

    # Cross-validated RMSE on the training split (the scorer returns negated MSE).
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = np.sqrt(-cross_val_score(pipeline, X_train, y_train, cv=kf,
                                           scoring='neg_mean_squared_error'))
    mlflow.log_metric("Average RMSE", float(np.mean(rmse_scores)))

    # Log and print the same five metrics for the train and the held-out split.
    for y_true, y_pred, dataset in data_sets:
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = mse ** 0.5
        mape = mean_absolute_percentage_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

        mlflow.log_metrics({
            f'{dataset}_mae': mae,
            f'{dataset}_mape': mape,
            f'{dataset}_mse': mse,
            f'{dataset}_rmse': rmse,
            f'{dataset}_r2': r2
        })
        print(f"{dataset} Performance:")
        print(f"MAE: {mae:.2f}")
        print(f"MAPE: {mape:.2f}")
        print(f"MSE: {mse:.2f}")
        print(f"RMSE: {rmse:.2f}")
        print(f"R² Score: {r2:.2f}\n")

    mlflow.sklearn.log_model(pipeline, "LinearRegressionV2")

# Experiment 3

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,Ridge

# Same preprocessing as the baseline pipeline, with a Ridge regressor as the final step.
# alpha=1.0 is the initial value of the penalty for the 'Model' step.
pipeline = Pipeline([
    ('ColumnDropper', ColumnDropper(["Id"])),  # remove the "Id" column
    ('NAHandling', ModeImputer()),  # fill missing values with per-column modes
    ('SkewedFeatsScaler', SkewedFeatsHandler(skewed_feats_cols)),  # log1p on the skewed numeric columns
    ('HandlingCategoricalValues', CategoricalPreprocessor(target_encoded_cols, one_hot_cols)),  # target-mean + one-hot encoding
    ('Scaler', Scaler(target_encoded_cols)),  # standardise only the target-encoded columns
    ('Model', Ridge(alpha=1.0))
])

In [None]:
from sklearn.model_selection import GridSearchCV

# Only the Ridge penalty is tuned here. This pipeline has no 'RFE' step, so the
# former 'RFE__n_features_to_select' entry made GridSearchCV fail with
# "Invalid parameter 'RFE' for estimator Pipeline" as soon as fit() was called.
param_grid = {
    'Model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE of the best candidate; convert it to RMSE.
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", np.sqrt(-grid_search.best_score_))

# Predictions of the best candidate on both splits.
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

evaluate(y_train, y_train_pred, "Train")
evaluate(y_test, y_test_pred, "Test")

In [None]:
# MLflow experiment / run names for the plain Ridge experiment.
experiment = "Ridge, third experiment for hw1"
run_name="Ridge"

In [None]:
# Select the experiment named above as the target for subsequent runs.
mlflow.set_experiment(experiment)

In [None]:

# Close any run left active by an earlier, interrupted execution of this cell.
mlflow.end_run()

# The context manager closes the run on exit, including when a step below raises.
with mlflow.start_run(run_name=run_name):
    # The tuned estimator is a Ridge model; it used to be logged as
    # "LinearRegression" because the block was copied from the previous experiment.
    mlflow.log_param("model_type", "Ridge")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("skewed_features_count", len(skewed_feats_cols))
    mlflow.log_param("one_hot_threshold", threshold)
    mlflow.log_param("encoding_method", "OneHotEncoder/TargetEncoding")
    # Record the hyper-parameters chosen by the grid search next to the metrics.
    mlflow.log_params(grid_search.best_params_)

    data_sets = [[y_train, y_train_pred, "train"], [y_test, y_test_pred, "test"]]

    # best_score_ is a negated MSE; store it as RMSE.
    mlflow.log_metric("best_RMSE_score", float(np.sqrt(-grid_search.best_score_)))

    # Log and print the same five metrics for the train and the held-out split.
    for y_true, y_pred, dataset in data_sets:
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = mse ** 0.5
        mape = mean_absolute_percentage_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

        mlflow.log_metrics({
            f'{dataset}_mae': mae,
            f'{dataset}_mape': mape,
            f'{dataset}_mse': mse,
            f'{dataset}_rmse': rmse,
            f'{dataset}_r2': r2
        })
        print(f"{dataset} Performance:")
        print(f"MAE: {mae:.2f}")
        print(f"MAPE: {mape:.2f}")
        print(f"MSE: {mse:.2f}")
        print(f"RMSE: {rmse:.2f}")
        print(f"R² Score: {r2:.2f}\n")

    mlflow.sklearn.log_model(best_model, "Ridge")

# Ridge with StandardScaler and RFE

In [None]:
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator, TransformerMixin

class RFETransformer(BaseEstimator, TransformerMixin):
    """Recursive-feature-elimination step usable inside a ``Pipeline``.

    Wraps ``sklearn.feature_selection.RFE``. When fitted on a DataFrame the
    names of the surviving columns are remembered so that DataFrame inputs are
    reduced by name; in every other case selection is delegated to the fitted
    ``RFE`` object.

    Parameters
    ----------
    estimator : estimator, default None
        Model handed to ``RFE``. ``None`` is only a placeholder default —
        presumably ``RFE`` cannot be fitted without a real estimator.
    n_features_to_select : int or None
        Forwarded unchanged to ``RFE``.
    step : int, default 1
        Forwarded unchanged to ``RFE``.
    """

    def __init__(self, estimator=None, n_features_to_select=None, step=1):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.step = step
        # Fitted state, filled in by fit().
        self.selected_features_ = None
        self.rfe = None

    def fit(self, X, y=None):
        """Fit the wrapped ``RFE`` and record the selected column names, if any."""
        self.rfe = RFE(
            estimator=self.estimator,
            n_features_to_select=self.n_features_to_select,
            step=self.step
        )
        self.rfe.fit(X, y)

        # Always overwrite the remembered names: a refit on a plain array must
        # not keep the column names of an earlier DataFrame fit.
        if hasattr(X, 'columns'):
            self.selected_features_ = X.columns[self.rfe.support_].tolist()
        else:
            self.selected_features_ = None

        return self

    def transform(self, X):
        """Return ``X`` reduced to the selected features.

        Selection by column name is used only when names were recorded in
        ``fit`` *and* ``X`` has columns. Deciding on the input type alone made
        a DataFrame passed after an array fit index with ``None`` and fail.
        """
        if self.selected_features_ is not None and hasattr(X, 'columns'):
            return X[self.selected_features_]
        return self.rfe.transform(X)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,Ridge

# Ridge pipeline with full standardisation and recursive feature elimination.
# n_features_to_select=50 and the final alpha=1.0 are initial values; the grid
# search over 'RFE__n_features_to_select' and 'Model__alpha' replaces both.
pipeline = Pipeline([
    ('ColumnDropper', ColumnDropper(["Id"])),  # remove the "Id" column
    ('NAHandling', ModeImputer()),  # fill missing values with per-column modes
    ('SkewedFeatsScaler', SkewedFeatsHandler(skewed_feats_cols)),  # log1p on the skewed numeric columns
    ('HandlingCategoricalValues', CategoricalPreprocessor(target_encoded_cols, one_hot_cols)),  # target-mean + one-hot encoding
    ('Scaler', StandardScaler()),  # standardise every column (not just the target-encoded ones)
    ('RFE', RFETransformer(estimator=Ridge(alpha=100.0), n_features_to_select=50, step=1)),  # feature ranking uses its own Ridge(alpha=100.0)
    ('Model', Ridge(alpha=1.0))
])

In [None]:
from sklearn.model_selection import GridSearchCV


# Joint search over the Ridge penalty and the number of features kept by the RFE step.
param_grid = dict(
    Model__alpha=[0.01, 0.1, 1.0, 10.0, 100.0],
    RFE__n_features_to_select=[30, 40, 50, 60, 70],
)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE of the best candidate; report it as RMSE.
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", np.sqrt(-grid_search.best_score_))

# Predictions of the best candidate on both splits, then the printed reports.
best_model = grid_search.best_estimator_
y_train_pred, y_test_pred = best_model.predict(X_train), best_model.predict(X_test)

for _truth, _pred, _label in ((y_train, y_train_pred, "Train"), (y_test, y_test_pred, "Test")):
    evaluate(_truth, _pred, _label)

In [None]:
# MLflow experiment / run names for the Ridge + RFE run. The experiment name is
# the same string as in the plain Ridge cell; only the run name differs.
experiment = "Ridge, third experiment for hw1"
run_name="Ridge with RFE(model is Ridge(100)) and StandardScaler"
mlflow.set_experiment(experiment)

In [None]:

# Close any run left active by an earlier, interrupted execution of this cell.
mlflow.end_run()

# The context manager closes the run on exit, including when a step below raises.
with mlflow.start_run(run_name=run_name):
    # The tuned estimator is a Ridge model; it used to be logged as
    # "LinearRegression" because the block was copied from an earlier experiment.
    mlflow.log_param("model_type", "Ridge")
    mlflow.log_param("random_state", 42)
    mlflow.log_param("skewed_features_count", len(skewed_feats_cols))
    mlflow.log_param("one_hot_threshold", threshold)
    mlflow.log_param("encoding_method", "OneHotEncoder/TargetEncoding")
    # Record the penalty and feature count chosen by the grid search.
    mlflow.log_params(grid_search.best_params_)

    data_sets = [[y_train, y_train_pred, "train"], [y_test, y_test_pred, "test"]]

    # best_score_ is a negated MSE; store it as RMSE.
    mlflow.log_metric("best_RMSE_score", float(np.sqrt(-grid_search.best_score_)))

    # Log and print the same five metrics for the train and the held-out split.
    for y_true, y_pred, dataset in data_sets:
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        rmse = mse ** 0.5
        mape = mean_absolute_percentage_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

        mlflow.log_metrics({
            f'{dataset}_mae': mae,
            f'{dataset}_mape': mape,
            f'{dataset}_mse': mse,
            f'{dataset}_rmse': rmse,
            f'{dataset}_r2': r2
        })
        print(f"{dataset} Performance:")
        print(f"MAE: {mae:.2f}")
        print(f"MAPE: {mape:.2f}")
        print(f"MSE: {mse:.2f}")
        print(f"RMSE: {rmse:.2f}")
        print(f"R² Score: {r2:.2f}\n")

    mlflow.sklearn.log_model(best_model, "Ridge")