# In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the training CSV from the working directory into a DataFrame
train = pd.read_csv(filepath_or_buffer="train.csv")

# Print a summary of the frame: column names, dtypes and non-null counts
train.info()

# Function to clean the data by dropping the 'Id' column and columns with too many missing values
def clean_data(X, max_missing=100):
    """Return a copy of X without the 'Id' column and without sparse columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    max_missing : int, default 100
        A column is dropped when its number of missing values is strictly
        greater than this threshold.

    Returns
    -------
    pandas.DataFrame
        X with 'Id' (if present) and every column exceeding the
        missing-value threshold removed.
    """
    # errors='ignore' keeps the function usable on frames that do not carry 'Id'.
    without_id = X.drop(columns='Id', errors='ignore')

    # Count missing values AFTER removing 'Id', so an 'Id' column that is itself
    # sparse is never scheduled for a second drop (which would raise KeyError).
    missing_counts = without_id.isnull().sum()
    sparse_cols = missing_counts[missing_counts > max_missing].index

    return without_id.drop(columns=sparse_cols)

# Clean the training data
train_cl = clean_data(train)

# Separate features (X) and target variable (y)
X = train_cl.drop(columns="SalePrice")
y = train_cl["SalePrice"]

# Identify numerical and categorical columns. Both lists are derived from the
# feature frame X (not train_cl) so the target column can never end up in them.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Define a transformer for numerical columns: impute missing values with the mean, then scale the values
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define a transformer for categorical columns: impute missing values with the most frequent value, then apply one-hot encoding
# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Combine the numerical and categorical transformers into a single preprocessor.
# remainder="passthrough" forwards any column that is in neither list unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ], remainder="passthrough")

# NOTE: the preprocessor is intentionally left unfitted here. Fitting it on
# train_cl would learn imputation/scaling statistics from rows that later land
# in the test split and would pass the SalePrice target through as a feature;
# it is fitted on the training split only, as part of each model pipeline.

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Candidate regressors, keyed by the label used in the results table.
# Every estimator that accepts a seed is seeded for reproducibility.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
}

# Fit each candidate on the training split and score it on the held-out split
results = {}
for name, model in models.items():
    # Chain the shared preprocessor with the current estimator
    pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

    # fit() returns the pipeline itself, so training and prediction can be chained
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Coefficient of determination and mean absolute error on the test split
    r2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)

    # Record both metrics under the model's label
    results[name] = {'R²': r2, 'MAE': mae}

# Show the collected metrics for every model
results
