<a href="https://colab.research.google.com/github/Ironsoldier353/ML_project_01/blob/main/Predict_sale_price_optimized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the libraries

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

Loading the dataset

In [53]:
# Read the training and test CSV files (in that order) from the working directory
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

Preprocessing the data

In [54]:
# Separate features and target from training data
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

# Columns the model consumes. Any column of X that is not listed here is
# ignored by the ColumnTransformer below (its default is to drop the remainder).
numeric_features = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
                    'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
                    'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'WoodDeckSF', 'OpenPorchSF',
                    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
                    'MoSold', 'YrSold', 'GarageCars', 'GarageArea']

categorical_features = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
                        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
                        'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
                        'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
                        'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                        'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
                        'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
                        'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature',
                        'SaleType', 'SaleCondition']

# Check that the test set provides every model feature, but do NOT subset it:
# the preprocessor picks its columns by name, and trimming test_data here would
# discard 'Id', which the prediction cell needs to label each predicted SalePrice.
model_features = numeric_features + categorical_features
missing_features = [col for col in model_features if col not in test_data.columns]
if missing_features:
    raise KeyError(f"test_data is missing required feature columns: {missing_features}")

# Preprocessor: numeric columns are median-imputed then standardised;
# categorical columns are mode-imputed then one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': categories unseen during fit encode as all zeros
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_features)
    ]
)

# Hold out 20% of the training rows for validation (seeded for reproducibility)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


Training the model

In [55]:
# Define Gradient Boosting model pipeline. Keeping the preprocessor inside the
# pipeline means it is re-fitted together with the regressor on every CV split.
gbm_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Seeded so that tuning scores and final predictions are reproducible across
    # runs, consistent with the seeded train/validation split and search.
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Small hyperparameter search space (2 values per parameter, 32 combinations)
param_distributions = {
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__max_depth': [3, 5],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2]
}

# Randomized search: samples 5 of the combinations, each scored by 3-fold CV R²
random_search = RandomizedSearchCV(estimator=gbm_model, param_distributions=param_distributions,
                                    n_iter=5, cv=3, scoring='r2', n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

# Best parameters and mean cross-validated score
print("Best parameters found for Gradient Boosting: ", random_search.best_params_)
print("Best R² score found for Gradient Boosting: ", random_search.best_score_)

# Evaluate the refitted best pipeline on the held-out validation data
y_pred_gbm = random_search.best_estimator_.predict(X_valid)
rmse_gbm = np.sqrt(mean_squared_error(y_valid, y_pred_gbm))
r2_gbm = r2_score(y_valid, y_pred_gbm)

print(f'Gradient Boosting RMSE: {rmse_gbm:.2f}')
print(f'Gradient Boosting R²: {r2_gbm:.4f}')

# Transform the test data using the fitted preprocessor (the regressor step is
# applied to these features in the prediction cell)
test_features_transformed = random_search.best_estimator_.named_steps['preprocessor'].transform(test_data)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters found for Gradient Boosting:  {'regressor__n_estimators': 200, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.1}
Best R² score found for Gradient Boosting:  0.8743079817058524
Gradient Boosting RMSE: 26143.37
Gradient Boosting R²: 0.9109


Predicting the SalePrice

In [56]:

# Score the already-transformed test features with the tuned regressor step
best_regressor = random_search.best_estimator_.named_steps['regressor']
y_test_pred = best_regressor.predict(test_features_transformed)

# Assemble the output columns, leading with 'Id' whenever test_data carries one
prediction_columns = {}
if 'Id' in test_data.columns:
    prediction_columns['Id'] = test_data['Id']
prediction_columns['SalePrice'] = y_test_pred
predictions = pd.DataFrame(prediction_columns)

# Write the predictions to a CSV file, omitting the DataFrame index
predictions.to_csv('predictions1.csv', index=False)
