In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [65]:
data = pd.read_csv('train.csv')
# Display basic info
print("Dataset info:")
print(data.info())
print("\nMissing values:\n", data.isnull().sum())

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  

In [66]:
# Handle missing values , Impute missing numeric values with the mean
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
numeric_imputer = SimpleImputer(strategy='mean')
data[numeric_features] = numeric_imputer.fit_transform(data[numeric_features])

In [67]:
# Selecting the wanted features from the task (squared footage,number of rooms and bathrooms) and target (price)
features = ['GrLivArea', 'BedroomAbvGr', 'FullBath', 'HalfBath']
target = 'SalePrice'

# Dropping rows with missing values for simplicity
df_selected = data[features + [target]]
# Replace 'SalePrice' with your target column
X = df_selected[features]
y = df_selected[target]

# Split into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData preprocessing complete.")
print("Training features shape:", X_train.shape)
print("Test features shape:", X_test.shape)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Data preprocessing complete.
Training features shape: (1168, 4)
Test features shape: (292, 4)


In [68]:
df_selected.head()

Unnamed: 0,GrLivArea,BedroomAbvGr,FullBath,HalfBath,SalePrice
0,1710.0,3.0,2.0,1.0,208500.0
1,1262.0,3.0,2.0,0.0,181500.0
2,1786.0,3.0,2.0,1.0,223500.0
3,1717.0,3.0,1.0,0.0,140000.0
4,2198.0,4.0,2.0,1.0,250000.0


In [69]:
# Create a pipeline with PolynomialFeatures and Linear Regression
linear_pipeline = Pipeline([
    ("scaler", StandardScaler()),  # Feature scaling
    ("poly_features", PolynomialFeatures(degree=2, include_bias=False)),  # Polynomial features
    ("linear_reg", LinearRegression())  # Linear Regression model
])

param_grid = {
    "poly_features__degree": [1, 2] 
}

# Perform GridSearchCV with reduced grid space
grid_search = GridSearchCV(
    estimator=linear_pipeline,
    param_grid=param_grid,
    cv=5, 
    scoring="r2",  
    n_jobs=-1 
)

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train)


best_degree = grid_search.best_params_['poly_features__degree']
print(f"Best polynomial degree: {best_degree}")
best_model = grid_search.best_estimator_


Best polynomial degree: 2


In [71]:
# Evaluate the best model on the test set
y_pred_test = best_model.predict(X_test)

# Evaluation metrics
mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_test)
print("Model Evaluation with Linear Regression:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared Score (R² / Accuracy): {r2:.2f}")

# Step 7: Perform cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring="r2")
print("\nCross-Validation Results with Linear Regression:")
print(f"Cross-Validation R² Scores: {cv_scores}")
print(f"Mean Cross-Validation R²: {cv_scores.mean():.2f}")
print(f"Standard Deviation of Cross-Validation R²: {cv_scores.std():.2f}")

Model Evaluation with Linear Regression:
Mean Absolute Error (MAE): 32630.35
Mean Squared Error (MSE): 2376976192.17
Root Mean Squared Error (RMSE): 48754.24
R-squared Score (R² / Accuracy): 0.69

Cross-Validation Results with Linear Regression:
Cross-Validation R² Scores: [0.67496561 0.35399526 0.65182883 0.52018076 0.72093074]
Mean Cross-Validation R²: 0.58
Standard Deviation of Cross-Validation R²: 0.13


In [72]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [73]:
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [74]:
linear_pipeline = Pipeline([
    ("scaler", StandardScaler()), 
    ("poly_features", PolynomialFeatures(degree=2, include_bias=False)), 
    ("linear_reg", LinearRegression())  
])

param_grid = {
    "poly_features__degree": [1, 2]  
}

# Perform GridSearchCV with reduced grid space
grid_search = GridSearchCV(
    estimator=linear_pipeline,
    param_grid=param_grid,
    cv=5, 
    scoring="r2", 
    n_jobs=-1  
)

# Fit the model using GridSearchCV
grid_search.fit(X_train_scaled, y_train_log)
best_degree = grid_search.best_params_['poly_features__degree']
print(f"Best polynomial degree: {best_degree}")
best_model = grid_search.best_estimator_


Best polynomial degree: 2


In [75]:
# Evaluate the best model on the test set
y_pred_test = best_model.predict(X_test_scaled)

# Evaluation metrics
mae = mean_absolute_error(y_test_log, y_pred_test)
mse = mean_squared_error(y_test_log, y_pred_test)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_log, y_pred_test)
print("Model Evaluation with Linear Regression:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared Score (R² / Accuracy): {r2:.2f}")

# Perform cross-validation
cv_scores = cross_val_score(best_model, X_train_scaled, y_train_log, cv=5, scoring="r2")
print("\nCross-Validation Results with Linear Regression:")
print(f"Cross-Validation R² Scores: {cv_scores}")
print(f"Mean Cross-Validation R²: {cv_scores.mean():.2f}")
print(f"Standard Deviation of Cross-Validation R²: {cv_scores.std():.2f}")

Model Evaluation with Linear Regression:
Mean Absolute Error (MAE): 0.19
Mean Squared Error (MSE): 0.06
Root Mean Squared Error (RMSE): 0.25
R-squared Score (R² / Accuracy): 0.67

Cross-Validation Results with Linear Regression:
Cross-Validation R² Scores: [0.66406733 0.51464834 0.62947166 0.50925315 0.6798895 ]
Mean Cross-Validation R²: 0.60
Standard Deviation of Cross-Validation R²: 0.07


Here's how to compare the two results:

### Model 1(without scaling and log):
- **MAE**: 32,630.35
- **MSE**: 2,376,976,192.17
- **RMSE**: 48,754.24
- **R² Score**: 0.69
- **Cross-Validation R² Mean**: 0.58 (with a standard deviation of 0.13)

### Model 2(with scaling and log):
- **MAE**: 0.19
- **MSE**: 0.06
- **RMSE**: 0.25
- **R² Score**: 0.67
- **Cross-Validation R² Mean**: 0.60 (with a standard deviation of 0.07)

---

### Analysis:

1. **MAE (Mean Absolute Error)**:
   - **Model 1** has a much higher MAE (32,630.35) compared to **Model 2** (0.19), meaning that Model 2's predictions are closer to the actual house prices.

2. **MSE (Mean Squared Error)** and **RMSE**:
   - Both metrics are much higher in **Model 1** (MSE: 2.37 billion, RMSE: 48,754.24) compared to **Model 2** (MSE: 0.06, RMSE: 0.25), indicating that Model 1's predictions have more significant errors.

3. **R² Score (Accuracy)**:
   - **Model 1** has a higher R² score (0.69), indicating that it explains more variance in the target variable (house prices) compared to **Model 2** (0.67).
   - However, **Model 2** has more consistent performance as shown by the lower standard deviation in its cross-validation results (0.07 vs. 0.13 in Model 1).

4. **Cross-Validation Results**:
   - The mean R² for **Model 1** is 0.58, which is lower than that of **Model 2** (0.60). The lower standard deviation in **Model 2** suggests more stable performance across different folds.

---

### Conclusion:
- **Model 2** seems to perform better in terms of accuracy (lower MAE, MSE, and RMSE), with more stable and consistent results across different folds, even though its R² score is slightly lower than **Model 1**.
- **Model 1** might have a higher R², but its evaluation metrics (MAE, MSE, RMSE) indicate that the predictions are less accurate.

Therefore, **Model 2** would be the better model to use in this case, given the better overall performance and stability in cross-validation.