# Import_Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Loading The Data:

In [2]:
df = pd.read_csv(r"C:\Users\Hp\Desktop\Data_Science_Jupyter\Project_1_digi_crome_capstone\Data\features_Property_data.csv")

In [3]:
df.head(5)

Unnamed: 0,PropertyID,PropertyClass,PropertyFrontage,PropertySize,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExteriorCladdingArea,BsmntFinSty1,...,SaleCondn_Alloca,SaleCondn_Family,SaleCondn_Normal,SaleCondn_Partial,AgeAtSale,YearsSinceRemodel,TotalSqFootage,TotalBathrooms,QualityScore,GrLivArea_OverallQual
0,1,60,65,8450,7,5,2003,2003,196,706,...,0,0,1,0,5,5,1860,3.5,35,11970
1,2,20,80,9600,6,8,1976,1976,0,978,...,0,0,1,0,31,31,1546,2.5,48,7572
2,3,60,68,11250,7,5,2001,2002,162,486,...,0,0,1,0,7,6,2220,3.5,35,12502
3,4,70,60,9550,7,5,1915,1970,0,216,...,0,0,0,0,91,36,2257,2.0,35,12019
4,5,60,84,14260,8,5,2000,2000,350,655,...,0,0,1,0,8,8,2688,3.5,40,17584


In [4]:
df.shape

(1430, 268)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1430 entries, 0 to 1429
Columns: 268 entries, PropertyID to GrLivArea_OverallQual
dtypes: float64(1), int64(267)
memory usage: 2.9 MB


# Model Selection

In [6]:
X=df.drop('PropPrice',axis=1)
Y=df [['PropPrice']]

# Standardize the Features

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the Data

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

# Train and Evaluate Models

# Linear Regression

In [9]:
lr_model = LinearRegression()
lr_model.fit(X_train, Y_train)
lr_y_pred = lr_model.predict(X_test)

In [10]:
score = r2_score(Y_test, lr_y_pred)
print(f'Model r2 score: {score}')

Model r2 score: -4.179739572161077e+19


# Decision Tree

In [11]:
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, Y_train)
dt_y_pred = dt_model.predict(X_test)

In [12]:
score = r2_score(Y_test, dt_y_pred)
print(f'Model r2 score: {score}')

Model r2 score: 0.6572785770113834


# Random Forest

In [13]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, Y_train)
rf_y_pred = rf_model.predict(X_test)

In [14]:
score = r2_score(Y_test, rf_y_pred)
print(f'Model r2 score: {score}')

Model r2 score: 0.8721066075084646


# Gradient Boosting Regressor Model

In [15]:
gr_model = GradientBoostingRegressor()
gr_model.fit(X_train, Y_train)
gb_y_pred = gr_model.predict(X_test)

In [16]:
score = r2_score(Y_test, gb_y_pred)
print(f'Model r2 score: {score}')

Model r2 score: 0.8843370237327433


# Split the Data

In [17]:
x_train,x_test,y_train,y_test=train_test_split(X_scaled,Y,test_size=0.2,random_state=42)

# Apply PCA:

In [18]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95) 
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [19]:
x_train_pca.shape

(1144, 160)

In [20]:
x_test_pca.shape

(286, 160)

# After PCA Linear Regression

In [21]:
pca_lr_model = LinearRegression()
pca_lr_model.fit(x_train_pca, y_train)
lr_y_pred1 = pca_lr_model.predict(x_test_pca)

In [22]:
score = r2_score(y_test, lr_y_pred1)
print(f'Model r2 score: {score}')

Model r2 score: 0.8409415040067144


# After PCA Decision Tree

In [23]:
pca_dt_model = DecisionTreeRegressor()
pca_dt_model.fit(x_train_pca, y_train)
dt_y_pred2 = pca_dt_model.predict(x_test_pca)

In [24]:
score = r2_score(y_test, dt_y_pred2)
print(f'Model r2 score: {score}')

Model r2 score: 0.7671915687290087


# After PCA Random Forest

In [25]:
pca_rf_model = RandomForestRegressor()
pca_rf_model.fit(x_train_pca, y_train)
rf_y_pred3 = pca_rf_model.predict(x_test_pca)

In [26]:
score = r2_score(y_test, rf_y_pred3)
print(f'Model r2 score: {score}')

Model r2 score: 0.8657316603634387


# After PCA Gradient Boosting Regressor 

In [27]:
pca_gr_model = GradientBoostingRegressor()
pca_gr_model.fit(x_train_pca, y_train)
gb_y_pred4 = pca_gr_model.predict(x_test_pca)

In [28]:
score = r2_score(y_test, gb_y_pred4)
print(f'Model r2 score: {score}')

Model r2 score: 0.8912182275690795


# Evaluate each model using MAE and RMSE.

In [29]:
#Linear Regression Metrics
mae_linear = mean_absolute_error(y_test, lr_y_pred1)
rmse_linear = np.sqrt(mean_squared_error(y_test, lr_y_pred1))
print(f"Linear Regression - MAE: {mae_linear}, RMSE: {rmse_linear}")

Linear Regression - MAE: 17798.238572851013, RMSE: 26598.745227413445


In [30]:
#Decision Tree
mae_dt = mean_absolute_error(y_test, dt_y_pred2)
rmse_dt = np.sqrt(mean_squared_error(y_test, dt_y_pred2))
print(f"Linear Regression - MAE: {mae_dt}, RMSE: {rmse_dt}")

Linear Regression - MAE: 22248.81118881119, RMSE: 32179.705562852625


In [31]:
#Random Forest Metrics
mae_forest = mean_absolute_error(y_test, rf_y_pred3)
rmse_forest = np.sqrt(mean_squared_error(y_test, rf_y_pred3))
print(f"Random Forest - MAE: {mae_forest}, RMSE: {rmse_forest}")

Random Forest - MAE: 16355.593356643354, RMSE: 24438.217352779084


In [32]:
#Gradient Boosting Regressor Metrics
mae_gradient = mean_absolute_error(y_test, gb_y_pred4)
rmse_gradient = np.sqrt(mean_squared_error(y_test, gb_y_pred4))
print(f"Random Forest - MAE: {mae_gradient}, RMSE: {rmse_gradient}")

Random Forest - MAE: 14656.658493609875, RMSE: 21996.864481203967


# Compare and Select the Best Model

In [33]:
# Print the metrics for comparison

print("Model Evaluation Results:")
print(f"Linear Regression - MAE: {mae_linear:.4f}, RMSE: {rmse_linear:.4f}")
print(f"Decision Tree - MAE: {mae_dt:.4f}, RMSE: {rmse_dt:.4f}")
print(f"Random Forest - MAE: {mae_forest:.4f}, RMSE: {rmse_forest:.4f}")
print(f"Gradient Boosting - MAE: {mae_gradient:.4f}, RMSE: {rmse_gradient:.4f}")


# Determine the best model
models = {
    "Linear Regression": {"MAE": mae_linear, "RMSE": rmse_linear},
    "Decision Tree": {"MAE": mae_dt, "RMSE": rmse_dt},
    "Random Forest": {"MAE": mae_forest, "RMSE": rmse_forest},
    "Gradient Boosting": {"MAE": mae_gradient, "RMSE": rmse_gradient},
}

best_model = min(models, key=lambda x: models[x]["RMSE"])
print(f"The best model is: {best_model}")

Model Evaluation Results:
Linear Regression - MAE: 17798.2386, RMSE: 26598.7452
Decision Tree - MAE: 22248.8112, RMSE: 32179.7056
Random Forest - MAE: 16355.5934, RMSE: 24438.2174
Gradient Boosting - MAE: 14656.6585, RMSE: 21996.8645
The best model is: Gradient Boosting


# Conclusion

This process allows you to:

1. Load and prepare your data.
2. Split your data into training and testing sets.
3. Train different machine learning models.
4. Evaluate their performance using mean squared error (MSE) and R2 score.
5. Use cross-validation for more reliable performance estimates.
6. Visualize the results to compare actual vs. predicted property prices.

In [34]:
import pickle 
pickle.dump(gb_y_pred4,open('Gradient_Boosting.pkl','wb'))
print(type(gb_y_pred4)) 

<class 'numpy.ndarray'>
