In [47]:
import warnings
# Suppress all warnings for the rest of the session so cell outputs stay clean.
# NOTE(review): a blanket 'ignore' filter also hides deprecation and numerical
# warnings raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [48]:
# Import libraries
import numpy as np
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [49]:
# Load the housing data from a local CSV file into a DataFrame.
# NOTE(review): data_path is an absolute path on one specific machine, so this
# cell will fail elsewhere — consider a configurable, relative data directory.
data_path = 'C:/Users/USER/Desktop/ML Materials/CognoRise Internship/Task Three/data.csv'
housing_data = pd.read_csv(data_path)

In [50]:
# Print per-column non-null counts and dtypes to check for missing values
# and to see which columns are non-numeric (object dtype).
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [51]:
# Encode categorical (object-dtype) features as integer codes, in place.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`
# so that every column's string -> code mapping stays available afterwards
# (e.g. for inverse_transform). A single shared encoder would be refit on each
# iteration and only remember the classes of the last column.
# NOTE(review): integer codes impose an arbitrary order on the categories;
# confirm this is acceptable for the models trained below.
label_encoders = {}
for col in housing_data.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    housing_data[col] = label_encoder.fit_transform(housing_data[col])
    label_encoders[col] = label_encoder

In [52]:
# Compute pairwise correlations, then keep only the columns whose absolute
# correlation with price exceeds 0.4 (price itself is kept as well).
corr_matrix = housing_data.corr()
price_corr = corr_matrix['price']
selected_columns = corr_matrix.index[price_corr.abs() > 0.4]
top_features = housing_data[selected_columns]

In [53]:
# Separate the target (price) from the predictors, then hold out 20% of the
# rows for validation using a fixed seed.
y = top_features['price']
X = top_features.drop(columns=['price'])
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [54]:
# Standardize the predictors: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(train_X)
train_X = scaler.transform(train_X)
val_X = scaler.transform(val_X)

In [55]:
# Helper that fits a model and scores it on the validation split
def evaluate_model(model, train_X, train_y, val_X, val_y):
    """Fit ``model`` on the training data and score it on the validation data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    train_X, train_y : training predictors and targets.
    val_X, val_y : validation predictors and targets.

    Returns
    -------
    dict
        Validation metrics under the keys 'MSE', 'RMSE', 'MAE' and 'R2'.
    """
    model.fit(train_X, train_y)
    val_pred = model.predict(val_X)

    # RMSE is derived from the MSE so the squared error is computed only once.
    squared_error = mean_squared_error(val_y, val_pred)
    return {
        'MSE': squared_error,
        'RMSE': np.sqrt(squared_error),
        'MAE': mean_absolute_error(val_y, val_pred),
        'R2': r2_score(val_y, val_pred),
    }

In [56]:
# Candidate regressors keyed by display name; the tree-based estimators use a
# fixed random_state so repeated runs give the same results.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=0)),
    ('Random Forest', RandomForestRegressor(random_state=0)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=0)),
])

In [57]:
# Fit every candidate model and collect its validation metrics by model name
results = {}
for name in models:
    model = models[name]
    results[name] = evaluate_model(model, train_X, train_y, val_X, val_y)


In [58]:
# Print one block of validation metrics per model
for name in results:
    metrics = results[name]
    print(f"{name}:\n MSE: {metrics['MSE']}\n RMSE: {metrics['RMSE']}\n MAE: {metrics['MAE']}\n R2: {metrics['R2']}\n")


Linear Regression:
 MSE: 79267882611.96358
 RMSE: 281545.52493684494
 MAE: 180338.81985087434
 R2: 0.46421948256417855

Decision Tree:
 MSE: 147989621646.46497
 RMSE: 384694.1923742351
 MAE: 218308.63262618094
 R2: -0.0002784664883623833

Random Forest:
 MSE: 137744085717.48837
 RMSE: 371138.90353543963
 MAE: 211702.81508957502
 R2: 0.068972261051645

Gradient Boosting:
 MSE: 110086473487.67072
 RMSE: 331792.8171128343
 MAE: 195710.68523240194
 R2: 0.25591316704343103



In [59]:
# Cross-validated Ridge regression, used below to compare regularization strengths
def ridge_regression(alpha, train_X, train_y):
    """Return the per-fold RMSE of a Ridge model under 5-fold cross-validation.

    Parameters
    ----------
    alpha : regularization strength passed to ``Ridge``.
    train_X, train_y : predictors and targets used for cross-validation.

    Returns
    -------
    Array with one RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        Ridge(alpha=alpha),
        train_X,
        train_y,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    # The scorer reports negated MSE, so flip the sign before the square root.
    return np.sqrt(-neg_mse_per_fold)
# Candidate regularization strengths, spanning several orders of magnitude.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
for i in alpha:
    # Run the cross-validation once per alpha and reuse the per-fold scores
    # for both summary statistics instead of refitting for each statistic.
    fold_rmse = ridge_regression(i, train_X, train_y)
    print('Alpha:', i)
    print('Mean RMSE:', fold_rmse.mean())
    print('Standard Deviation:', fold_rmse.std())
    print()

Alpha: 0.001
Mean RMSE: 479921.7320665695
Standard Deviation: 273456.36721808853

Alpha: 0.01
Mean RMSE: 479921.7409817472
Standard Deviation: 273456.3477805569

Alpha: 0.1
Mean RMSE: 479921.8302051473
Standard Deviation: 273456.1533863713

Alpha: 1
Mean RMSE: 479922.7295964484
Standard Deviation: 273454.20755908085

Alpha: 10
Mean RMSE: 479932.4342832229
Standard Deviation: 273434.562199554

Alpha: 100
Mean RMSE: 480095.8377860383
Standard Deviation: 273220.7883234871

Alpha: 1000
Mean RMSE: 485282.97350416874
Standard Deviation: 270256.9703606517



In [60]:
# Validation MAE of a decision tree as a function of its leaf-node budget
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its validation MAE.

    Parameters
    ----------
    max_leaf_nodes : upper bound on the number of leaves of the tree.
    train_X, train_y : training predictors and targets.
    val_X, val_y : validation predictors and targets.

    Returns
    -------
    Mean absolute error of the fitted tree on the validation split.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))
# Report the validation MAE for a range of tree sizes.
max_leaf_nodes = [5, 10, 50, 100, 200, 500, 1000]
for max_leaf_node in max_leaf_nodes:
    mae = get_mae(max_leaf_node, train_X, val_X, train_y, val_y)
    print("Max leaf nodes:", max_leaf_node, " -> MAE:", mae)


Max leaf nodes: 5  -> MAE: 185798.75332212908
Max leaf nodes: 10  -> MAE: 194525.26610323132
Max leaf nodes: 50  -> MAE: 201945.03977662878
Max leaf nodes: 100  -> MAE: 210508.67875857075
Max leaf nodes: 200  -> MAE: 215272.23075613703
Max leaf nodes: 500  -> MAE: 218279.7614966232
Max leaf nodes: 1000  -> MAE: 218308.63262618094
