# Task 1: Predictive Modeling (Regression)

### Description: Build and evaluate a regression model to predict a continuous variable (e.g., house prices).

- Split the dataset into training and testing sets.
- Train a linear regression model using scikit-learn.
- Evaluate the model using performance metrics like mean squared error (MSE) and R-squared.
- Experiment with multiple models (e.g., Decision Trees, Random Forest) and compare performance.

In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "1"

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Read the housing data from the project-relative data folder.
house_df = pd.read_csv(filepath_or_buffer="data/HousingData.csv")

# Preview the first five rows to sanity-check the columns.
house_df.head(5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Count the missing entries in each column.
house_df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         5
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

In [5]:
# Impute the missing values, then separate the predictors from the target.
#
# Forward-fill copies the previous row's `rm` value into each gap. The extra
# back-fill covers a gap in the very first row, which ffill alone would leave
# as NaN.
# NOTE(review): forward-fill depends on row order; consider imputing with a
# statistic learned from the training split instead — confirm with the author.
house_df['rm'] = house_df["rm"].ffill().bfill()

# Every column except `medv` is a predictor; `medv` is the regression target.
X = house_df.drop(['medv'], axis=1)
y = house_df["medv"]

# Fail here with a clear message rather than later inside an estimator's fit().
assert not X.isna().any().any(), "features still contain missing values after imputation"

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Fit an ordinary linear regression on the standardized training features.
lr_model = LinearRegression()
lr_model.fit(X=X_train_scaled, y=y_train)

In [9]:
# Predict the target for the held-out (scaled) test features.
y_pred_lr = lr_model.predict(X=X_test_scaled)

In [10]:
# Score the linear model on the test split: MAE, MSE and R-squared, in that order.
mae_lr, mse_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report each score rounded to two decimals, one per line.
print(
    f'The mean absolute error is: {mae_lr:.2f}',
    f'The mean square error is: {mse_lr:.2f}',
    f'The R-squared score is: {r2_lr:.2f}',
    sep="\n",
)

The mean absolute error is: 3.20
The mean square error is: 24.36
The R-squared score is: 0.67


In [11]:
# Bundle the linear-regression scores (MAE, MSE, R-squared) for the comparison table.
lr_metrics = [
    mae_lr,
    mse_lr,
    r2_lr,
]

# Experimenting with multiple models

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [13]:
# ---------------- Decision Tree Regressor ----------------
# Fit a decision tree with a fixed seed on the scaled training data.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predict on the held-out test features.
y_pred_dt = dt_model.predict(X_test_scaled)

# Compute MAE, MSE and R-squared (in that order) and keep them together for
# the comparison table.
dt_metrics = [
    mean_absolute_error(y_test, y_pred_dt),
    mean_squared_error(y_test, y_pred_dt),
    r2_score(y_test, y_pred_dt),
]
mae_dt, mse_dt, r2_dt = dt_metrics

# Report the scores rounded to two decimals under a banner line.
print(
    "======== Decision Tree Regressor ========",
    f'The mean absolute error is: {mae_dt:.2f}',
    f'The mean square error is: {mse_dt:.2f}',
    f'The R-squared score is: {r2_dt:.2f}',
    sep="\n",
)

The mean absolute error is: 2.62
The mean square error is: 11.94
The R-squared score is: 0.84


In [14]:
# ---------------- Random Forest Regressor ----------------
# Fit a random forest with a fixed seed on the scaled training data.
rf_model = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predict on the held-out test features.
y_pred_rf = rf_model.predict(X_test_scaled)

# Compute MAE, MSE and R-squared (in that order) and keep them together for
# the comparison table.
rf_metrics = [
    mean_absolute_error(y_test, y_pred_rf),
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
]
mae_rf, mse_rf, r2_rf = rf_metrics

# Report the scores rounded to two decimals under a banner line.
print(
    "======== Random Forest Regressor ========",
    f'The mean absolute error is: {mae_rf:.2f}',
    f'The mean square error is: {mse_rf:.2f}',
    f'The R-squared score is: {r2_rf:.2f}',
    sep="\n",
)

The mean absolute error is: 2.06
The mean square error is: 7.98
The R-squared score is: 0.89


# Comparing the models

In [15]:
# One column per model; each list holds that model's scores in the row order below.
metrics_dict = {
    'Linear Regression': lr_metrics,
    'Decision Tree': dt_metrics,
    'Random Forest': rf_metrics,
}

# Row labels, in the same order as the entries of each metrics list.
metric_labels = ["Mean Absolute Error", "Mean Square error", "R-Squared"]

# Display the side-by-side comparison table.
pd.DataFrame(data=metrics_dict, index=metric_labels)

Unnamed: 0,Linear Regression,Decision Tree,Random Forest
Mean Absolute Error,3.203097,2.62451,2.063716
Mean Square error,24.362874,11.943235,7.975973
R-Squared,0.667781,0.837139,0.891237
