In [1]:
# Import all Libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier



In [2]:
# Load the dataset from the image
df = pd.read_csv("housing_data.csv")  # Replace with the actual path to your dataset

# Display the first 5 rows
df.head()




Unnamed: 0.1,Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,1,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,2,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,3,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,4,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,5,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [3]:
# Get information about the dataset
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  546 non-null    int64  
 1   price       546 non-null    float64
 2   lotsize     546 non-null    int64  
 3   bedrooms    546 non-null    int64  
 4   bathrms     546 non-null    int64  
 5   stories     546 non-null    int64  
 6   driveway    546 non-null    object 
 7   recroom     546 non-null    object 
 8   fullbase    546 non-null    object 
 9   gashw       546 non-null    object 
 10  airco       546 non-null    object 
 11  garagepl    546 non-null    int64  
 12  prefarea    546 non-null    object 
dtypes: float64(1), int64(6), object(6)
memory usage: 55.6+ KB


In [4]:
# Check for missing values
df.isnull().sum()



Unnamed: 0    0
price         0
lotsize       0
bedrooms      0
bathrms       0
stories       0
driveway      0
recroom       0
fullbase      0
gashw         0
airco         0
garagepl      0
prefarea      0
dtype: int64

In [5]:
# Describe the numerical features
df.describe()


Unnamed: 0.1,Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,garagepl
count,546.0,546.0,546.0,546.0,546.0,546.0,546.0
mean,273.5,68121.59707,5150.265568,2.965201,1.285714,1.807692,0.692308
std,157.760895,26702.670926,2168.158725,0.737388,0.502158,0.868203,0.861307
min,1.0,25000.0,1650.0,1.0,1.0,1.0,0.0
25%,137.25,49125.0,3600.0,2.0,1.0,1.0,0.0
50%,273.5,62000.0,4600.0,3.0,1.0,2.0,0.0
75%,409.75,82000.0,6360.0,3.0,2.0,2.0,1.0
max,546.0,190000.0,16200.0,6.0,4.0,4.0,3.0


In [6]:
# Handle missing values (if any)
df.fillna(method='ffill', inplace=True)  # Replace with a suitable imputation method

# Scale numerical features
scaler = StandardScaler()
df[['lotsize', 'bedrooms', 'bathrms', 'stories', 'garagepl']] = scaler.fit_transform(df[['lotsize', 'bedrooms', 'bathrms', 'stories', 'garagepl']])

# Convert categorical features to numerical
df = pd.get_dummies(df, columns=['driveway', 'recroom', 'fullbase', 'gashw', 'airco', 'prefarea'])


In [7]:

# Split the data into features and target variable
X = df.drop('price', axis=1)
y = df['price']
X


Unnamed: 0.1,Unnamed: 0,lotsize,bedrooms,bathrms,stories,garagepl,driveway_no,driveway_yes,recroom_no,recroom_yes,fullbase_no,fullbase_yes,gashw_no,gashw_yes,airco_no,airco_yes,prefarea_no,prefarea_yes
0,1,0.323028,0.047235,-0.569495,0.221704,0.357567,0,1,1,0,0,1,1,0,1,0,1,0
1,2,-0.531013,-1.310147,-0.569495,-0.931157,-0.804525,0,1,1,0,1,0,1,0,1,0,1,0
2,3,-0.964958,0.047235,-0.569495,-0.931157,-0.804525,0,1,1,0,1,0,1,0,1,0,1,0
3,4,0.692343,0.047235,-0.569495,0.221704,-0.804525,0,1,0,1,1,0,1,0,1,0,1,0
4,5,0.558466,-1.310147,-0.569495,-0.931157,-0.804525,0,1,1,0,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,542,-0.161698,0.047235,1.423737,2.527427,-0.804525,0,1,0,1,1,0,1,0,0,1,1,0
542,543,0.392275,0.047235,1.423737,2.527427,-0.804525,0,1,1,0,1,0,1,0,0,1,1,0
543,544,0.392275,0.047235,1.423737,2.527427,0.357567,0,1,0,1,1,0,1,0,0,1,1,0
544,545,0.392275,0.047235,1.423737,0.221704,0.357567,0,1,0,1,1,0,1,0,0,1,1,0


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [9]:
# Create and train the model (you can choose either Linear Regression or Decision Tree)
model = LinearRegression()
model.fit(X_train, y_train)



In [10]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_pred

array([ 67998.44131251,  66480.2349801 , 109742.96174725,  60627.29433151,
        57477.74952884,  58282.87888584,  55680.03662918,  54461.55569461,
        99635.13530918,  49569.8666801 ,  77831.58249061,  95836.00574876,
        53736.75544474,  63721.05582511,  98881.59616739, 109333.77157238,
        78176.90750526,  39693.88134893,  89854.35290808,  55183.97293018,
        33162.3585565 ,  49740.62023562,  77598.90571944,  52670.43538504,
        74044.16102562,  38461.88150595,  86769.7874388 ,  42894.64107676,
       100010.09513023,  61136.22962331,  50707.03159438,  77127.8351528 ,
        73470.26950219,  93039.73508842,  62178.09311663,  53490.71465907,
        34722.52092001,  67297.75619261,  70929.13020401, 104284.38926331,
        43843.40171834, 111191.04383074,  40603.14131928,  54693.60918996,
        77542.80736171,  59106.8074841 ,  55774.53220653,  54715.55759523,
        56668.68397693,  98913.0207753 ,  38986.91026669,  77413.5652231 ,
        77322.73023521,  

In [11]:
# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 254583183.32697207


In [12]:
# Calculate the R² score
score = model.score(X_test, y_test)
print("R² Score:", score)


R² Score: 0.6189304598660972


In [13]:
# Calculate the accuracy score
score = model.score(X_test, y_test)
print("Accuracy Score:", score)


Accuracy Score: 0.6189304598660972
