##MACHINE_LEARNING_PROJECT_TASK# 01 :-"HOUSE PRICE PREDICTION"

**IMPORT THE DATASETS**

In [4]:
import pandas as pd
import numpy as np

house_data = pd.read_csv('train.csv')
house_data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


**#DATA PREPROCESSING AND CLEANING**

In [5]:
house= house_data[['size', 'beds', 'baths', 'price']]
house= house.rename(columns={'size':'SquareFootage', 'beds':'Bedrooms', 'baths':'Bathrooms', 'price':'price'})

In [6]:
house.head()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,price
0,2590.0,3,2.5,795000.0
1,2240.0,4,2.0,915000.0
2,2040.0,4,3.0,950000.0
3,3800.0,4,3.0,1950000.0
4,1042.0,2,2.0,950000.0


In [7]:
house.tail()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,price
2011,1370.0,3,2.0,910000.0
2012,889.0,1,1.0,550000.0
2013,2140.0,4,2.0,1150000.0
2014,795.0,2,2.0,590000.0
2015,1710.0,3,2.0,659000.0


In [8]:
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SquareFootage  2016 non-null   float64
 1   Bedrooms       2016 non-null   int64  
 2   Bathrooms      2016 non-null   float64
 3   price          2016 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 63.1 KB


In [9]:
house.describe()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,price
count,2016.0,2016.0,2016.0,2016.0
mean,1735.740575,2.857639,2.15997,963625.2
std,920.132591,1.255092,1.002023,944095.4
min,250.0,1.0,0.5,159000.0
25%,1068.75,2.0,1.5,601750.0
50%,1560.0,3.0,2.0,800000.0
75%,2222.5,4.0,2.5,1105250.0
max,11010.0,15.0,9.0,25000000.0


In [10]:
house.isna().sum()

SquareFootage    0
Bedrooms         0
Bathrooms        0
price            0
dtype: int64

In [11]:
house.duplicated().sum()

32

In [12]:
house.drop_duplicates(inplace=True)

In [13]:
house.duplicated().sum()

0

**EXPLORATORY ANALYSIS AND VISUALIZATION:**

**IMPORT LIBRARIES**

In [14]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [16]:
fig = px.histogram(house,
                   x='SquareFootage',
                   marginal='box',
                   nbins=47,
                   title='Distribution of SquareFootage')
fig.update_layout(bargap=0.1)
fig.show()

In [17]:
fig = px.histogram(house,
                   x='Bathrooms',
                   marginal='box',
                   nbins=47,
                   color_discrete_sequence=['green'],
                   title='Distribution of Bathrooms')
fig.update_layout(bargap=0.1)
fig.show()

In [18]:
fig = px.histogram(house,
                   x='price',
                   marginal='box',
                   nbins=47,
                   color_discrete_sequence=['black'],
                   title='Distribution of Price')
fig.update_layout(bargap=0.1)
fig.show()

In [19]:
fig = px.scatter(house,
                 x='SquareFootage',
                 y='price',
                 opacity=0.8,
                 title='SquareFootage vs. Price')
fig.update_traces(marker_size=5)
fig.show()

In [20]:
fig = px.scatter(house,
                 x='Bedrooms',
                 y='price',
                 opacity=0.8,
                 title='Bedrooms vs. Price')
fig.update_traces(marker_size=5)
fig.show()

In [22]:
house.corr()


Unnamed: 0,SquareFootage,Bedrooms,Bathrooms,price
SquareFootage,1.0,0.771003,0.666942,0.443082
Bedrooms,0.771003,1.0,0.651903,0.292381
Bathrooms,0.666942,0.651903,1.0,0.31635
price,0.443082,0.292381,0.31635,1.0


In [24]:
inputs = house[house.columns[:-1]]
target = house['price']

In [25]:
inputs.shape

(1984, 3)

In [26]:
inputs.head()

Unnamed: 0,SquareFootage,Bedrooms,Bathrooms
0,2590.0,3,2.5
1,2240.0,4,2.0
2,2040.0,4,3.0
3,3800.0,4,3.0
4,1042.0,2,2.0


In [27]:
target

0        795000.0
1        915000.0
2        950000.0
3       1950000.0
4        950000.0
          ...    
2011     910000.0
2012     550000.0
2013    1150000.0
2014     590000.0
2015     659000.0
Name: price, Length: 1984, dtype: float64

In [28]:
column =['SquareFootage', 'Bedrooms', 'Bathrooms']

**SCALING OF NUMERICAL COLUMNS**

In [29]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(inputs[column])
inputs = scaler.transform(inputs[column])

In [30]:
inputs

array([[ 0.91741049,  0.10378156,  0.33470209],
       [ 0.53788162,  0.89877236, -0.16271285],
       [ 0.32100797,  0.89877236,  0.83211702],
       ...,
       [ 0.42944479,  0.89877236, -0.16271285],
       [-1.02903046, -0.69120924, -0.16271285],
       [-0.03683354,  0.10378156, -0.16271285]])

In [31]:
inputs.shape

(1984, 3)

In [32]:
target.shape

(1984,)

**SPLIT DATASET INTO TRAIN AND TEST DATASET**

In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(inputs, target, random_state=42, test_size=0.2)
print("shape of x_train", x_train.shape)
print("shape of y_train", y_train.shape)
print("shape of x_test", x_test.shape)
print("shape of y_test", y_test.shape)


shape of x_train (1587, 3)
shape of y_train (1587,)
shape of x_test (397, 3)
shape of y_test (397,)


**TRAIN & EVALUATE THE LINEAR REGRESSION MODEL**

In [34]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)


In [35]:
# predict the value at test datasets
preds = model.predict(x_test)

**ACCURACY OF MODEL AT TEST DATASETS**

In [36]:
# check the accuracy of model at test datasets
from sklearn.metrics import mean_squared_error, r2_score
print("Model: Linear Regression")
print("Accuracy = {:0.2f}%".format(r2_score(y_test, preds)*100))
print("Mean Squared Error = {:0.2f}\n".format(mean_squared_error(y_test, preds, squared=False)))

Model: Linear Regression
Accuracy = 62.61%
Mean Squared Error = 326365.42



**TRAIN & EVALUATE THE RIDGE MODEL**

In [37]:
from sklearn.linear_model import Ridge
model2 = Ridge(alpha=0.6)
model2.fit(x_train, y_train)

In [38]:
r_preds = model2.predict(x_test)

**ACCURACY OF MODEL ON TEST DATASET**

In [39]:
print("Model: Linear Ridge")
print("Accuracy = {:0.2f}%".format(r2_score(y_test, r_preds)*100))
print("Mean Squared Error = {:0.2f}\n".format(mean_squared_error(y_test, r_preds, squared=False)))

Model: Linear Ridge
Accuracy = 62.62%
Mean Squared Error = 326324.01



**TRAIN & EVALUATE THE LASSO MODEL**

In [40]:
from sklearn.linear_model import Lasso
model2 = Lasso(alpha=0.6)
model2.fit(x_train, y_train)
l_preds = model2.predict(x_test)

**ACCURACY OF MODEL AT TEST DATASETS**

In [41]:
print("Model: Linear Regression")
print("Accuracy = {:0.2f}%".format(r2_score(y_test, l_preds)*100))
print("Mean Squared Error = {:0.2f}\n".format(mean_squared_error(y_test, l_preds, squared=False)))

Model: Linear Regression
Accuracy = 62.61%
Mean Squared Error = 326365.32



**PREDICT THE HOUSE PRICE OF HOUSE BY USING NEW INPUTS**

In [42]:
def predict_input(single_input):
    input_df = pd.DataFrame([single_input])
    input_df = scaler.transform(input_df)
    pred = model2.predict(input_df)
    return pred[0]

In [43]:
new_input = {'SquareFootage': 2540,
             'Bedrooms': 5,
             'Bathrooms': 6,
        }

In [44]:
predict_input(new_input)

1415848.8845684342

**HOME_PRICER CALCULATER**

In [45]:
columns = ['SquareFootage', 'Bedrooms', 'Bathrooms']

# Separate features and target variable
inputs = house[columns]
target = house['price']

# Scale numerical columns
scaler = StandardScaler()
scaler.fit(inputs)
inputs_scaled = scaler.transform(inputs)

# Train the Lasso model
model = Lasso(alpha=0.6)
model.fit(inputs_scaled, target)

def predict_house_price(square_footage, bedrooms, bathrooms):
    # Scale the user input data
    input_data = scaler.transform([[square_footage, bedrooms, bathrooms]])

    # Predict the house price
    predicted_price = model.predict(input_data)

    return predicted_price[0]

# Take user inputs
input_square_footage = float(input("Enter the square footage: "))
input_bedrooms = int(input("Enter the number of bedrooms: "))
input_bathrooms = float(input("Enter the number of bathrooms: "))

# Predict house price based on user inputs
predicted_price = predict_house_price(input_square_footage, input_bedrooms, input_bathrooms)
print(f"The predicted price for a house with {input_square_footage} square feet, {input_bedrooms} bedrooms, and {input_bathrooms} bathrooms is ${predicted_price:.2f}")


Enter the square footage: 4679
Enter the number of bedrooms: 12
Enter the number of bathrooms: 15
The predicted price for a house with 4679.0 square feet, 12 bedrooms, and 15.0 bathrooms is $2384652.35



X does not have valid feature names, but StandardScaler was fitted with feature names

