House Price prediction

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib 

# Lets load the dataset
df = pd.read_csv(r'C:\Users\hp\Desktop\Locked\02. Data Science\07. Data Science Projects\ML Projects\Linear Regression Hausing Price 1\dataset\data.csv')  # Replace with your file path
print(df.head())

                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2014-05-02 00:00:00   313000.0       3.0       1.50         1340      7912   
1  2014-05-02 00:00:00  2384000.0       5.0       2.50         3650      9050   
2  2014-05-02 00:00:00   342000.0       3.0       2.00         1930     11947   
3  2014-05-02 00:00:00   420000.0       3.0       2.25         2000      8030   
4  2014-05-02 00:00:00   550000.0       4.0       2.50         1940     10500   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     1.5           0     0          3        1340              0      1955   
1     2.0           0     4          5        3370            280      1921   
2     1.0           0     0          4        1930              0      1966   
3     1.0           0     0          4        1000           1000      1963   
4     1.0           0     0          4        1140            800      1976   

   yr_renovated                    str

In [2]:
df = df.drop(columns=['date', 'sqft_lot', 'yr_built', 'waterfront', 'view', 'condition', 'statezip', 'country', 'yr_renovated', 'street'])

In [3]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,sqft_above,sqft_basement,city
0,313000.0,3.0,1.5,1340,1.5,1340,0,Shoreline
1,2384000.0,5.0,2.5,3650,2.0,3370,280,Seattle
2,342000.0,3.0,2.0,1930,1.0,1930,0,Kent
3,420000.0,3.0,2.25,2000,1.0,1000,1000,Bellevue
4,550000.0,4.0,2.5,1940,1.0,1140,800,Redmond


Data Preprocessing

In [4]:
# Separate features and target
X = df.drop('price', axis=1)
y = df['price']

# We can Define categorical and numerical features
categorical_features = ['city']
numerical_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_above', 'sqft_basement', 'floors']

# Lets Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Now i split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# We now create pipeline with preprocessing and model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

#Let Train the model
model.fit(X_train, y_train)

#And Evaluate
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training R²: {train_score:.3f}")
print(f"Testing R²: {test_score:.3f}")

Training R²: 0.623
Testing R²: 0.039


In [6]:
# Now lets Save the entire pipeline (including preprocessing)
joblib.dump(model, 'house_price_predictor.pkl')

#i start with saving just the model coefficients for inspection:
coefficients = pd.DataFrame({
    'feature': numerical_features + 
               list(model.named_steps['preprocessor']
                   .named_transformers_['cat']
                   .get_feature_names_out(categorical_features)),
    'coefficient': model.named_steps['regressor'].coef_
})
coefficients.to_csv('model_coefficients.csv', index=False)

In [7]:
#Now if i am to use the model later
import joblib
import pandas as pd

In [8]:
# we Load the saved model First
model = joblib.load('house_price_predictor.pkl')

In [9]:
def predict_price(input_data):
    """Predict house price from input dictionary"""
    # Convert to DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Make prediction
    price = model.predict(input_df)[0]
    return price

In [10]:
# Lets try
if __name__ == "__main__":
    # lets try some inputs
    new_house = {
        'bedrooms': 3,
        'bathrooms': 2,
        'sqft_living': 1500,
        'sqft_basement': 500,
        'sqft_above': 1,  # 1 for yes, 0 for no
        'city': 'New York',
        'floors' : 2
    }
    
    predicted_price = predict_price(new_house)
    print(f"Predicted Price: ${predicted_price:,.2f}")

Predicted Price: $201,925.95


In [11]:
# Lets try again
if __name__ == "__main__":
    # lets try some inputs
    new_house = {
        'bedrooms': 3,
        'bathrooms': 1.5,
        'sqft_living': 1500,
        'sqft_basement': 800,
        'sqft_above': 1,  # 1 for yes, 0 for no
        'city': 'New York',
        'floors' : 2
    }
    
    predicted_price = predict_price(new_house)
    print(f"Predicted Price: ${predicted_price:,.2f}")

Predicted Price: $232,860.97


In [19]:
#now creating a way that the model will prompt to the user to input requied details to get predicted price
while input("Predict (y/n)") == 'y':
    print (f"Price: $ {model.predict(pd.DataFrame([{'bedrooms' : int(input('Bedrooms: ')),
                                                     'bathrooms' : float(input('Bathrooms: ')), 
                                                     'sqft_living' : float(input('Sqft_living: ')), 
                                                     'sqft_basement' : float(input('Sqft_basement: ')), 
                                                     'sqft_above' : float(input('sqft_above: ')), 
                                                     'city' : str(input('City: ')), 
                                                     'floors' : str(input('Floors: '))}]))[0]:,.2f}")

Price: $ 147,263.01
