In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
data=pd.read_csv(f'house_prices_third')
data
pd.set_option("display.max_rows", None)

In [3]:
data

Unnamed: 0,ID,Bedrooms,SqFt,Location,YearBuilt,Price
0,1,2,5036,2,1987,720537
1,2,4,4855,2,2011,934614
2,3,2,2093,0,1996,444449
3,4,1,4892,4,1985,641913
4,5,2,4881,4,2020,769609
5,6,1,2935,3,2018,476242
6,7,3,3558,4,1968,661515
7,8,5,4818,0,1993,962665
8,9,5,2306,1,1967,704313
9,10,2,4768,4,2019,779366


In [4]:
# Drop duplicates
data.drop_duplicates(inplace=True)

In [5]:
# --- Feature Engineering ---
data['House_Age'] = pd.Timestamp.now().year - data['YearBuilt']
data['Price_per_SqFt'] = data['Price'] / data['SqFt']
data['Total_rooms'] = data['Bedrooms']  # assuming no other room info given

In [6]:
# Drop columns unlikely to help or that leak info
data.drop(columns=['YearBuilt'], inplace=True)

In [7]:
# Drop rows with missing target
data.dropna(subset=['Price'], inplace=True)

In [8]:
# --- Split Features and Target ---
X = data.drop(columns=['Price'])
y = data['Price']

In [9]:
# --- Preprocessing Pipeline ---
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [10]:
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [12]:

# --- Model Pipeline ---
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [13]:
# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Training ---
model.fit(X_train, y_train)


In [14]:
# --- Evaluation ---
y_pred = model.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")

MAE: 24076.78
MSE: 899013882.36
RMSE: 29983.56
R^2 Score: 0.98


In [15]:
# --- Cross Validation ---
scores = cross_val_score(model, X, y, scoring='r2', cv=5)
print(f"Average R^2 CV Score: {np.mean(scores):.2f}")

Average R^2 CV Score: 0.98


In [16]:
# --- Save Model ---
joblib.dump(model, 'refined_price_model.pkl')

['refined_price_model.pkl']

In [17]:
# House Price Prediction API using FastAPI
!pip install fastapi uvicorn
from fastapi import FastAPI
from pydantic import BaseModel




In [18]:
# Load the saved model
model = joblib.load('refined_price_model.pkl')

In [19]:
# Define input data schema
class HouseData(BaseModel):
    SqFt: float
    Bedrooms: int
    ID: int
    Location: float  # Adjust if it's a string in your data

    # Derived fields will be calculated inside the API
    YearBuilt: int

In [20]:
# Create FastAPI app
app = FastAPI(title="House Price Predictor API")

@app.post("/predict")
def predict_price(data: HouseData):
    # Feature engineering
    current_year = pd.Timestamp.now().year
    df = pd.DataFrame([{
        "SqFt": data.SqFt,
        "Bedrooms": data.Bedrooms,
        "ID": data.ID,
        "Location": data.Location,
        "House_Age": current_year - data.YearBuilt,
        "Price_per_SqFt": 0,  # Placeholder, not used directly
        "Total_rooms": data.Bedrooms
    }])

In [21]:
import pandas as pd
import joblib

# Load model
model = joblib.load("refined_price_model.pkl")

# Create app
app = FastAPI()

# Define input structure
class HouseData(BaseModel):
    SqFt: float
    Bedrooms: int
    ID: int
    Location: float
    YearBuilt: int

# Define route
@app.post("/predict")
def predict_price(data: HouseData):
    current_year = pd.Timestamp.now().year

    df = pd.DataFrame([{
        "SqFt": data.SqFt,
        "Bedrooms": data.Bedrooms,
        "ID": data.ID,
        "Location": data.Location,
        "House_Age": current_year - data.YearBuilt,
        "Price_per_SqFt": 0,  # placeholder
        "Total_rooms": data.Bedrooms
    }])

    prediction = model.predict(df)
    return {"predicted_price": round(prediction[0], 2)}


In [22]:
# Your input values
sqft = 1800
bedrooms = 3
location = 2.5
ID = 101
year_built = 2010

# Reconstruct full input with all required features
current_year = pd.Timestamp.now().year
house_age = current_year - year_built
price_per_sqft = 0  # Placeholder, not used during prediction

# Prepare input DataFrame with all columns used during training
new_data = pd.DataFrame([{
    "SqFt": sqft,
    "Bedrooms": bedrooms,
    "ID": ID,
    "Location": location,
    "House_Age": house_age,
    "Price_per_SqFt": price_per_sqft,  # included for structure
    "Total_rooms": bedrooms
}])

# Load and predict
model = joblib.load('refined_price_model.pkl')
predicted_price = model.predict(new_data)

print(f"Predicted Price: Rs={predicted_price[0]:,.2f}")


Predicted Price: Rs=474,674.20
