# House Price Project

* Author : Imran Younus
* Link : https://www.linkedin.com/in/imran-younus-031283111/

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

# Current working directory
# print(os.getcwd())

# Read the house-price CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="house price data.csv")
print(data.head(n=5))

# Basic EDA

In [None]:
# Basic EDA: dimensions, column overview and summary statistics.
print(data.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.describe())  # Statistical summary of numeric columns

In [None]:
# Count the missing entries in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Keep only the columns whose dtype is float64 or int64.
numeric_dtypes = ['float64', 'int64']
numeric_data = data.select_dtypes(include=numeric_dtypes)
print(numeric_data)

In [None]:
# Correlation analysis
# Compute the pairwise correlation matrix of the numeric columns.
correlation = numeric_data.corr()
# Heatmap of the correlations (left disabled):
# print(correlation)
#sns.heatmap(data.corr(), annot=True, cmap="coolwarm")
#plt.show()

# Take the 'price' column of the matrix, i.e. the correlation of every numeric
# column with the target, and order it from the largest value downwards.
price_column = correlation.loc[:, "price"]
price_correlation = price_column.sort_values(ascending=False)
print(price_correlation)


# Visualization

In [None]:
# Bar chart of each feature's correlation with 'price'.
# The target's own entry is dropped first so only the features are drawn.
feature_correlation = price_correlation.drop("price")
ax = feature_correlation.plot.bar(figsize=(10, 6), color="skyblue")
ax.set_title("Correlation of Features with 'price'")
ax.set_ylabel("Correlation Coefficient")
plt.show()

In [None]:
# Select the features that correlate most strongly with 'price'
# (the target itself is listed first).
important_features = [
    'price',
    'sqft_living',
    'sqft_above',
    'bathrooms',
    'bedrooms',
]

# Restrict the dataset to the selected columns only.
data_selected = data.loc[:, important_features]

# Preview the result.
print(data_selected.head())

# Scatter plot before removing outliers

In [None]:
# One small scatter plot per selected feature against 'price', drawn from the
# raw (not yet cleaned) data. Each entry pairs a column with its plot title.
raw_scatter_specs = (
    ('sqft_living', 'Price vs Square Foot Living Area'),
    ('sqft_above', 'Price vs  Area Square Foot Above'),
    ('bathrooms', 'Price vs Bathrooms'),
    ('bedrooms', 'Price vs bedrooms'),
)

for feature, heading in raw_scatter_specs:
    plt.figure(figsize=(4, 3))
    sns.scatterplot(data=data, x=feature, y='price', label='Unclean data')
    plt.title(heading)
    plt.show()

# Filter out the outliers

In [31]:
# Columns that are screened for outliers.
outlier_columns = ['price', 'sqft_living', 'sqft_above', 'bathrooms', 'bedrooms']
screened = data[outlier_columns]

# Per-column first quartile (25th percentile), third quartile (75th
# percentile) and the interquartile range between them.
Q1 = screened.quantile(0.25)
Q3 = screened.quantile(0.75)
IQR = Q3 - Q1

# Acceptance limits: 1.5 * IQR below Q1 and above Q3.
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# A row is an outlier when any screened column falls outside its limits;
# every other row is kept, with all of its original columns.
is_outlier = ((screened < lower_limit) | (screened > upper_limit)).any(axis=1)
data_no_outliers = data[~is_outlier]

# Compare the row/column counts before and after filtering.
print(f"Original data shape: {data.shape}")
print(f"Cleaned data shape: {data_no_outliers.shape}")

Original data shape: (4600, 18)
Cleaned data shape: (4143, 18)


# Scatter plot after removing outliers

In [None]:
# One small scatter plot per selected feature against 'price', drawn from the
# outlier-free data. Each entry pairs a column with its plot title; the column
# name is also used as the x-axis label.
cleaned_scatter_specs = (
    ('sqft_living', 'After Removing Outliers, Square Foot Living Area'),
    ('sqft_above', 'After Removing Outliers, AreaSquare Foot Above'),
    ('bathrooms', 'After Removing Outliers, Bathrooms'),
    ('bedrooms', 'After Removing Outliers, Bedrooms'),
)

for feature, heading in cleaned_scatter_specs:
    plt.figure(figsize=(4, 3))
    sns.scatterplot(
        x=data_no_outliers[feature],
        y=data_no_outliers['price'],
        color='Teal',
        label='Cleaned Data',
    )
    plt.title(heading)
    plt.xlabel(feature)
    plt.ylabel('Price')
    plt.show()

# Evaluate model performance on clean data

In [None]:
# Feature matrix and target taken from the outlier-free data.
feature_columns = ['sqft_living', 'sqft_above', 'bathrooms', 'bedrooms']
X_clean = data_no_outliers[feature_columns]
y_clean = data_no_outliers['price']

# Evaluate: hold out 20% of the rows so the fit can be scored on data it has
# not seen. random_state is fixed so the split and the score are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)
holdout_model = LinearRegression().fit(X_train, y_train)
holdout_r2 = holdout_model.score(X_test, y_test)
print(f"R^2 on held-out test data: {holdout_r2:.3f}")

# Initializing the linear regression model
model = LinearRegression()

# Train the final model on all cleaned rows; this is the model used for
# predictions, so it is fitted exactly as before the evaluation was added.
model.fit(X_clean, y_clean)

# Get user input for each feature

In [None]:

# Read one value per model feature from the user; each is parsed as a float.
sqft_living = float(input("Enter square footage of living area: "))
sqft_above = float(input("Enter square footage of above ground area: "))
bathrooms = float(input("Enter number of bathrooms: "))
bedrooms = float(input("Enter number of bedrooms: "))

# Combine the inputs into a one-row DataFrame whose columns carry the same
# names, in the same order, as the training features. The model was fitted on
# a DataFrame, so passing a bare NumPy array here makes scikit-learn warn that
# "X does not have valid feature names".
user_input = pd.DataFrame(
    [[sqft_living, sqft_above, bathrooms, bedrooms]],
    columns=['sqft_living', 'sqft_above', 'bathrooms', 'bedrooms'],
)

# Predict the price based on user input
predicted_price = model.predict(user_input)

# Display the predicted price
print(f"The predicted price for the given inputs is: RS={predicted_price[0]:,.2f}")