In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

#visualizations
sns.set(style="whitegrid")

In [3]:
# heres a dataset i have loaded
data = pd.read_csv('../data/house_prices.csv')

# this dataset
print(data.head())
print(data.info())
print(data.describe())


                                      Name  \
0                         Casagrand ECR 14   
1    Ramanathan Nagar, Pozhichalur,Chennai   
2                              DAC Prapthi   
3  Naveenilaya,Chepauk, Triplicane,Chennai   
4                 VGN Spring Field Phase 1   

                                      Property Title     Price  \
0  4 BHK Flat for sale in Kanathur Reddikuppam, C...  ₹1.99 Cr   
1  10 BHK Independent House for sale in Pozhichal...  ₹2.25 Cr   
2      3 BHK Flat for sale in West Tambaram, Chennai   ₹1.0 Cr   
3  7 BHK Independent House for sale in Triplicane...  ₹3.33 Cr   
4              2 BHK Flat for sale in Avadi, Chennai   ₹48.0 L   

                                   Location  Total_Area  Price_per_SQFT  \
0             Kanathur Reddikuppam, Chennai        2583          7700.0   
1     Ramanathan Nagar, Pozhichalur,Chennai        7000          3210.0   
2  Kasthuribai Nagar, West Tambaram,Chennai        1320          7580.0   
3   Naveenilaya,Chepauk, T

In [4]:
# cheacking Mumbai listings are in the dataset
mumbai_data = data[data['Location'].str.contains("Mumbai", case=False, na=False)]
print(mumbai_data)


                                                   Name  \
6648                             Greenfield Green Ridge   
6649                                 Colombia Apartment   
6650               Ruki Mahal,Navy Nagar, Colaba,Mumbai   
6651                              Transcon Tirumala Sky   
6652                               Shree Shakun Heights   
...                                                 ...   
8002                            Ashish Swapnalok Towers   
8003  Chakala Industrial Area (MIDC), Andheri East,M...   
8004    Dindoshi shivshahi ,Dindoshi, Malad East,Mumbai   
8005                      Vakola, Santacruz East,Mumbai   
8006                    Tembhipada, Bhandup West,Mumbai   

                                         Property Title     Price  \
6648          2 BHK Flat for sale in Virar West, Mumbai   ₹48.0 L   
6649         3 BHK Flat for sale in Bandra West, Mumbai   ₹5.7 Cr   
6650              1 BHK Flat for sale in Colaba, Mumbai  ₹2.45 Cr   
6651         3 

In [5]:
# Checking missing values
missing_data = data.isnull().sum()
print(missing_data[missing_data > 0])

Series([], dtype: int64)


In [12]:
# Fill missing values for numerical columns only
numeric_columns = data.select_dtypes(include=['number']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Drop columns with too many missing values or non-informative columns, ignoring errors if columns are not present
data.drop(columns=["Alley", "PoolQC", "Fence", "MiscFeature"], inplace=True, errors="ignore")


In [13]:
# Converting categorical variables using one-hot encoding
data = pd.get_dummies(data)
print(data.shape)

(14528, 38941)


In [14]:
#Now preparing for traning and test set
# Separating the features (X) and the target variable (y), and then spliting our data.

X = data.drop("SalePrice", axis=1) # Define the features (X)
y = data["SalePrice"] # target variable (y)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: "['SalePrice'] not found in axis"

In [15]:
print(data.columns)  # see all columns in `data`

Index(['Total_Area', 'Price_per_SQFT', 'Baths',
       'Name_    Madhurangan Apartment ,Ambegaon, Pune',
       'Name_   Manganahalli    Sriram Layout ,Ullal Uppanagar, Bangalore',
       'Name_   sona Building,Bhayandar West, Mumbai', 'Name_  Oxford Blues',
       'Name_  Sec 2 Pooja apartment Bhosari ,Indrayani Nagar Sector 2, Bhosari,Pune',
       'Name_ A Knight Ventures Sachh by A Knight Reliant India Pvt Ltd',
       'Name_ A N SWAGATH,Gubbalala, Subramanyapura,Bangalore',
       ...
       'Description_▪︎ 1 BHK house (fully furnished ready to move) for sale\n▪︎ Building no. 41, 2nd Floor\nTilak Nagar, Chembur\n▪︎ Area - 370 Sq Ft Usable Carpet\n▪︎ 24 hrs water supply \n▪︎ Nearby locality - Play ground, School, Grocery shops, Restaurant\n▪︎ Price - 1.10 Cr (Negotiable) More About This Property Best 1 BHK Apartment for modern-day lifestyle is now available for sale. No brokerage involved, Posted by Owner. Grab this 1 BHK property for sale in one of Mumbai's top location, Kurla. It

In [16]:
y = data["SalePrice"]
X = data.drop("SalePrice", axis=1)
X = pd.get_dummies(X)

KeyError: 'SalePrice'

In [17]:
y = data["SalePrice"]
X = data.drop("SalePrice", axis=1)
X = pd.get_dummies(X)


KeyError: 'SalePrice'

In [18]:
print(data.columns)


Index(['Total_Area', 'Price_per_SQFT', 'Baths',
       'Name_    Madhurangan Apartment ,Ambegaon, Pune',
       'Name_   Manganahalli    Sriram Layout ,Ullal Uppanagar, Bangalore',
       'Name_   sona Building,Bhayandar West, Mumbai', 'Name_  Oxford Blues',
       'Name_  Sec 2 Pooja apartment Bhosari ,Indrayani Nagar Sector 2, Bhosari,Pune',
       'Name_ A Knight Ventures Sachh by A Knight Reliant India Pvt Ltd',
       'Name_ A N SWAGATH,Gubbalala, Subramanyapura,Bangalore',
       ...
       'Description_▪︎ 1 BHK house (fully furnished ready to move) for sale\n▪︎ Building no. 41, 2nd Floor\nTilak Nagar, Chembur\n▪︎ Area - 370 Sq Ft Usable Carpet\n▪︎ 24 hrs water supply \n▪︎ Nearby locality - Play ground, School, Grocery shops, Restaurant\n▪︎ Price - 1.10 Cr (Negotiable) More About This Property Best 1 BHK Apartment for modern-day lifestyle is now available for sale. No brokerage involved, Posted by Owner. Grab this 1 BHK property for sale in one of Mumbai's top location, Kurla. It

In [21]:
# Handle missing values
# Filling numeric columns with the median
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

In [23]:
columns_to_drop = ["Alley", "PoolQC", "Fence", "MiscFeature"]
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])
# Drop columns with too many missing values (only if these columns exist)

In [25]:
# Separate the target variable ('SalePrice') from the features
y = data["SalePrice"]  # target variable
X = data.drop("SalePrice", axis=1)  # features by dropping 'SalePrice'

KeyError: 'SalePrice'