# Importing necessary Libraries

In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Load the data

In [118]:
data = pd.read_csv('House_Data.csv')

In [119]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# Initial data exploration and cleaning steps

In [120]:
data.drop(columns=['area_type', 'availability', 'society', 'balcony'], inplace=True)
data['location'] = data['location'].fillna('Sarjapur Road')
data['size'] = data['size'].fillna('2 BHK')
data['bath'] = data['bath'].fillna(data['bath'].median())

In [121]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


# Feature engineering

In [122]:
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

# Function to convert range to average

In [123]:
def convertRange(x):
    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1])) / 2
    try:
        return float(x)
    except:
        return None


In [124]:
data['total_sqft'] = data['total_sqft'].apply(convertRange)

In [125]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


# Feature engineering for price per square feet

In [126]:
data['price_per_sqft'] = data['price'] * 100000 / data['total_sqft']

In [127]:
data['location']=data['location'].apply(lambda x:x.strip())
location_count=data['location'].value_counts()

In [128]:
location_count_less_10=location_count[location_count<=10]
location_count=data['location'].value_counts()

In [129]:
data['location']=data['location'].apply(lambda x:'other' if x in location_count_less_10 else x)

data['location'].value_counts()

location
other                 2886
Whitefield             541
Sarjapur  Road         399
Electronic City        304
Kanakpura Road         273
                      ... 
Nehru Nagar             11
Banjara Layout          11
LB Shastri Nagar        11
Pattandur Agrahara      11
Narayanapura            11
Name: count, Length: 242, dtype: int64

In [130]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


# outlier detection and removal

In [131]:
data=data[((data['total_sqft']/data['bhk'])>=300)]

In [132]:
data.sample(6)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
6142,Thanisandra,3 BHK,1732.0,3.0,85.73,3,4949.769053
2661,other,2 Bedroom,1600.0,2.0,110.0,2,6875.0
8262,Kannamangala,3 BHK,1536.0,3.0,104.0,3,6770.833333
4818,KR Puram,2 Bedroom,1000.0,2.0,60.0,2,6000.0
6118,Raja Rajeshwari Nagar,3 BHK,1550.0,3.0,52.45,3,3383.870968
10489,Sarjapur,3 Bedroom,2690.0,3.0,295.0,3,10966.542751


# Handling outliers

In [133]:
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        gen_df = subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))]
        df_output = pd.concat([df_output, gen_df], ignore_index=True)
    return df_output

data = remove_outliers_sqft(data)

In [134]:
def bhk_outlier_remover(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < (stats['mean'])].index.values)
    return df.drop(exclude_indices, axis='index')

data = bhk_outlier_remover(data)


# Preparing data for modeling

In [135]:
data=bhk_outlier_remover(data)

In [136]:
data.drop(columns=['size', 'price_per_sqft'], inplace=True)

In [137]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


# Save cleaned data to a new CSV file

In [138]:
data.to_csv("Cleaned_data.csv", index=False)

# Machine learning model preparation and evaluation

In [139]:
X = data.drop(columns=['price'])
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

column_trans = make_column_transformer((OneHotEncoder(sparse=False), ['location']), remainder='passthrough')
scaler = StandardScaler()

# Linear Regression

In [140]:
lr=LinearRegression()

pipe=make_pipeline(column_trans,scaler,lr)

pipe.fit(X_train,y_train)

y_pred_lr=pipe.predict(X_test)

r2_score(y_test,y_pred_lr)




0.8680431429980424

# Applying Lasso

In [141]:
lasso=Lasso()

pipe=make_pipeline(column_trans,scaler,lasso)

pipe.fit(X_train,y_train)

y_pred_lasso=pipe.predict(X_test)
r2_score(y_test,y_pred_lasso)



0.8564661350035035

# Applying Ridge

In [142]:
ridge=Ridge()

pipe=make_pipeline(column_trans,scaler,ridge)

pipe.fit(X_train,y_train)

y_pred_ridge=pipe.predict(X_test)

r2_score(y_test,y_pred_ridge)



0.8686940791643926

In [143]:
print("No Regularization:",r2_score(y_test,y_pred_lr))
print("Lasso :",r2_score(y_test,y_pred_lasso))
print("Ridge:",r2_score(y_test,y_pred_ridge))

No Regularization: 0.8680431429980424
Lasso : 0.8564661350035035
Ridge: 0.8686940791643926
