In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Lift the column cap so wide frames are displayed with every column visible.
pd.options.display.max_columns = None

In [3]:
# Load the property data from CSV. Judging by the file name this is the output
# of a missing-value imputation step — TODO confirm provenance.
df = pd.read_csv('mandar_gurgaon_properties_missing_value_imputation.csv')

In [4]:
# Sanity check: (rows, columns) of the frame that was just read.
df.shape

(3554, 18)

In [5]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,property_type,society,sector,price,price_per_sqft,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score
0,flat,umang monsoon breeze,sector 78,0.75,6053.0,2,2,1,5.0,Moderately Old,1239.0,0,0,0,0,0,0,0
1,flat,ireo skyon,sector 60,2.15,23344.0,2,3,2,5.0,Relatively New,1250.0,0,1,0,0,0,1,49
2,flat,dlf regal gardens,sector 90,1.23,7053.0,3,3,2,19.0,Relatively New,1578.0,0,0,0,0,1,0,6
3,house,independent,sector 2,5.0,15385.0,6,6,2,1.0,Moderately Old,3611.0,0,0,0,0,0,0,0
4,flat,dlf the arbour,sector 63,7.52,18999.0,4,4,3,15.0,New Property,3956.0,0,0,0,0,0,0,61


In [6]:
# Build the working frame without the 'society' and 'price_per_sqft' columns.
train_df = df.drop(['society', 'price_per_sqft'], axis=1)

In [7]:
# Confirm the two columns are gone from the working frame.
train_df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score
0,flat,sector 78,0.75,2,2,1,5.0,Moderately Old,1239.0,0,0,0,0,0,0,0
1,flat,sector 60,2.15,2,3,2,5.0,Relatively New,1250.0,0,1,0,0,0,1,49
2,flat,sector 90,1.23,3,3,2,19.0,Relatively New,1578.0,0,0,0,0,1,0,6
3,house,sector 2,5.0,6,6,2,1.0,Moderately Old,3611.0,0,0,0,0,0,0,0
4,flat,sector 63,7.52,4,4,3,15.0,New Property,3956.0,0,0,0,0,0,0,61


In [8]:
# Split the working frame: 'price' becomes the target Series, every other
# column stays in the feature frame.
y_label = train_df['price']
X_label = train_df.drop(columns='price')

In [9]:
# Display the feature frame (all columns except 'price').
X_label

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score
0,flat,sector 78,2,2,1,5.0,Moderately Old,1239.0,0,0,0,0,0,0,0
1,flat,sector 60,2,3,2,5.0,Relatively New,1250.0,0,1,0,0,0,1,49
2,flat,sector 90,3,3,2,19.0,Relatively New,1578.0,0,0,0,0,1,0,6
3,house,sector 2,6,6,2,1.0,Moderately Old,3611.0,0,0,0,0,0,0,0
4,flat,sector 63,4,4,3,15.0,New Property,3956.0,0,0,0,0,0,0,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 113,5,4,3+,0.0,New Property,2956.0,0,0,0,1,0,2,44
3550,flat,dwarka expressway,3,3,2,3.0,Under Construction,1267.0,0,0,0,0,0,0,0
3551,flat,sector 50,3,4,3+,6.0,Moderately Old,2470.0,0,1,0,0,0,1,174
3552,flat,sector 86,3,3,3,12.0,Moderately Old,1747.0,0,1,0,1,0,0,106


In [10]:
# Display the 'price' Series that was split off from the features.
y_label

0       0.75
1       2.15
2       1.23
3       5.00
4       7.52
        ... 
3549    3.72
3550    1.30
3551    2.65
3552    1.30
3553    1.55
Name: price, Length: 3554, dtype: float64

In [11]:
# Frame to export: the features minus three room flags, with 'price' appended
# back as the last column.
export_df = (
    X_label
    .drop(columns=['pooja room', 'study room', 'others'])
    .assign(price=y_label)
)

In [17]:
# Display the export frame after dropping the room flags and re-attaching 'price'.
export_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_score,price
0,flat,sector 78,2,2,1,5.0,Moderately Old,1239.0,0,0,0,0,0.75
1,flat,sector 60,2,3,2,5.0,Relatively New,1250.0,1,0,1,49,2.15
2,flat,sector 90,3,3,2,19.0,Relatively New,1578.0,0,0,0,6,1.23
3,house,sector 2,6,6,2,1.0,Moderately Old,3611.0,0,0,0,0,5.00
4,flat,sector 63,4,4,3,15.0,New Property,3956.0,0,0,0,61,7.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 113,5,4,3+,0.0,New Property,2956.0,0,0,2,44,3.72
3550,flat,dwarka expressway,3,3,2,3.0,Under Construction,1267.0,0,0,0,0,1.30
3551,flat,sector 50,3,4,3+,6.0,Moderately Old,2470.0,1,0,1,174,2.65
3552,flat,sector 86,3,3,3,12.0,Moderately Old,1747.0,1,0,0,106,1.30


In [18]:
# Bucket the numeric luxury score into three labels: Low, Medium, High.

def categorize_luxury(score):
    """Return the luxury label for a numeric score.

    Bins:
        [0, 50)    -> "Low"
        [50, 150)  -> "Medium"
        [150, 175] -> "High"

    Any score outside [0, 175] — including NaN, for which every comparison is
    False — yields None.
    """
    # Guard clause: reject everything outside the supported range up front.
    if not (0 <= score <= 175):
        return None
    if score < 50:
        return "Low"
    if score < 150:
        return "Medium"
    return "High"

In [19]:
# Label every row with its luxury bucket, computed element-wise from 'luxury_score'.
export_df['luxury_category'] = export_df['luxury_score'].map(categorize_luxury)

In [21]:
# Converting the floor number into categories: Low Floor, Mid Floor, High Floor.

def categorize_floor(floor):
    """Return the floor-height label for a numeric floor number.

    Bins are contiguous and half-open so that non-integer floats cannot fall
    between them (the floor column is stored as float):
        [0, 3)   -> "Low Floor"
        [3, 11)  -> "Mid Floor"
        [11, 51] -> "High Floor"

    For whole-number floors this matches the original 0-2 / 3-10 / 11-51
    grouping exactly. Values outside [0, 51] — including negatives and NaN,
    for which every comparison is False — yield None.
    """
    if 0 <= floor < 3:
        return "Low Floor"
    elif 3 <= floor < 11:
        return "Mid Floor"
    elif 11 <= floor <= 51:
        return "High Floor"
    else:
        return None  # or "Undefined" or any other label for floors outside the defined bins

In [22]:
# Label every row with its floor bucket, computed element-wise from 'floorNum'.
export_df['floor_category'] = export_df['floorNum'].map(categorize_floor)

In [23]:
# Display the export frame with the two new categorical columns appended.
export_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_score,price,luxury_category,floor_category
0,flat,sector 78,2,2,1,5.0,Moderately Old,1239.0,0,0,0,0,0.75,Low,Mid Floor
1,flat,sector 60,2,3,2,5.0,Relatively New,1250.0,1,0,1,49,2.15,Low,Mid Floor
2,flat,sector 90,3,3,2,19.0,Relatively New,1578.0,0,0,0,6,1.23,Low,High Floor
3,house,sector 2,6,6,2,1.0,Moderately Old,3611.0,0,0,0,0,5.00,Low,Low Floor
4,flat,sector 63,4,4,3,15.0,New Property,3956.0,0,0,0,61,7.52,Medium,High Floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 113,5,4,3+,0.0,New Property,2956.0,0,0,2,44,3.72,Low,Low Floor
3550,flat,dwarka expressway,3,3,2,3.0,Under Construction,1267.0,0,0,0,0,1.30,Low,Mid Floor
3551,flat,sector 50,3,4,3+,6.0,Moderately Old,2470.0,1,0,1,174,2.65,High,Mid Floor
3552,flat,sector 86,3,3,3,12.0,Moderately Old,1747.0,1,0,0,106,1.30,Medium,High Floor


In [24]:
# Remove the numeric 'floorNum' and 'luxury_score' columns from the export frame.
export_df = export_df.drop(columns=['floorNum', 'luxury_score'])

In [25]:
# Display the final export frame after removing 'floorNum' and 'luxury_score'.
export_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,price,luxury_category,floor_category
0,flat,sector 78,2,2,1,Moderately Old,1239.0,0,0,0,0.75,Low,Mid Floor
1,flat,sector 60,2,3,2,Relatively New,1250.0,1,0,1,2.15,Low,Mid Floor
2,flat,sector 90,3,3,2,Relatively New,1578.0,0,0,0,1.23,Low,High Floor
3,house,sector 2,6,6,2,Moderately Old,3611.0,0,0,0,5.00,Low,Low Floor
4,flat,sector 63,4,4,3,New Property,3956.0,0,0,0,7.52,Medium,High Floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 113,5,4,3+,New Property,2956.0,0,0,2,3.72,Low,Low Floor
3550,flat,dwarka expressway,3,3,2,Under Construction,1267.0,0,0,0,1.30,Low,Mid Floor
3551,flat,sector 50,3,4,3+,Moderately Old,2470.0,1,0,1,2.65,High,Mid Floor
3552,flat,sector 86,3,3,3,Moderately Old,1747.0,1,0,0,1.30,Medium,High Floor


In [26]:
# Write the export frame to CSV; index=False keeps the row index out of the file.
export_df.to_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv', index=False)

In [27]:
# df1: read back the CSV that this notebook just wrote.
df1 = pd.read_csv('mandar_gurgaon_properties_post_feature_selection_v2.csv')
# df2: a second CSV with a similar name — presumably a reference version of the
# same export to compare against; TODO confirm where it comes from.
df2 = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [28]:
# Count how many rows of df1 also occur in df2.
#
# pd.merge without `on=` joins on every column name the two frames share, so a
# match means the rows agree on all of those columns.
#
# An inner merge is many-to-many: a row present a times in df1 and b times in
# df2 would add a*b rows to the result and inflate the count. De-duplicating
# df2 first limits every df1 row to at most one match, so the count can never
# exceed len(df1). This assumes df2 has no columns beyond the shared ones;
# otherwise pass the shared column names as subset= to drop_duplicates.
common_rows = pd.merge(df1, df2.drop_duplicates(), how='inner')

# Each remaining row is one df1 row that has an identical counterpart in df2.
number_of_common_rows = len(common_rows)

print("Number of identical rows in both dataframes:", number_of_common_rows)


Number of identical rows in both dataframes: 3454


In [29]:
# Display df1 (the export read back from disk) for visual comparison with df2.
df1

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,price,luxury_category,floor_category
0,flat,sector 78,2,2,1,Moderately Old,1239.0,0,0,0,0.75,Low,Mid Floor
1,flat,sector 60,2,3,2,Relatively New,1250.0,1,0,1,2.15,Low,Mid Floor
2,flat,sector 90,3,3,2,Relatively New,1578.0,0,0,0,1.23,Low,High Floor
3,house,sector 2,6,6,2,Moderately Old,3611.0,0,0,0,5.00,Low,Low Floor
4,flat,sector 63,4,4,3,New Property,3956.0,0,0,0,7.52,Medium,High Floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 113,5,4,3+,New Property,2956.0,0,0,2,3.72,Low,Low Floor
3550,flat,dwarka expressway,3,3,2,Under Construction,1267.0,0,0,0,1.30,Low,Mid Floor
3551,flat,sector 50,3,4,3+,Moderately Old,2470.0,1,0,1,2.65,High,Mid Floor
3552,flat,sector 86,3,3,3,Moderately Old,1747.0,1,0,0,1.30,Medium,High Floor


In [30]:
# Display df2 (the second CSV) for visual comparison with df1.
df2

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.60,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,0.37,2.0,2.0,1,Relatively New,532.0,0.0,0.0,0.0,Medium,Mid Floor
3550,house,sector 109,6.00,5.0,5.0,3+,Relatively New,6228.0,1.0,1.0,0.0,High,Low Floor
3551,flat,sector 2,0.60,1.0,1.0,1,Moderately Old,665.0,0.0,0.0,1.0,Medium,Mid Floor
3552,house,sector 43,15.50,5.0,6.0,3,Moderately Old,5490.0,1.0,1.0,0.0,Medium,Mid Floor
