In [26]:
import numpy as np
import pandas as pd

In [27]:
# Load the raw listings file into the working frame used by every later cell.
raw_listings_path = "surat_uncleaned.csv"
df = pd.read_csv(raw_listings_path)

In [28]:
# Bedroom count: the digits that directly precede "BHK" in the listing title.
# Titles without an "<n> BHK" token produce NaN.
bhk_digits = df["property_name"].str.extract(r'(\d+)\s*BHK', expand=False)
df["BHK"] = bhk_digits.astype(float)

In [29]:
# Locality: the run of word characters / whitespace that follows "in" and
# reaches the end of the listing title, with surrounding whitespace trimmed.
# Titles whose tail contains any other character (e.g. a comma) do not match
# the pattern and end up as NaN.
location_pattern = r'in\s+([\w\s]+)$'
location_text = df["property_name"].str.extract(location_pattern, expand=False)
df["Location"] = location_text.str.strip()

In [30]:
# Integer-encode the area basis: 1 = Carpet Area, 2 = Super Area,
# 0 = any other or missing value.
area_type_codes = {'Carpet Area': 1, 'Super Area': 2}
area_type_encoded = df["areaWithType"].map(area_type_codes)
df["area_type"] = area_type_encoded.fillna(0).astype(int)

In [31]:
# Numeric area: the first number found in the raw "square_feet" text.
# Thousands separators are removed first so that "1,250 sqft" parses as 1250
# rather than 1, and an optional decimal part is kept ("644.5 sqft" -> 644.5).
# Rows with no number at all become NaN.
# NOTE(review): the unit suffix is ignored, so every value is treated as sqft;
# confirm the column never carries other units.
df["area_sqft"] = (
    df["square_feet"]
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

In [32]:
# Integer-encode the transaction kind: 1 = New Property, 2 = Resale,
# 0 = any other or missing value.
transaction_codes = {'New Property': 1, 'Resale': 2}
transaction_encoded = df["transaction"].map(transaction_codes)
df["transaction_type"] = transaction_encoded.fillna(0).astype(int)

In [33]:
# Binary flag: 1 when the status text contains "Ready to Move", otherwise 0.
# Values are stringified first, so missing statuses simply evaluate to 0.
status_text = df["status"].astype(str)
is_ready = status_text.str.contains('Ready to Move', regex=False)
df["ready_to_move"] = is_ready.astype("int64")

In [34]:
# Ordinal furnishing level: 0 = Unfurnished, 1 = Semi-Furnished,
# 2 = Fully Furnished.
# NOTE(review): unrecognised or missing values are also filled with 0, so they
# are indistinguishable from 'Unfurnished' after this step.
furnishing_codes = {
    'Unfurnished': 0,
    'Semi-Furnished': 1,
    'Fully Furnished': 2,
}
furnishing_encoded = df["furnishing"].map(furnishing_codes)
df["furnishing_level"] = furnishing_encoded.fillna(0).astype(int)

In [35]:
# Replace the raw "facing" values with their integer category codes
# (one code per distinct value; the original text is overwritten).
facing_as_category = df["facing"].astype('category')
df["facing"] = facing_as_category.cat.codes

In [36]:
# Turn text such as "₹2,891 per sqft" into a float.
# The column is overwritten in place, so values are stringified first: this
# keeps the cell re-runnable after the column already holds floats (the plain
# `.str` accessor would raise on a numeric column).
# Anything that still is not a number after stripping the currency symbol,
# unit text and separators becomes NaN instead of aborting the whole cell.
price_per_sqft_text = (
    df['price_per_sqft']
    .astype(str)
    .str.replace('₹', '', regex=False)
    .str.replace('per sqft', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df['price_per_sqft'] = pd.to_numeric(price_per_sqft_text, errors='coerce').astype(float)

In [37]:
# Preview the first rows of the frame; at this point it still holds the raw
# columns alongside the derived ones.
df.head()

Unnamed: 0,property_name,areaWithType,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft,price,BHK,Location,area_type,area_sqft,transaction_type,ready_to_move,furnishing_level
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,173,"Luxury project with basement parking, Solar ro...",2891.0,₹33.8 Lac,2.0,Dindoli Surat,1,644.0,1,0,0
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,156,2 And 3 BHK Luxurious Flat for Sell In New Alt...,3551.0,₹45.4 Lac,2.0,Althan Surat,2,1278.0,1,0,0
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,54,This affordable 2 BHK flat is situated along a...,3800.0,₹44.6 Lac,2.0,Pal Gam Surat,2,1173.0,2,1,1
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,54,2 BHK Flat For sell IN Jahangirabad Prime Loca...,3966.0,₹47 Lac,2.0,Jahangirabad Surat,1,700.0,1,1,0
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,12,"Multistorey Apartment for Sale in Palanpur, Su...",3600.0,₹45 Lac,2.0,,2,1250.0,0,0,0


In [38]:
def convert_price(p):
    """Convert a price label such as '₹33.8 Lac' or '₹1.2 Cr' to rupees.

    Parameters
    ----------
    p : str, number or missing value
        Raw price. The rupee sign and thousands separators are ignored.
        A 'Lac' suffix multiplies the number by 1e5 and a 'Cr' suffix by 1e7;
        a value without either unit is taken as already being in rupees.

    Returns
    -------
    float
        The price in rupees, or NaN when the input is missing or the numeric
        part cannot be parsed (for example 'Call for Price').
    """
    if pd.isna(p):
        return np.nan

    # str() lets numeric inputs through instead of failing on .replace().
    text = str(p).replace('₹', '').replace(',', '').strip()

    multiplier = 1.0
    # 'Lac' is tested before 'Cr', matching the original branch order.
    for unit, factor in (('Lac', 1e5), ('Cr', 1e7)):
        if unit in text:
            text = text.replace(unit, '').strip()
            multiplier = factor
            break

    try:
        return float(text) * multiplier
    except ValueError:
        # Unparseable text is treated the same way on every branch.
        return np.nan

In [39]:
# Absolute price in rupees, parsed element by element from the raw price label.
price_in_rupees = df['price'].map(convert_price)
df['price_rs'] = price_in_rupees

In [41]:
# Remove the raw source columns that are not carried into the cleaned output.
raw_columns = [
    'property_name',
    'areaWithType',
    'square_feet',
    'transaction',
    'status',
    'floor',
    'furnishing',
    'description',
    'price',
]
df = df.drop(columns=raw_columns)

In [42]:
# Keep only rows in which every remaining column has a value.
# The index is not reset, so surviving rows keep their original labels.
df = df.dropna(axis=0, how='any')

In [43]:
# Display the frame as it stands after the raw columns and incomplete rows
# have been removed.
df 

Unnamed: 0,facing,price_per_sqft,BHK,Location,area_type,area_sqft,transaction_type,ready_to_move,furnishing_level,price_rs
0,173,2891.0,2.0,Dindoli Surat,1,644.0,1,0,0,3380000.0
1,156,3551.0,2.0,Althan Surat,2,1278.0,1,0,0,4540000.0
2,54,3800.0,2.0,Pal Gam Surat,2,1173.0,2,1,1,4460000.0
3,54,3966.0,2.0,Jahangirabad Surat,1,700.0,1,1,0,4700000.0
9,54,3200.0,2.0,Palanpur Surat,1,720.0,1,0,0,4000000.0
...,...,...,...,...,...,...,...,...,...,...
4372,54,6000.0,4.0,Vesu Surat,2,3600.0,1,0,0,21600000.0
4379,54,6000.0,4.0,Vesu Surat,2,3700.0,1,0,0,22200000.0
4386,54,6000.0,4.0,Vesu Surat,2,4600.0,1,0,0,27600000.0
4392,54,6000.0,4.0,Vesu Surat,2,3550.0,1,0,0,21300000.0


In [44]:
# Write the cleaned frame to disk; the index is not written to the file.
cleaned_csv_path = "cleaned_data.csv"
df.to_csv(cleaned_csv_path, index=False)