In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [19]:
df=pd.read_csv('properties.csv')
df.sample(5)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing
552,"1 BHK Flat for Sale in Jheel Tower, Jheel Towe...",Kolkata,"Jheel Road, Kolkata",1.0,26.5 Lac,,350.0,7571.0,Furnished
1394,"2 BHK Flat for Rent in Rishi Pranaya, Rishi Pr...",Kolkata,"Rajarhat, Kolkata",2.0,,28000.0,954.0,29.0,Semi-Furnished
112,"2 BHK Flat for Sale in Baranagar Bazar, Kolkata",Kolkata,"Baranagar Bazar, Kolkata",2.0,35.5 Lac,,563.0,6306.0,Furnished
228,"3 BHK Flat for Sale in Jadavpur, Kolkata",Kolkata,"Jadavpur, Kolkata",3.0,55 Lac,,1083.0,5078.0,Semi-Furnished
1300,3 BHK Flat for Rent in Associated Erectors Gre...,Kolkata,"Rajarhat, Kolkata",3.0,,20000.0,800.0,25.0,Semi-Furnished


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            1764 non-null   object 
 1   City            1764 non-null   object 
 2   Address         1764 non-null   object 
 3   Bedrooms        1764 non-null   Int64  
 4   Price           987 non-null    Int64  
 5   Rent            777 non-null    Int64  
 6   Area            1764 non-null   float64
 7   Per_Sqft_Price  1764 non-null   float64
 8   Furnishing      1764 non-null   object 
 9   Type            1764 non-null   object 
dtypes: Int64(3), float64(2), object(5)
memory usage: 143.1+ KB


In [21]:
def convert_price(text):
    if pd.isna(text):
        return np.nan

    text = str(text).strip()

    if "Lac" in text:
        num = float(text.replace("Lac", "").strip())
        return num * 100000

    if "Cr" in text:
        num = float(text.replace("Cr", "").strip())
        return num * 10000000
    
    try:
        return float(text.replace(",", ""))
    except:
        return np.nan


df["Price"] = df["Price"].apply(convert_price)
df["Rent"] = df["Rent"].apply(convert_price)

df["Price"] = df["Price"].round().astype("Int64")
df["Rent"]  = df["Rent"].round().astype("Int64")


df[["Price", "Rent"]].sample(2)

Unnamed: 0,Price,Rent
26,8540000.0,
1452,,13000.0


In [22]:
df["Bedrooms"]=df["Bedrooms"].astype("Int64")

In [30]:
df["Type"] = df["Price"].notna().map({True: "buy", False: "rent"})

In [34]:
# Remove rows where Area is null or Bedrooms is null
df = df.dropna(subset=['Area', 'Bedrooms'])

# Fill null Per_Sqft_Price values
# For rows with null Per_Sqft_Price, calculate it using (Price or Rent) / Area
def calculate_per_sqft(row):
    if pd.notna(row['Per_Sqft_Price']):
        return row['Per_Sqft_Price']
    
    # Use Price if available, otherwise use Rent
    amount = row['Price'] if pd.notna(row['Price']) else row['Rent']
    
    # Calculate per sqft if amount and area are available
    if pd.notna(amount) and row['Area'] > 0:
        return round(amount / row['Area'], 2)
    
    return None

df['Per_Sqft_Price'] = df.apply(calculate_per_sqft, axis=1)

df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Per_Sqft_Price'] = df.apply(calculate_per_sqft, axis=1)


In [130]:
df.sample(3)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing,Type
649,"1 BHK Flat for Sale in Kabardanga, Kolkata",Kolkata,Kabardanga,1,2100000.0,,485.0,4330.0,Unfurnished,buy
930,"2 BHK Flat for Sale in Garia, Kolkata",Kolkata,Garia,2,3380000.0,,750.0,4500.0,Unfurnished,buy
1188,"2 BHK Flat for Rent in Chinar Park, Rajarhat, ...",Kolkata,Chinar Park,2,,19000.0,500.0,38.0,Furnished,rent


In [38]:
df[["Address"]].to_clipboard(index=False)


In [40]:
import re

df = df.dropna(subset=['Address'])

df['Address'] = df['Address'].str.replace(r',\s*Kolkata', '', regex=True)

# Step 3: Strip any extra whitespace
df['Address'] = df['Address'].str.strip()

df.reset_index(drop=True, inplace=True)

print(f"Final dataframe shape: {df.shape}")

Final dataframe shape: (1764, 10)


In [131]:
df.to_csv('properties_final.csv',index=False)

In [133]:
import pandas as pd
# Define the column types for pd.read_csv
dtype_spec = {
    'Bedrooms': 'Int64',  # Nullable Integer
    'Price': 'Int64',     # Nullable Integer
    'Rent': 'Int64',      # Nullable Integer
}

# Define the missing value markers used in your CSV
missing_values = ['<NA>', 'NaN']

df = pd.read_csv(
    'properties_final.csv',
    dtype=dtype_spec,
    na_values=missing_values,   
)

In [145]:
df.sample(3)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing,Type
371,1 BHK Flat for Sale in Independent house nakta...,Kolkata,Naktala,1,1100000.0,,340.0,3235.0,Unfurnished,buy
1148,"3 BHK Flat for Rent in East Kolkata Township, ...",Kolkata,East Kolkata Township,3,,30000.0,950.0,32.0,Semi-Furnished,rent
912,"4 BHK Flat for Sale in Srijan Natura, Srijan N...",Kolkata,New Alipore,4,23100000.0,,1552.0,14884.0,Unfurnished,buy
