In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [19]:
df=pd.read_csv('properties.csv')
df.sample(5)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing
552,"1 BHK Flat for Sale in Jheel Tower, Jheel Towe...",Kolkata,"Jheel Road, Kolkata",1.0,26.5 Lac,,350.0,7571.0,Furnished
1394,"2 BHK Flat for Rent in Rishi Pranaya, Rishi Pr...",Kolkata,"Rajarhat, Kolkata",2.0,,28000.0,954.0,29.0,Semi-Furnished
112,"2 BHK Flat for Sale in Baranagar Bazar, Kolkata",Kolkata,"Baranagar Bazar, Kolkata",2.0,35.5 Lac,,563.0,6306.0,Furnished
228,"3 BHK Flat for Sale in Jadavpur, Kolkata",Kolkata,"Jadavpur, Kolkata",3.0,55 Lac,,1083.0,5078.0,Semi-Furnished
1300,3 BHK Flat for Rent in Associated Erectors Gre...,Kolkata,"Rajarhat, Kolkata",3.0,,20000.0,800.0,25.0,Semi-Furnished


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            1764 non-null   object 
 1   City            1764 non-null   object 
 2   Address         1764 non-null   object 
 3   Bedrooms        1764 non-null   Int64  
 4   Price           987 non-null    Int64  
 5   Rent            777 non-null    Int64  
 6   Area            1764 non-null   float64
 7   Per_Sqft_Price  1764 non-null   float64
 8   Furnishing      1764 non-null   object 
 9   Type            1764 non-null   object 
dtypes: Int64(3), float64(2), object(5)
memory usage: 143.1+ KB


In [21]:
def convert_price(text):
    if pd.isna(text):
        return np.nan

    text = str(text).strip()

    if "Lac" in text:
        num = float(text.replace("Lac", "").strip())
        return num * 100000

    if "Cr" in text:
        num = float(text.replace("Cr", "").strip())
        return num * 10000000
    
    try:
        return float(text.replace(",", ""))
    except:
        return np.nan


df["Price"] = df["Price"].apply(convert_price)
df["Rent"] = df["Rent"].apply(convert_price)

df["Price"] = df["Price"].round().astype("Int64")
df["Rent"]  = df["Rent"].round().astype("Int64")


df[["Price", "Rent"]].sample(2)

Unnamed: 0,Price,Rent
26,8540000.0,
1452,,13000.0


In [22]:
df["Bedrooms"]=df["Bedrooms"].astype("Int64")

In [30]:
df["Type"] = df["Price"].notna().map({True: "buy", False: "rent"})

In [34]:
# Remove rows where Area is null or Bedrooms is null
df = df.dropna(subset=['Area', 'Bedrooms'])

# Fill null Per_Sqft_Price values
# For rows with null Per_Sqft_Price, calculate it using (Price or Rent) / Area
def calculate_per_sqft(row):
    if pd.notna(row['Per_Sqft_Price']):
        return row['Per_Sqft_Price']
    
    # Use Price if available, otherwise use Rent
    amount = row['Price'] if pd.notna(row['Price']) else row['Rent']
    
    # Calculate per sqft if amount and area are available
    if pd.notna(amount) and row['Area'] > 0:
        return round(amount / row['Area'], 2)
    
    return None

df['Per_Sqft_Price'] = df.apply(calculate_per_sqft, axis=1)

df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Per_Sqft_Price'] = df.apply(calculate_per_sqft, axis=1)


In [130]:
df.sample(3)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing,Type
649,"1 BHK Flat for Sale in Kabardanga, Kolkata",Kolkata,Kabardanga,1,2100000.0,,485.0,4330.0,Unfurnished,buy
930,"2 BHK Flat for Sale in Garia, Kolkata",Kolkata,Garia,2,3380000.0,,750.0,4500.0,Unfurnished,buy
1188,"2 BHK Flat for Rent in Chinar Park, Rajarhat, ...",Kolkata,Chinar Park,2,,19000.0,500.0,38.0,Furnished,rent


In [38]:
df[["Address"]].to_clipboard(index=False)


In [40]:
import re

df = df.dropna(subset=['Address'])

df['Address'] = df['Address'].str.replace(r',\s*Kolkata', '', regex=True)

# Step 3: Strip any extra whitespace
df['Address'] = df['Address'].str.strip()

df.reset_index(drop=True, inplace=True)

print(f"Final dataframe shape: {df.shape}")

Final dataframe shape: (1764, 10)


In [131]:
df.to_csv('properties_final.csv',index=False)

In [1]:
import pandas as pd
# Define the column types for pd.read_csv
dtype_spec = {
    'Bedrooms': 'Int64',  # Nullable Integer
    'Price': 'Int64',     # Nullable Integer
    'Rent': 'Int64',      # Nullable Integer
}

# Define the missing value markers used in your CSV
missing_values = ['<NA>', 'NaN']

df = pd.read_csv(
    'properties_final.csv',
    dtype=dtype_spec,
    na_values=missing_values,   
)

In [2]:
df.sample(3)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing,Type
1049,3 BHK Flat for Rent in Kolkata,Kolkata,near laketown,3,,22000.0,900.0,24.0,Semi-Furnished,rent
627,"2 BHK Flat for Sale in ATK Wood Winds, ATK Woo...",Kolkata,New Town,2,6200000.0,,781.0,7939.0,Semi-Furnished,buy
622,"3 BHK Flat for Sale in Akshay The Crown, Aksha...",Kolkata,Beleghata,3,14500000.0,,1000.0,14500.0,Unfurnished,buy


In [22]:
df_buy=df[df['Type']=='buy']
df_rent=df[df['Type']=='rent']


In [23]:
# Complete Code for Rent Estimation with Area consideration
import pandas as pd
import numpy as np

# Clean addresses
def clean_address(address):
    if pd.isna(address):
        return ''
    addr = str(address).lower().strip()
    addr = addr.replace(',', ' ').replace('&amp;', '').replace('#', '').replace('39;', '')
    addr = ' '.join(addr.split())
    return addr

df_buy['Address'] = df_buy['Address'].apply(clean_address)
df_rent['Address'] = df_rent['Address'].apply(clean_address)

# Filter valid rent data
df_rent_valid = df_rent[
    (df_rent['Rent'].notna()) & 
    (df_rent['Rent'] > 0) & 
    (df_rent['Bedrooms'].notna()) &
    (df_rent['Area'].notna()) &
    (df_rent['Area'] > 0) &
    (df_rent['Address'] != '')
].copy()

# Calculate rent per sqft for each rent property
df_rent_valid['Rent_Per_Sqft'] = df_rent_valid['Rent'] / df_rent_valid['Area']

# Build lookup dictionaries
# 1. Exact match: Address + Bedrooms + Area (±15%)
exact_match_dict = {}
for _, row in df_rent_valid.iterrows():
    key = (row['Address'], row['Bedrooms'], row['Area'])
    if key not in exact_match_dict:
        exact_match_dict[key] = []
    exact_match_dict[key].append(row['Rent'])

# 2. Address + Bedrooms (any area) - get rent per sqft
address_bedroom_dict = df_rent_valid.groupby(['Address', 'Bedrooms'])['Rent_Per_Sqft'].median().to_dict()

# 3. Address only - get rent per sqft
address_dict = df_rent_valid.groupby('Address')['Rent_Per_Sqft'].median().to_dict()

# 4. Bedrooms + Area range - get rent per sqft
bedroom_dict = df_rent_valid.groupby('Bedrooms')['Rent_Per_Sqft'].median().to_dict()

# 5. Overall rent per sqft
overall_rent_per_sqft = df_rent_valid['Rent_Per_Sqft'].median()

# Estimate rent function
def estimate_rent(row):
    address = row['Address']
    bedrooms = row['Bedrooms']
    area = row['Area']
    
    if pd.isna(bedrooms) or pd.isna(area) or area <= 0 or address == '':
        return None, 'no_data'
    
    # Priority 1: Exact match with similar area (±15%)
    for (addr, bed, rent_area), rents in exact_match_dict.items():
        if addr == address and bed == bedrooms:
            if 0.85 * area <= rent_area <= 1.15 * area:
                return np.median(rents), 'exact_match'
    
    # Priority 2: Same Address + Same Bedrooms (proportional to area)
    if (address, bedrooms) in address_bedroom_dict:
        rent_per_sqft = address_bedroom_dict[(address, bedrooms)]
        estimated_rent = rent_per_sqft * area
        return estimated_rent, 'address_bedroom'
    
    # Priority 3: Same Address only (proportional to area)
    if address in address_dict:
        rent_per_sqft = address_dict[address]
        estimated_rent = rent_per_sqft * area
        return estimated_rent, 'address_only'
    
    # Priority 4: Same Bedrooms (proportional to area)
    if bedrooms in bedroom_dict:
        rent_per_sqft = bedroom_dict[bedrooms]
        estimated_rent = rent_per_sqft * area
        return estimated_rent, 'bedroom_only'
    
    # Priority 5: City average (proportional to area)
    estimated_rent = overall_rent_per_sqft * area
    return estimated_rent, 'city_avg'

# Apply estimation
df_buy[['Rent', 'Match_Type']] = df_buy.apply(estimate_rent, axis=1, result_type='expand')
df_buy['Rent'] = df_buy['Rent'].round(0).astype('Int64')

# Results
print("Match Statistics:")
print(df_buy['Match_Type'].value_counts())

Match Statistics:
Match_Type
exact_match        339
bedroom_only       239
address_bedroom    213
address_only       194
city_avg             2
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_buy['Address'] = df_buy['Address'].apply(clean_address)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rent['Address'] = df_rent['Address'].apply(clean_address)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_buy[['Rent', 'Match_Type']] = df_buy.apply(estimate_rent, axis=1, result_type='exp

In [24]:
# Complete Code for Price Estimation for Rent properties
import pandas as pd
import numpy as np

# Clean addresses
def clean_address(address):
    if pd.isna(address):
        return ''
    addr = str(address).lower().strip()
    addr = addr.replace(',', ' ').replace('&amp;', '').replace('#', '').replace('39;', '')
    addr = ' '.join(addr.split())
    return addr

df_buy['Address'] = df_buy['Address'].apply(clean_address)
df_rent['Address'] = df_rent['Address'].apply(clean_address)

# Filter valid buy data
df_buy_valid = df_buy[
    (df_buy['Price'].notna()) & 
    (df_buy['Price'] > 0) & 
    (df_buy['Bedrooms'].notna()) &
    (df_buy['Area'].notna()) &
    (df_buy['Area'] > 0) &
    (df_buy['Address'] != '')
].copy()

# Calculate price per sqft for each buy property
df_buy_valid['Price_Per_Sqft'] = df_buy_valid['Price'] / df_buy_valid['Area']

# Build lookup dictionaries
# 1. Exact match: Address + Bedrooms + Area (±15%)
exact_match_dict = {}
for _, row in df_buy_valid.iterrows():
    key = (row['Address'], row['Bedrooms'], row['Area'])
    if key not in exact_match_dict:
        exact_match_dict[key] = []
    exact_match_dict[key].append(row['Price'])

# 2. Address + Bedrooms (any area) - get price per sqft
address_bedroom_dict = df_buy_valid.groupby(['Address', 'Bedrooms'])['Price_Per_Sqft'].median().to_dict()

# 3. Address only - get price per sqft
address_dict = df_buy_valid.groupby('Address')['Price_Per_Sqft'].median().to_dict()

# 4. Bedrooms only - get price per sqft
bedroom_dict = df_buy_valid.groupby('Bedrooms')['Price_Per_Sqft'].median().to_dict()

# 5. Overall price per sqft
overall_price_per_sqft = df_buy_valid['Price_Per_Sqft'].median()

# Estimate price function
def estimate_price(row):
    address = row['Address']
    bedrooms = row['Bedrooms']
    area = row['Area']
    
    if pd.isna(bedrooms) or pd.isna(area) or area <= 0 or address == '':
        return None, 'no_data'
    
    # Priority 1: Exact match with similar area (±15%)
    for (addr, bed, buy_area), prices in exact_match_dict.items():
        if addr == address and bed == bedrooms:
            if 0.85 * area <= buy_area <= 1.15 * area:
                return np.median(prices), 'exact_match'
    
    # Priority 2: Same Address + Same Bedrooms (proportional to area)
    if (address, bedrooms) in address_bedroom_dict:
        price_per_sqft = address_bedroom_dict[(address, bedrooms)]
        estimated_price = price_per_sqft * area
        return estimated_price, 'address_bedroom'
    
    # Priority 3: Same Address only (proportional to area)
    if address in address_dict:
        price_per_sqft = address_dict[address]
        estimated_price = price_per_sqft * area
        return estimated_price, 'address_only'
    
    # Priority 4: Same Bedrooms (proportional to area)
    if bedrooms in bedroom_dict:
        price_per_sqft = bedroom_dict[bedrooms]
        estimated_price = price_per_sqft * area
        return estimated_price, 'bedroom_only'
    
    # Priority 5: City average (proportional to area)
    estimated_price = overall_price_per_sqft * area
    return estimated_price, 'city_avg'

# Apply estimation
df_rent[['Price', 'Match_Type']] = df_rent.apply(estimate_price, axis=1, result_type='expand')
df_rent['Price'] = df_rent['Price'].round(0).astype('Int64')

# Results
print("Match Statistics:")
print(df_rent['Match_Type'].value_counts())
print("\nSample Results:")

Match Statistics:
Match_Type
exact_match        350
bedroom_only       175
address_bedroom    139
address_only       113
Name: count, dtype: int64

Sample Results:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_buy['Address'] = df_buy['Address'].apply(clean_address)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rent['Address'] = df_rent['Address'].apply(clean_address)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rent[['Price', 'Match_Type']] = df_rent.apply(estimate_price, axis=1, result_type=

In [34]:
df_buy['Type'] = 'buy'
df_rent['Type'] = 'rent'
df_combined = pd.concat([df_buy, df_rent], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_buy['Type'] = 'buy'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rent['Type'] = 'rent'


In [None]:
df_combined = df_combined.drop(columns=['City'])

In [37]:
# Recalculate per sqft columns correctly
df_combined['Price_Per_Sqft'] = (df_combined['Price'] / df_combined['Area']).round(2)
df_combined['Rent_Per_Sqft'] = (df_combined['Rent'] / df_combined['Area']).round(2)

# Drop old ambiguous column
df_combined = df_combined.drop(columns=['Per_Sqft_Price'])


In [42]:
df_combined.to_csv('kolkata.csv',index=False)