In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
# Load the property listings from properties.csv and preview five random rows.
df=pd.read_csv('properties.csv')
df.sample(5)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing
1529,2 BHK Builder Floor for Sale in Tollygunge Pha...,Kolkata,"Tollygunge Phari, Kolkata",2.0,50 Lac,,850.0,5882.0,Unfurnished
5095,"2 BHK Flat for Rent in Barasat, Kolkata",Kolkata,"Barasat, Kolkata",2.0,,13000.0,1000.0,13.0,Unfurnished
5039,2 BHK Flat for Rent in Kolkata,Kolkata,"Motilal Colony, Kolkata, Kolkata",2.0,,12000.0,750.0,16.0,Unfurnished
3471,"3 BHK Apartment for Sale in Premier Joy 98, Ba...",Kolkata,"Barrackpore Trunk Road, Kolkata",3.0,1.30 Cr,,1350.0,9630.0,Unfurnished
5165,4 BHK Villa for Rent in New Town Kolkata,Kolkata,"New Town, Kolkata",4.0,,30000.0,1872.0,16.0,Semi-Furnished


In [3]:
# Column dtypes and non-null counts, to see which columns contain gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6475 entries, 0 to 6474
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            6475 non-null   object 
 1   City            6475 non-null   object 
 2   Address         6090 non-null   object 
 3   Bedrooms        6455 non-null   float64
 4   Price           4306 non-null   object 
 5   Rent            2169 non-null   object 
 6   Area            6345 non-null   float64
 7   Per_Sqft_Price  6100 non-null   float64
 8   Furnishing      6475 non-null   object 
dtypes: float64(3), object(6)
memory usage: 455.4+ KB


In [4]:
def convert_price(text):
    """Convert a price/rent label to a plain number.

    Handles the formats used in the Price and Rent columns: values suffixed
    with "Lac" (x 100,000) or "Cr" (x 10,000,000), and plain numbers that may
    contain thousands separators.

    Parameters
    ----------
    text : str, number or missing value
        Raw cell value, e.g. "50 Lac", "1.30 Cr", "13,000" or 12000.0.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the value is missing or cannot
        be parsed.
    """
    if pd.isna(text):
        return np.nan

    # Drop thousands separators up front so every branch parses the same way.
    text = str(text).strip().replace(",", "")

    # "Lac" is checked before "Cr", matching the original precedence.
    multiplier = 1
    for unit, factor in (("Lac", 100000), ("Cr", 10000000)):
        if unit in text:
            text = text.replace(unit, "").strip()
            multiplier = factor
            break

    # One malformed label must not abort the whole column conversion, so
    # unparseable text becomes NaN; only ValueError is caught, not everything.
    try:
        return float(text) * multiplier
    except ValueError:
        return np.nan


# Parse both money columns, then round and store them as nullable integers
# so that missing amounts stay missing.
for money_col in ("Price", "Rent"):
    parsed_amounts = df[money_col].apply(convert_price)
    df[money_col] = parsed_amounts.round().astype("Int64")


df[["Price", "Rent"]].sample(2)

Unnamed: 0,Price,Rent
4099,15100000.0,
4816,,23000.0


In [5]:
# Store bedroom counts as nullable integers ("Int64") so rows without a count
# keep a missing value instead of forcing the column to float.
df["Bedrooms"]=df["Bedrooms"].astype("Int64")

In [6]:
# A listing is labelled "buy" when it has a Price and "rent" otherwise.
df["Type"] = np.where(df["Price"].notna(), "buy", "rent")

In [7]:
# Remove rows where Area, Bedrooms, Address or Per_Sqft_Price is null.
# NOTE(review): because rows with a null Per_Sqft_Price are dropped here, the
# fill step below never sees a missing value. Remove 'Per_Sqft_Price' from this
# subset if those rows should be kept and back-filled instead — confirm intent.
df = df.dropna(subset=['Area','Bedrooms','Address','Per_Sqft_Price'])


# Fill null Per_Sqft_Price values
# For rows with null Per_Sqft_Price, calculate it using (Price or Rent) / Area
def calculate_per_sqft(row):
    """Return the per-square-foot price for one listing row.

    An existing ``Per_Sqft_Price`` is returned unchanged. Otherwise the value
    is derived as (Price, or Rent when Price is missing) / Area, rounded to
    two decimals.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Per_Sqft_Price', 'Price', 'Rent' and 'Area'.

    Returns
    -------
    float
        The existing or derived per-sqft price; ``np.nan`` when it cannot be
        derived (no amount, or Area missing / not positive).
    """
    existing = row['Per_Sqft_Price']
    if pd.notna(existing):
        return existing

    # Use Price if available, otherwise use Rent
    amount = row['Price'] if pd.notna(row['Price']) else row['Rent']
    area = row['Area']

    # A missing Area is checked explicitly before the comparison; Area <= 0
    # would give a meaningless (or infinite) ratio.
    if pd.notna(amount) and pd.notna(area) and area > 0:
        return round(amount / area, 2)

    # np.nan (not None) keeps the missing-value marker consistent with the
    # rest of the notebook.
    return np.nan

# Evaluate the per-sqft price row by row, then renumber the index from zero.
per_sqft_values = df.apply(calculate_per_sqft, axis=1)
df['Per_Sqft_Price'] = per_sqft_values

df = df.reset_index(drop=True)

In [8]:
# Spot-check three random rows after the cleaning steps above.
df.sample(3)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing,Type
506,2 BHK Apartment for Sale in Santoshpur Kolkata,Kolkata,"Santoshpur, Kolkata",2,2800000,,500.0,5600.0,Unfurnished,buy
815,4 BHK Apartment for Sale in Kendriya Vihar Pha...,Kolkata,"Birati, Kolkata",4,10000000,,1500.0,6667.0,Semi-Furnished,buy
2446,"5 BHK Apartment for Sale in AKP Marinas Edge, ...",Kolkata,"North 24 Parganas, Kolkata",5,65400000,,2824.0,23159.0,Unfurnished,buy


In [18]:
# Copy the Address column (without the index) to the system clipboard.
# NOTE(review): this cell's execution count (18) is out of sequence with its
# neighbours and nothing later in the notebook reads the clipboard — presumably
# a leftover manual step. It also needs a clipboard backend on the machine
# running the notebook; confirm whether it should stay.
df[["Address"]].to_clipboard(index=False)


In [9]:
import re

# Rows without an address are discarded.
df = df.dropna(subset=['Address'])

# Remove every ", Kolkata" occurrence from the address, then trim the
# surrounding whitespace.
df['Address'] = (
    df['Address']
    .str.replace(r',\s*Kolkata', '', regex=True)
    .str.strip()
)

df = df.reset_index(drop=True)

print(f"Final dataframe shape: {df.shape}")

Final dataframe shape: (6062, 10)


In [10]:
# Write the cleaned listings to disk (index omitted); later cells read this file back.
df.to_csv('properties_final.csv',index=False)

In [59]:
import pandas as pd
# Count/amount columns are read as nullable integers ("Int64").
dtype_spec = dict.fromkeys(('Bedrooms', 'Price', 'Rent'), 'Int64')

# Strings passed to read_csv as additional missing-value markers.
missing_values = ['<NA>', 'NaN']

df = pd.read_csv('properties_final.csv', dtype=dtype_spec, na_values=missing_values)

In [60]:
# Spot-check three random rows of the reloaded file.
df.sample(3)

Unnamed: 0,Name,City,Address,Bedrooms,Price,Rent,Area,Per_Sqft_Price,Furnishing,Type
2264,> 10 BHK House for Sale in Sarat Bose Road Kol...,Kolkata,Sarat Bose Road,10,26000000.0,,4000.0,6500.0,Furnished,buy
368,2 BHK Apartment for Sale in Batanagar Kolkata,Kolkata,Batanagar,2,2000000.0,,666.0,3003.0,Unfurnished,buy
4709,1 BHK House for Rent in Swiss Park Kolkata,Kolkata,Swiss Park,1,,8000.0,350.0,23.0,Semi-Furnished,rent


In [62]:
import pandas as pd
import numpy as np

# ======================================================
# 1. LOAD DATA
# ======================================================
# NOTE(review): this plain read rebinds `df`, replacing the frame that was
# loaded with the explicit Int64 dtype_spec earlier in the notebook; column
# dtypes here are whatever read_csv infers.
df = pd.read_csv("properties_final.csv")

# ======================================================
# 2. CLEAN ADDRESS
# ======================================================
def clean_address(address):
    """Normalise an address string so equal localities compare equal.

    The address is lower-cased; commas, '#', and the HTML-entity residue
    '&amp;' / '&#39;' / '39;' are replaced by spaces; runs of whitespace are
    then collapsed to single spaces.

    Parameters
    ----------
    address : str or missing value
        Raw address text.

    Returns
    -------
    str
        The normalised address, or '' when the input is missing.
    """
    if pd.isna(address):
        return ''
    addr = str(address).lower()
    # Whole entities are removed first: stripping '#' before '&#39;' is
    # handled would leave a stray '&' in the address.
    for token in ['&amp;', '&#39;', ',', '#', '39;']:
        addr = addr.replace(token, ' ')
    return ' '.join(addr.split())

df['Address'] = df['Address'].map(clean_address)

# ======================================================
# 3. USE ONLY VALID RENT DATA (FOR LEARNING)
# ======================================================
# A row qualifies when it has a positive Rent, a positive Area and a
# non-empty cleaned Address.
has_rent = df['Rent'].notna() & df['Rent'].gt(0)
has_area = df['Area'].gt(0)
has_address = df['Address'].ne('')

rent_df = df.loc[has_rent & has_area & has_address].copy()

rent_df['Rent_Per_Sqft'] = rent_df['Rent'].div(rent_df['Area'])

# ======================================================
# 4. REMOVE OUTLIERS USING IQR (ADDRESS-WISE)
# ======================================================
def iqr_filter(group):
    """Keep the rows of `group` whose Rent_Per_Sqft lies inside the IQR fences.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR (both inclusive), computed
    from the group's own 'Rent_Per_Sqft' column.
    """
    rps = group['Rent_Per_Sqft']
    lower_q, upper_q = rps.quantile(0.25), rps.quantile(0.75)
    spread = upper_q - lower_q
    inside_fences = rps.between(lower_q - 1.5 * spread, upper_q + 1.5 * spread)
    return group[inside_fences]

# Run the IQR filter one address at a time. The groups are iterated
# explicitly instead of going through groupby(...).apply(...): pandas has
# deprecated apply() operating on the grouping column, and once that column is
# excluded from each group the filtered frame would lose 'Address' and the
# groupby('Address') in step 5 would fail.
filtered_groups = [iqr_filter(group) for _, group in rent_df.groupby('Address')]
# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when there is no rent data at all.
rent_df = pd.concat(filtered_groups) if filtered_groups else rent_df.iloc[0:0]

# ======================================================
# 5. BUILD ADDRESS-LEVEL RENT/SQFT MODEL
# ======================================================
# Median rent per square foot for each address, indexed by address.
address_rps = rent_df.groupby('Address')['Rent_Per_Sqft'].median()

# ======================================================
# 6. DELETE ROWS (BUY AND RENT) WITH UNKNOWN ADDRESS
# ======================================================
# Only addresses present in address_rps can be given a rent estimate in step 7.
df = df[df['Address'].isin(address_rps.index)].copy()

# ======================================================
# 7. ESTIMATE RENT (AREA ONLY, OVERWRITE Rent)
# ======================================================
def estimate_rent(row):
    """Estimate rent for one row as (address-level rent per sqft) x Area.

    The rate is looked up in the module-level `address_rps` by the row's
    'Address'. Returns the rounded estimate, or np.nan when Area is missing
    or not positive.
    """
    area = row['Area']
    if pd.notna(area) and area > 0:
        return round(address_rps[row['Address']] * area)
    return np.nan

# Overwrite Rent for every remaining row with the estimate, stored as a
# nullable integer.
estimated_rent = df.apply(estimate_rent, axis=1)
df['Rent'] = estimated_rent.astype('Int64')

print("Final rows after strict filtering:", len(df))


Final rows after strict filtering: 5472


  .apply(iqr_filter)


In [69]:
# Split the listings by the Type label.
df_buy = df.loc[df['Type'].eq('buy')]
df_rent = df.loc[df['Type'].eq('rent')]


In [70]:
# Columns not wanted in the exported sale listings. 'Match_Type' is not
# created anywhere in the visible notebook; errors='ignore' lets drop() skip
# labels that are absent.
unused_cols = ['City', 'Match_Type', 'Per_Sqft_Price', 'Type']
df_buy = df_buy.drop(columns=unused_cols, errors='ignore')

In [73]:
# Export the sale listings (index omitted).
# NOTE(review): if the rent-estimation cell ran before this one, the 'Rent'
# column in this file holds estimated rent, not listed rent — confirm that
# downstream consumers expect that.
df_buy.to_csv('kolkata.csv',index=False)