In [None]:
import pandas as pd
import re

In [None]:
# Load the raw home listings into a DataFrame. The path is relative, so the CSV
# must sit in the notebook's current working directory.
df = pd.read_csv('All_Home_Details.csv')

In [None]:
df.head()

Unnamed: 0,sub_location,parent_location,bed_rooms,bath_rooms,house_size,land_size,description,price
0,"Colombo 5,",Colombo,4,3,"1,500.0 sqft",13.0 perches,13 Perches - House for Sale in Colombo 05 Col...,"Rs 100,000,000"
1,"Colombo 5,",Colombo,7,7,"4,500.0 sqft",11.8 perches,Prime Located 3 Storied House to be Sold - Han...,"Rs 190,000,000"
2,"Colombo 5,",Colombo,4,3,"2,000.0 sqft",10.0 perches,"Four bedroom, double storey house with 3 en-su...","Rs 135,000,000"
3,"Colombo 5,",Colombo,5,4,"3,000.0 sqft",11.5 perches,Very few options like this on the market off D...,"Rs 207,000,000"
4,"Colombo 5,",Colombo,4,4,"3,000.0 sqft",12.9 perches,Two-Story House For Sale In Colombo 05 Locate...,"Rs 100,000,000"


In [None]:
df.shape

(25888, 8)

In [None]:
def preprocess_description(text):
    """Return a normalised copy of a listing description.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not a word character (letters,
         digits, underscore), whitespace or a full stop;
      3. squeeze each run of whitespace (spaces, tabs, newlines) into a
         single space and trim both ends.

    Gotcha: punctuation is deleted, not replaced by a space, so words that
    were separated only by punctuation are glued together
    ("two-story" -> "twostory").

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned description.
    """
    lowered = text.lower()
    without_symbols = re.sub(r'[^\w\s\.]', '', lowered)
    return re.sub(r'\s+', ' ', without_symbols).strip()

In [None]:
# Clean every description. Missing values are replaced by an empty string first:
# astype(str) on its own would turn NaN into the literal text "nan", which would
# then be treated as a real word by the token counts and keyword checks.
df['description'] = df['description'].fillna('').astype(str).apply(preprocess_description)

In [None]:
df.head()

Unnamed: 0,sub_location,parent_location,bed_rooms,bath_rooms,house_size,land_size,description,price
0,"Colombo 5,",Colombo,4,3,"1,500.0 sqft",13.0 perches,13 perches house for sale in colombo 05 colomb...,"Rs 100,000,000"
1,"Colombo 5,",Colombo,7,7,"4,500.0 sqft",11.8 perches,prime located 3 storied house to be sold handu...,"Rs 190,000,000"
2,"Colombo 5,",Colombo,4,3,"2,000.0 sqft",10.0 perches,four bedroom double storey house with 3 ensuit...,"Rs 135,000,000"
3,"Colombo 5,",Colombo,5,4,"3,000.0 sqft",11.5 perches,very few options like this on the market off d...,"Rs 207,000,000"
4,"Colombo 5,",Colombo,4,4,"3,000.0 sqft",12.9 perches,twostory house for sale in colombo 05 located ...,"Rs 100,000,000"


In [None]:
# Keep one copy of each fully identical row (all columns compared). The
# surviving rows keep their original index labels.
df = df.drop_duplicates()
df.shape

(17480, 8)

In [None]:
# Sanity check: count rows that are exact copies of an earlier row.
# duplicated() flags every repeat after the first occurrence, so the sum is the
# number of rows a further drop_duplicates() would remove.
duplicate_count = df.duplicated().sum()
print("Number of duplicate rows:", duplicate_count)

Number of duplicate rows: 0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17480 entries, 0 to 25887
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   sub_location     17480 non-null  object
 1   parent_location  17480 non-null  object
 2   bed_rooms        17480 non-null  object
 3   bath_rooms       17480 non-null  object
 4   house_size       17480 non-null  object
 5   land_size        17480 non-null  object
 6   description      17480 non-null  object
 7   price            17480 non-null  object
dtypes: object(8)
memory usage: 1.2+ MB


**Identifying Most Used Words in the Description Column**

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter
import re

def preprocess_text(text, stop_words=None):
    """Split a description into lower-case tokens with stop words removed.

    Punctuation (anything that is not a word character or whitespace) is
    replaced by a space before splitting, so "pool,garden" yields two tokens.

    Parameters
    ----------
    text : str
        Text to tokenise.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to scikit-learn's ENGLISH_STOP_WORDS,
        which is the behaviour this function always had. Pass a custom set
        (for example one that also holds domain words) to override it.

    Returns
    -------
    list of str
        Remaining tokens in their original order; repeats are kept.
    """
    if stop_words is None:
        # Looked up at call time so the default stays the notebook-level import.
        stop_words = ENGLISH_STOP_WORDS

    # Convert text to lowercase
    text = text.lower()
    # Replace punctuation with spaces so adjacent words are not glued together
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenize on whitespace and drop the stop words
    return [word for word in text.split() if word not in stop_words]

# One token list per listing.
tokenized_descriptions = df['description'].apply(preprocess_text)

# Pool the per-listing token lists into a single flat list.
all_tokens = []
for listing_tokens in tokenized_descriptions:
    all_tokens.extend(listing_tokens)

# Frequency table over the whole corpus.
token_counts = Counter(all_tokens)

# The 6500 most frequent tokens as (token, count) pairs, most frequent first.
most_common_tokens = token_counts.most_common(6500)

most_common_tokens

[('area', 29668),
 ('house', 27834),
 ('2', 22956),
 ('road', 19303),
 ('3', 18727),
 ('water', 16749),
 ('5', 16705),
 ('bedrooms', 15412),
 ('bathroom', 14578),
 ('1', 14519),
 ('4', 14335),
 ('living', 13471),
 ('price', 13330),
 ('room', 13078),
 ('bathrooms', 12435),
 ('pantry', 12112),
 ('land', 11871),
 ('sale', 11362),
 ('perches', 11233),
 ('kitchen', 11143),
 ('floor', 11052),
 ('parking', 9677),
 ('dining', 9587),
 ('gate', 9542),
 ('property', 9520),
 ('km', 8714),
 ('garden', 8706),
 ('rooms', 8409),
 ('large', 8312),
 ('bedroom', 8035),
 ('negotiable', 7760),
 ('hot', 7643),
 ('space', 7338),
 ('20', 7175),
 ('modern', 7166),
 ('town', 6944),
 ('million', 6878),
 ('sqft', 6841),
 ('10', 6838),
 ('residential', 6820),
 ('close', 6719),
 ('attached', 6564),
 ('genuine', 6552),
 ('spacious', 6413),
 ('roller', 6274),
 ('brokers', 6209),
 ('junction', 6139),
 ('luxury', 5987),
 ('available', 5927),
 ('buyers', 5869),
 ('bed', 5811),
 ('piliyandala', 5649),
 ('ac', 5568),
 ('d

In [None]:
# Print every (token, count) pair from most_common_tokens as a "token count"
# line, in the list's order, so that rarer tokens and misspellings can be read too.
for token, frequency in most_common_tokens:
    print(f"{token} {frequency}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
aluminum 70
panglobalproperty 70
remain 70
parks 70
required 70
pipeborn 70
reading 70
respectable 70
perchesa 70
380 70
adjoining 70
valuation 70
sofa 70
baby 70
order 70
matara 70
deman 70
recently 69
anderson 69
centre 69
44 69
crafted 69
listing 69
fataftic 69
grills 69
cubicle 69
stop 69
shop 69
model 69
deal 69
415 69
y 69
kadirana 69
kirulapana 68
wifi 68
taps 68
750m 68
alumex 68
energy 68
78 68
sales 68
express 68
kalubovila 68
ensure 68
cooking 68
facilitiesyes 68
papiliyana 68
scottish 68
manual 67
avissawella 67
collage 67
staff 67
chartered 67
suite 67
teaching 67
google 67
stock 67
teek 67
engineering 67
temples 67
attidiya 67
spans 66
appoinments 66
levels 66
inviting 66
kobbakaduwa 66
goverment 66
carport 66
designs 66
varandah 66
adds 65
italian 65
underground 65
sprawling 65
red 65
fitting 65
79 65
fridge 65
rout 65
thunadahena 65
read 65
vechile 65
ect 65
durability 65
silence 65
powder 64
avenue 64
giv

**Identifying the Availability of Special Home Amenities in Each Home**

In [None]:
def check_keyword_group_presence(description, keyword_group):
    """Report whether any keyword of a group occurs in a description.

    The check is a plain, case-sensitive substring test (``keyword in
    description``), so a keyword also matches inside longer words.

    Parameters
    ----------
    description : str
        Text to search.
    keyword_group : iterable of str
        Candidate keywords; an empty group never matches.

    Returns
    -------
    bool
        True as soon as one keyword is found, otherwise False.
    """
    return any(keyword in description for keyword in keyword_group)

In [None]:
# Keyword groups: maps the name of a boolean feature column to the substrings
# whose presence in a description switches that feature on.
# Matching is done by check_keyword_group_presence with a plain `in` test, so
# every entry matches anywhere in the text, including inside longer words.
# Many entries are deliberate misspellings or truncated stems ('cupboa',
# 'jacu', ...). Several are already covered by a shorter entry of the same
# group ('pantry'/'pantries' by 'pantr', 'hotwater' by 'hot', 'garden' by 'gard').
keywords = {
    'pantry': ['pantry', 'cupboa', 'pantries', 'pantr', 'copboa'],
    'roller_gate': ['roller', 'rollar', 'roler', 'elcardo', 'rollershuttar', 'rollersutter', 'rollershutter', 'shutter'],
    'cctv_security': ['cctv', 'c c t v', 'camera', 'camara'],
    # NOTE(review): 'hot' is also a substring of 'hotel'/'hotels' (listed under
    # commercial_area below), so a description mentioning a hotel sets this flag too.
    'water_heaters': ['hot', 'hotwater', 'heater'],
    'servant_facility': ['servant', 'servent', 'serven', 'servan'],
    'attached_bath': ['atach bath', 'atachbath', 'attach bath', 'attachbath', 'attached bath', 'attachedbath', 'atached bath', 'atachedbath'],
    # NOTE(review): the trailing space in 'ac ' is significant. It presumably
    # targets the standalone word "ac", but as a substring it also matches any
    # word that merely ends in "ac" -- confirm this is acceptable.
    'air_conditioned': ['ac prov', 'air cond', 'aircond', 'ac '],
    # NOTE(review): 'sola' also matches inside e.g. "isolated".
    'solar_panel': ['solar', 'soler', 'solarpowered', 'sola', 'solarpan', 'solar panel'],
    'pool': ['pool', 'swimming', 'swim'],
    'generator': ['generator'],
    'jacuzzi': ['jacu'],

    'parking_features': ['parking', 'carpark', 'carpar', 'parkin', 'porch', 'pouch', 'poach', 'garage', 'parked', 'car porch', 'car park'],
    # NOTE(review): 'gard' also matches inside e.g. "regard"/"regarding".
    'garden': ['garden', 'gardan', 'gard'],

    # Proximity / convenience cues. NOTE(review): several short entries are
    # substrings of unrelated words ('sea' in "search", 'close' in "enclosed",
    # '1k' in "21km"); verify against the data before relying on this flag.
    'commercial_area': ['close', 'junction', 'bank', 'minutes', 'highway', 'distance', 'facing', '1k', '1km', '2k', '2km', '3k', '3km', 'school', 'walking',
                        'banks', 'supermarkets', 'hospital', 'miniths', 'cargil', 'airport', 'citysupermarket', 'college', 'minute', 'nearest', 'locationcitydistance',
                        'commercial', 'mins', 'restaurant', 'hotels', 'market', 'supermarket', 'keels', 'accessibility', 'university', 'jogging', 'hospitalsfood', 'schoolssupermarket',
                        'campus', 'horizon', 'expressway', 'walk', 'fuel', 'keells', 'atms', 'closer', 'pharmacy', 'shopping', 'transportation', 'walkin', 'parliment', 'arpico', 'accessible',
                        'hemas', 'nsbm', 'institution', 'tourist', 'gym', 'hospitel', 'distence', 'cargiils', 'pizza', 'foodcity', 'pharmacies', 'sea', 'express', 'minuets', 'hotel', 'cinec',
                        'quickly', 'distances',],

}

In [None]:
# Add one boolean column per keyword group: True where the description contains
# at least one keyword of that group.
description_text = df['description'].astype(str)
for feature_name, keyword_group in keywords.items():
    # `args` hands the current group to the checker as its second argument.
    df[feature_name] = description_text.apply(
        check_keyword_group_presence, args=(keyword_group,)
    )

In [None]:
df.head()

Unnamed: 0,sub_location,parent_location,bed_rooms,bath_rooms,house_size,land_size,description,price,pantry,roller_gate,...,servant_facility,attached_bath,air_conditioned,solar_panel,pool,generator,jacuzzi,parking_features,garden,commercial_area
0,"Colombo 5,",Colombo,4,3,"1,500.0 sqft",13.0 perches,13 perches house for sale in colombo 05 colomb...,"Rs 100,000,000",False,False,...,False,False,False,False,False,False,False,True,True,True
1,"Colombo 5,",Colombo,7,7,"4,500.0 sqft",11.8 perches,prime located 3 storied house to be sold handu...,"Rs 190,000,000",True,True,...,True,True,False,True,False,False,False,True,True,True
2,"Colombo 5,",Colombo,4,3,"2,000.0 sqft",10.0 perches,four bedroom double storey house with 3 ensuit...,"Rs 135,000,000",False,False,...,False,False,False,False,False,False,False,False,True,True
3,"Colombo 5,",Colombo,5,4,"3,000.0 sqft",11.5 perches,very few options like this on the market off d...,"Rs 207,000,000",False,False,...,False,False,False,False,False,False,False,False,False,True
4,"Colombo 5,",Colombo,4,4,"3,000.0 sqft",12.9 perches,twostory house for sale in colombo 05 located ...,"Rs 100,000,000",True,True,...,True,False,True,False,False,False,False,True,True,True


In [None]:
# Amenity flag columns that are counted per home
# ('garden' and 'commercial_area' are not in this list).
bool_columns = [
    'pantry', 'roller_gate', 'cctv_security', 'water_heaters',
    'servant_facility', 'attached_bath', 'air_conditioned', 'solar_panel',
    'pool', 'generator', 'jacuzzi', 'parking_features',
]

# Row-wise number of True flags (each True adds 1 to the sum).
# The same column is recomputed by the luxury-level cell further down.
df['true_count'] = df[bool_columns].sum(axis='columns')

**Categorizing Homes into 3 Luxury Levels Using the Count of Special Home Amenities in Each Home (all special home amenities are counted except "garden" and "commercial area").**
1. Basic (fewer than 5 amenities)
2. Semi Luxury (5 to 8 amenities)
3. Luxury (9 to 12 amenities)

In [None]:
# Flag columns that must not contribute to the amenity count.
exclude_columns = ['garden', 'commercial_area']

# Every flag column created from the `keywords` mapping, in the mapping's order.
# Deriving the list instead of retyping it keeps it in step with `keywords`
# and makes the exclusion filter below do real work. The previous hand-typed
# list already omitted the excluded columns, so the filter removed nothing.
boolean_columns = list(keywords)

# Amenity flags that actually feed the luxury level.
columns_to_count = [col for col in boolean_columns if col not in exclude_columns]

# Row-wise number of True flags across the counted columns only.
df['true_count'] = df[columns_to_count].sum(axis=1)

def categorize_luxury_level(true_count):
    """Translate an amenity count into a luxury label.

    Parameters
    ----------
    true_count : number
        How many amenity flags are True for a home.

    Returns
    -------
    str
        'basic' for counts below 5, 'semi luxury' for 5 through 8,
        'luxury' for 9 through 12, and an empty string for anything that
        falls in none of these ranges (for example counts above 12).
    """
    if true_count < 5:
        return 'basic'

    # Inclusive (label, low, high) bands for the remaining levels.
    bands = (('semi luxury', 5, 8), ('luxury', 9, 12))
    for label, low, high in bands:
        if low <= true_count <= high:
            return label

    return ''  # no band matched

# Label every home: map each amenity count to its luxury level and store the
# result in a new 'luxury_level' column.
df['luxury_level'] = df['true_count'].map(categorize_luxury_level)


In [None]:
# Share of homes per luxury level in percent: value_counts(normalize=True)
# returns fractions, which are scaled by 100 here.
luxury_level_counts = df['luxury_level'].value_counts(normalize=True).mul(100)

# Show the percentages, largest share first.
print(luxury_level_counts)

luxury_level
luxury         56.481693
semi luxury    30.532037
basic          12.986270
Name: proportion, dtype: float64


In [None]:
df.head()

Unnamed: 0,sub_location,parent_location,bed_rooms,bath_rooms,house_size,land_size,description,price,pantry,roller_gate,...,air_conditioned,solar_panel,pool,generator,jacuzzi,parking_features,garden,commercial_area,true_count,luxury_level
0,Colombo 5,Colombo,4,3,"1,500.0 sqft",13.0 perches,13 perches house for sale in colombo 05 colomb...,100000000.0,True,True,...,True,False,True,False,True,True,True,True,9,luxury
1,Colombo 5,Colombo,7,7,"4,500.0 sqft",11.8 perches,prime located 3 storied house to be sold handu...,190000000.0,True,True,...,True,True,False,False,False,True,True,True,9,luxury
2,Colombo 5,Colombo,4,3,"2,000.0 sqft",10.0 perches,four bedroom double storey house with 3 ensuit...,135000000.0,False,True,...,True,True,False,True,True,True,True,True,9,luxury
3,Colombo 5,Colombo,5,4,"3,000.0 sqft",11.5 perches,very few options like this on the market off d...,207000000.0,True,False,...,True,True,False,True,True,False,False,True,9,luxury
4,Colombo 5,Colombo,4,4,"3,000.0 sqft",12.9 perches,twostory house for sale in colombo 05 located ...,100000000.0,True,True,...,True,True,False,False,True,True,True,True,9,luxury


In [None]:
# Columns removed from the frame at this point: the parent location, the
# free-text description, the individual amenity flags and their count.
# 'garden', 'commercial_area' and 'luxury_level' are kept.
columns_to_drop = [
    'parent_location', 'description',
    'pantry', 'roller_gate', 'cctv_security', 'water_heaters', 'servant_facility',
    'attached_bath', 'air_conditioned', 'solar_panel', 'pool', 'generator',
    'jacuzzi', 'parking_features',
    'true_count',
]

# Rebind df to the reduced frame. This raises KeyError if a listed column is
# absent, e.g. when the cell is run a second time.
df = df.drop(columns_to_drop, axis='columns')

In [None]:
# With the description and amenity flags gone, rows that differed only in
# those columns are now identical; keep a single copy of each.
df = df.drop_duplicates()
df.shape

(14207, 9)

In [None]:
# Sanity check on the reduced frame: count rows that repeat an earlier row.
duplicate_count = df.duplicated().sum()
print("Number of duplicate rows:", duplicate_count)

Number of duplicate rows: 0


In [None]:
df.head()

Unnamed: 0,sub_location,bed_rooms,bath_rooms,house_size,land_size,price,garden,commercial_area,luxury_level
0,Colombo 5,4,3,"1,500.0 sqft",13.0 perches,100000000.0,True,True,luxury
1,Colombo 5,7,7,"4,500.0 sqft",11.8 perches,190000000.0,True,True,luxury
2,Colombo 5,4,3,"2,000.0 sqft",10.0 perches,135000000.0,True,True,luxury
3,Colombo 5,5,4,"3,000.0 sqft",11.5 perches,207000000.0,False,True,luxury
4,Colombo 5,4,4,"3,000.0 sqft",12.9 perches,100000000.0,True,True,luxury
