# Data Cleaning

### `Open Sooq` Data Cleaning

In [2]:
import numpy as np
import pandas as pd

In [18]:
# Read the OpenSooq listings (path is relative to the working directory)
# and preview the first rows.
df_opensooq = pd.read_csv(r"OpenSooq.csv")
df_opensooq.head()

Unnamed: 0,propert_title,propert_location,price,area,listing_type
0,120000 m2 3 Bedrooms Apartments for Rent in Dh...,"Dhofar, Salala",35 OMR,120000.0,For Rent
1,3 Bedrooms Chalet for Rent in Dhofar Salala,"Dhofar, Salala",,250.0,For Rent
2,10 m2 Studio Apartments for Rent in Muscat Amerat,"Muscat, Amerat",7 OMR,10.0,For Rent
3,Furnished Daily in Muscat Al Mawaleh,"Muscat, Al Mawaleh",8 OMR,,For Rent
4,للإيجار مكتب تجاري واجهة على شارع مزون - معرض...,"Muscat, Al Khoud",600 OMR,125.0,For Rent


In [19]:
# (rows, columns) of the OpenSooq frame as loaded.
df_opensooq.shape

(5460, 5)

In [20]:
# Column dtypes and non-null counts for the OpenSooq frame.
df_opensooq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5460 entries, 0 to 5459
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   propert_title     5460 non-null   object 
 1   propert_location  5460 non-null   object 
 2   price             5352 non-null   object 
 3   area              4308 non-null   float64
 4   listing_type      5460 non-null   object 
dtypes: float64(1), object(4)
memory usage: 213.4+ KB


In [21]:
# Drop the "OMR" suffix and any commas from the price text, trim whitespace,
# then parse it. errors="coerce" turns anything that still is not a number
# into NaN instead of raising.
df_opensooq["price"] = pd.to_numeric(
    df_opensooq["price"]
    .str.replace("OMR", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip(),
    errors="coerce",
)
df_opensooq[["price"]]

Unnamed: 0,price
0,35.0
1,
2,7.0
3,8.0
4,600.0
...,...
5455,275.0
5456,50.0
5457,120.0
5458,450.0


In [22]:
# Missing values per column after parsing `price`.
df_opensooq.isna().sum()

propert_title          0
propert_location       0
price                108
area                1152
listing_type           0
dtype: int64

In [23]:
# Impute missing prices with the most frequent price (the mode).
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# acts on an intermediate object, raises a FutureWarning, and no longer updates
# the frame in pandas 3.0.
df_opensooq["price"] = df_opensooq["price"].fillna(df_opensooq["price"].mode()[0])

# `price` is already float64 here (it was parsed with pd.to_numeric), so the former
# astype(str) -> split -> replace -> astype(float) round trip changed nothing and
# has been removed.
df_opensooq[["price"]]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_opensooq["price"].fillna(df_opensooq["price"].mode()[0], inplace=True)


Unnamed: 0,price
0,35.0
1,25.0
2,7.0
3,8.0
4,600.0
...,...
5455,275.0
5456,50.0
5457,120.0
5458,450.0


In [25]:
# Fill gaps in `area` with the value from the previous row; the trailing bfill
# covers any gap left at the very top of the frame.
# NOTE(review): adjacent rows are different listings, so the copied area is not
# derived from the listing itself -- confirm this imputation is intended.
df_opensooq = df_opensooq.assign(area=df_opensooq["area"].ffill().bfill())

In [26]:
# Inspect the `area` column after the fill.
df_opensooq[["area"]]

Unnamed: 0,area
0,120000.0
1,250.0
2,10.0
3,10.0
4,125.0
...,...
5455,70.0
5456,70.0
5457,70.0
5458,300.0


In [27]:
# Re-check missing values per column after imputing `price` and `area`.
df_opensooq.isna().sum()

propert_title       0
propert_location    0
price               0
area                0
listing_type        0
dtype: int64

In [28]:
# Number of rows that repeat an earlier row across all columns.
df_opensooq.duplicated().sum()

995

In [29]:
# Keep the first occurrence of every fully-duplicated row and discard the rest.
# The original index labels are kept (no reset happens here).
df_opensooq = df_opensooq.drop_duplicates(keep="first")

In [51]:
# List the column labels of the OpenSooq frame.
df_opensooq.columns

Index(['propert_title', 'propert_location', 'price', 'area', 'listing_type'], dtype='object')

In [52]:
# Correct the misspelled "propert_*" headers so the column names match the
# Hilal frame's `property_title` / `property_location`.
df_opensooq = df_opensooq.rename(
    columns={
        "propert_title": "property_title",
        "propert_location": "property_location",
    }
)

### `Hilal` Data Cleaning

In [41]:
# Read the Hilal listings (path is relative to the working directory)
# and preview the first rows.
df_hilal = pd.read_csv(r"hilal.csv")
df_hilal.head()

Unnamed: 0,property_title,property_location,price,area,listing_type
0,2-BEDROOM APARTMENT,Al Ansab,300,,For Rent
1,2-BEDROOM APARTMENT,Shatti Al Qurum,500,,For Rent
2,4+1 BEDROOM TWIN VILLA,Madinat Qaboos (MQ),1500,,For Rent
3,COMMERCIAL SHOP,Ghala,370,39.0,For Rent
4,COMMERCIAL OFFICE SPACE,Bausher,4,,For Rent


In [42]:
# (rows, columns) of the Hilal frame as loaded.
df_hilal.shape

(173, 5)

In [43]:
# Column dtypes and non-null counts for the Hilal frame.
df_hilal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   property_title     173 non-null    object
 1   property_location  167 non-null    object
 2   price              173 non-null    int64 
 3   area               67 non-null     object
 4   listing_type       173 non-null    object
dtypes: int64(1), object(4)
memory usage: 6.9+ KB


In [44]:
# Reload or convert the column back to string
df_hilal["price"] = df_hilal["price"].astype(str)

# Now clean dashes or unwanted characters
df_hilal["price"] = df_hilal["price"].str.replace("-", "", regex=False)

# Convert back to numeric
df_hilal["price"] = pd.to_numeric(df_hilal["price"], errors="coerce")
df_hilal[["price"]]

Unnamed: 0,price
0,300
1,500
2,1500
3,370
4,4
...,...
168,350
169,950
170,950
171,475


In [45]:
# Replace 'Unknown' with NaN
df_hilal["area"] = df_hilal["area"].replace("Unknown", np.nan)

# Handle entries like '94 / 72' by extracting the first number (before the slash)
df_hilal["area"] = df_hilal["area"].astype(str).str.extract(r"(\d+(?:\.\d+)?)")  # extract first number

# Step 3: Remove any remaining '-' signs
df_hilal["area"] = df_hilal["area"].str.replace("-", "", regex = False)

# Step 4: Convert to float
df_hilal["area"] = pd.to_numeric(df_hilal["area"], errors = "coerce")

# Fill NaN using forward and backward fill
df_hilal["area"] = df_hilal["area"].ffill().bfill()
df_hilal[["area"]]

Unnamed: 0,area
0,39.0
1,39.0
2,39.0
3,39.0
4,39.0
...,...
168,170.0
169,340.0
170,350.0
171,350.0


In [46]:
# Re-check dtypes and non-null counts after cleaning `price` and `area`.
df_hilal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   property_title     173 non-null    object 
 1   property_location  167 non-null    object 
 2   price              173 non-null    int64  
 3   area               173 non-null    float64
 4   listing_type       173 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 6.9+ KB


In [47]:
# Missing values per column after cleaning.
# NOTE(review): `property_location` still has nulls at this point and nothing
# later in the notebook fills them -- confirm that is intended.
df_hilal.isna().sum()

property_title       0
property_location    6
price                0
area                 0
listing_type         0
dtype: int64

In [48]:
# Number of rows that repeat an earlier row across all columns.
df_hilal.duplicated().sum()

3

In [49]:
# Keep the first occurrence of every fully-duplicated row and discard the rest.
# The original index labels are kept (no reset happens here).
df_hilal = df_hilal.drop_duplicates(keep="first")

In [53]:
# Stack the two cleaned sources row-wise; ignore_index=True discards both
# original indexes and numbers the combined rows 0..n-1.
real_estate_data = pd.concat(
    (df_opensooq, df_hilal),
    axis=0,
    ignore_index=True,
)
real_estate_data

Unnamed: 0,property_title,property_location,price,area,listing_type
0,120000 m2 3 Bedrooms Apartments for Rent in Dh...,"Dhofar, Salala",35.0,120000.0,For Rent
1,3 Bedrooms Chalet for Rent in Dhofar Salala,"Dhofar, Salala",25.0,250.0,For Rent
2,10 m2 Studio Apartments for Rent in Muscat Amerat,"Muscat, Amerat",7.0,10.0,For Rent
3,Furnished Daily in Muscat Al Mawaleh,"Muscat, Al Mawaleh",8.0,10.0,For Rent
4,للإيجار مكتب تجاري واجهة على شارع مزون - معرض...,"Muscat, Al Khoud",600.0,125.0,For Rent
...,...,...,...,...,...
4630,2 BEDROOM APARTMENT IN (SEEB),Seeb,350.0,170.0,For Rent
4631,4 BEDROOM RENOVATED DETACHED VILLA,Azaiba,950.0,340.0,For Rent
4632,6 BEDROOM DETACHED VILLA IN (SHATTI AL QURUM),Shatti Al Qurum,950.0,350.0,For Rent
4633,2 BEDROOM APARTMENT IN (BOSHER),Bausher,475.0,350.0,For Rent


In [54]:
# Write the combined dataset to the working directory; index=False leaves the
# row index out of the file.
real_estate_data.to_csv("RealEstate_Dataset.csv", index = False)

In [55]:
# (rows, columns) of the combined dataset.
real_estate_data.shape

(4635, 5)