## Data Cleansing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [154]:
df =pd.read_csv('House_Prices_v3.csv')
print(df.shape)
df.head()

(780, 5)


Unnamed: 0,Address,Bedroom,Bathroom,Area,Price
0,Johar Town,4,5,10 Marla,3.48 crore
1,Al Rehman Garden,5,6,10 Marla,3.5 crore
2,Al Rehman Garden,5,6,10 Marla,3.5 crore
3,Johar Town,4,5,10 Marla,3.41 crore
4,Al Rehman Garden,5,6,10 Marla,3.5 crore


In [155]:
df = df.drop_duplicates(keep='first')
df.shape

(324, 5)

In [156]:
len(df['Address'].unique())

38

In [157]:
len(df['Area'].unique())

21

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 0 to 642
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Address   324 non-null    object
 1   Bedroom   324 non-null    int64 
 2   Bathroom  324 non-null    int64 
 3   Area      324 non-null    object
 4   Price     324 non-null    object
dtypes: int64(2), object(3)
memory usage: 15.2+ KB


In [159]:
df[['Area', 'Unit']] = df['Area'].str.split(n=1, expand=True)
df.head()

Unnamed: 0,Address,Bedroom,Bathroom,Area,Price,Unit
0,Johar Town,4,5,10,3.48 crore,Marla
1,Al Rehman Garden,5,6,10,3.5 crore,Marla
3,Johar Town,4,5,10,3.41 crore,Marla
5,Johar Town,4,5,10,3.53 crore,Marla
7,Johar Town,4,5,10,3.76 crore,Marla


In [160]:
df['Unit'].value_counts()

Marla    281
Kanal     41
Sqft       2
Name: Unit, dtype: int64

In [161]:
conversion_factors = {
    'Marla': 1,  
    'Sqft': 0.0036,  
    'Kanal': 20 
}
def convert_to_marlas(row):
    return float(row['Area']) * conversion_factors[row['Unit']]
df['Marlas'] = df.apply(convert_to_marlas, axis=1)
df.drop(['Area', 'Unit'], axis=1, inplace=True)
df.rename(columns={"Marlas":"Area(Marlas)"}, inplace=True)

In [162]:
df.sample(15)

Unnamed: 0,Address,Bedroom,Bathroom,Price,Area(Marlas)
370,Johar Town,3,4,2.44 crore,5.0
110,New Garden Town,4,3,6.75 crore,20.0
121,Green City,5,6,4 crore,10.0
351,Johar Town,4,5,3.7 crore,10.0
100,DHA,4,5,3.26 crore,10.0
383,Johar Town,3,4,2.52 crore,5.0
61,DHA,4,5,3.46 crore,10.0
619,Model Town,4,5,8.3 crore,10.0
118,DHA,5,6,9.15 crore,20.0
426,Wapda Town,4,5,3.84 crore,10.0


In [163]:
df[['Price', 'Unit']] = df['Price'].str.split(n=1, expand=True)
df.head()

Unnamed: 0,Address,Bedroom,Bathroom,Price,Area(Marlas),Unit
0,Johar Town,4,5,3.48,10.0,crore
1,Al Rehman Garden,5,6,3.5,10.0,crore
3,Johar Town,4,5,3.41,10.0,crore
5,Johar Town,4,5,3.53,10.0,crore
7,Johar Town,4,5,3.76,10.0,crore


In [164]:
df['Unit'].value_counts()

crore    314
lac        9
Name: Unit, dtype: int64

In [165]:
unit_to_conversion = {'crore': 1, 'lac': 0.01}
def convert_to_lac(row):
    return float(row['Price']) * unit_to_conversion.get(row['Unit'], 1)
df['Price(Crore)'] = df.apply(convert_to_lac, axis=1)
df = df.drop(['Price', 'Unit'], axis=1)
df.sample(20)

Unnamed: 0,Address,Bedroom,Bathroom,Area(Marlas),Price(Crore)
305,Johar Town,3,3,3.0,1.3
191,Bahria Town,4,5,10.0,3.6
537,DHA,5,5,20.0,14.8
594,Johar Town,4,5,10.0,3.58
534,Faisal Town,4,5,10.0,7.0
383,Johar Town,3,4,5.0,2.52
435,Tariq Garden,4,5,10.0,3.76
172,Bahria Town,4,5,10.0,3.67
102,DHA,4,5,10.0,3.22
313,Al Hafeez Gardens,3,3,3.0,1.4


In [167]:
df.to_csv("House_Prices_cleaned.csv", index=False)
df = pd.read_csv("House_Prices_cleaned.csv")
df.sample(20)

Unnamed: 0,Address,Bedroom,Bathroom,Area(Marlas),Price(Crore)
1,Al Rehman Garden,5,6,10.0,3.5
22,DHA,4,5,10.0,3.5
218,DHA,6,6,20.0,11.5
113,Al Rehman Garden,5,6,10.0,3.1
317,Model Town,4,5,10.0,8.2
288,Faisal Town,4,5,10.0,6.5
123,Bahria Town,4,5,10.0,3.94
9,Al Rehman Garden,5,6,10.0,4.3
226,Bedian Road,3,3,3.0,0.92
148,Shad Bagh,4,3,5.0,1.5


In [169]:
pip install geopy

Collecting geopy
  Downloading geopy-2.4.0-py3-none-any.whl (125 kB)
     ------------------------------------ 125.4/125.4 kB 492.3 kB/s eta 0:00:00
Collecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
     ---------------------------------------- 40.3/40.3 kB 2.0 MB/s eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.0
Note: you may need to restart the kernel to use updated packages.


