---   
 <img align="left" width="50" height="50"  src="https://upload.wikimedia.org/wikipedia/en/c/c8/University_of_the_Punjab_logo.png"> 

<h1 align="center">House Price Prediction</h1>
<h1 align="center">Advanced Machine Learning</h1>

---
<h3><div align="right">Maida Nadeem. (MSDSF22M001)</div></h3>  

## Data Cleansing

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Raw data

In [21]:
df =pd.read_csv('House_Prices_v3.csv')
print(df.shape)
df.head()

(780, 5)


Unnamed: 0,Address,Bedroom,Bathroom,Area,Price
0,Johar Town,4,5,10 Marla,3.48 crore
1,Al Rehman Garden,5,6,10 Marla,3.5 crore
2,Al Rehman Garden,5,6,10 Marla,3.5 crore
3,Johar Town,4,5,10 Marla,3.41 crore
4,Al Rehman Garden,5,6,10 Marla,3.5 crore


## Removing Duplicates from Raw Data

In [22]:
df = df.drop_duplicates(keep='first')
df.shape

(324, 5)

In [23]:
len(df['Address'].unique())

38

In [24]:
len(df['Area'].unique())

21

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 324 entries, 0 to 642
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Address   324 non-null    object
 1   Bedroom   324 non-null    int64 
 2   Bathroom  324 non-null    int64 
 3   Area      324 non-null    object
 4   Price     324 non-null    object
dtypes: int64(2), object(3)
memory usage: 15.2+ KB


## Converting Area into Numerical Scale of Marlas

In [26]:
df[['Area', 'Unit']] = df['Area'].str.split(n=1, expand=True)
df.head()

Unnamed: 0,Address,Bedroom,Bathroom,Area,Price,Unit
0,Johar Town,4,5,10,3.48 crore,Marla
1,Al Rehman Garden,5,6,10,3.5 crore,Marla
3,Johar Town,4,5,10,3.41 crore,Marla
5,Johar Town,4,5,10,3.53 crore,Marla
7,Johar Town,4,5,10,3.76 crore,Marla


In [27]:
df['Unit'].value_counts()

Marla    281
Kanal     41
Sqft       2
Name: Unit, dtype: int64

In [28]:
conversion_factors = {
    'Marla': 1,  
    'Sqft': 0.0036,  
    'Kanal': 20 
}
def convert_to_marlas(row):
    return float(row['Area']) * conversion_factors[row['Unit']]
df['Marlas'] = df.apply(convert_to_marlas, axis=1)
df.drop(['Area', 'Unit'], axis=1, inplace=True)
df.rename(columns={"Marlas":"Area(Marlas)"}, inplace=True)

In [29]:
df.sample(15)

Unnamed: 0,Address,Bedroom,Bathroom,Price,Area(Marlas)
105,DHA,5,5,11.7 crore,20.0
502,Tariq Garden,4,5,3.86 crore,10.0
479,Tariq Garden,4,5,4.03 crore,10.0
1,Al Rehman Garden,5,6,3.5 crore,10.0
312,Shad Bagh,4,3,1.5 crore,5.0
109,Bahria Town,4,4,3.8 crore,10.0
339,Johar Town,3,4,2.68 crore,5.0
315,DHA,5,5,12.5 crore,20.0
379,Johar Town,3,4,2.46 crore,5.0
48,DHA,4,5,3.62 crore,10.0


## Converting Price into Numerical Scale of Crore

In [30]:
df[['Price', 'Unit']] = df['Price'].str.split(n=1, expand=True)
df.head()

Unnamed: 0,Address,Bedroom,Bathroom,Price,Area(Marlas),Unit
0,Johar Town,4,5,3.48,10.0,crore
1,Al Rehman Garden,5,6,3.5,10.0,crore
3,Johar Town,4,5,3.41,10.0,crore
5,Johar Town,4,5,3.53,10.0,crore
7,Johar Town,4,5,3.76,10.0,crore


In [31]:
df['Unit'].value_counts()

crore    314
lac        9
Name: Unit, dtype: int64

In [32]:
unit_to_conversion = {'crore': 1, 'lac': 0.01}
def convert_to_lac(row):
    return float(row['Price']) * unit_to_conversion.get(row['Unit'], 1)
df['Price(Crore)'] = df.apply(convert_to_lac, axis=1)
df = df.drop(['Price', 'Unit'], axis=1)
df.sample(20)

Unnamed: 0,Address,Bedroom,Bathroom,Area(Marlas),Price(Crore)
276,Bahria Town,4,5,10.0,3.81
567,Johar Town,4,5,10.0,3.92
341,Johar Town,4,5,10.0,3.77
478,Tariq Garden,4,5,10.0,4.05
315,DHA,5,5,20.0,12.5
261,Bahria Town,4,5,10.0,3.94
624,Faisal Town,4,5,10.0,5.8
423,Wapda Town,4,5,10.0,3.85
365,Johar Town,4,5,10.0,3.57
514,Aashiana E Quaid Housing Scheme,2,2,3.0,0.7


## Removing Addresses with Single Instance
#### As they can cause Issue in Train Test Split and Preprocessing

In [33]:
value_counts = df['Address'].value_counts()
mask = df['Address'].isin(value_counts.index[value_counts > 2])
df = df[mask]
df.shape

(295, 5)

In [34]:
df['Address'].value_counts()

DHA                 91
Johar Town          62
Bahria Town         51
Tariq Garden        34
Model Town          14
Wapda Town          10
Faisal Town         10
Al Rehman Garden     8
LDA Avenue           6
New Garden Town      6
Green City           3
Name: Address, dtype: int64

## Creating Cleaned Dataset CSV file

In [35]:
df.to_csv("House_Prices_cleaned.csv", index=False)
df = pd.read_csv("House_Prices_cleaned.csv")
df.sample(20)

Unnamed: 0,Address,Bedroom,Bathroom,Area(Marlas),Price(Crore)
135,DHA,6,6,20.0,16.0
8,LDA Avenue,5,6,10.0,3.25
151,Johar Town,4,5,10.0,3.9
92,Bahria Town,4,5,10.0,3.65
97,Bahria Town,4,5,10.0,3.58
264,DHA,4,5,20.0,7.5
81,Bahria Town,4,5,10.0,3.75
144,DHA,5,5,20.0,9.6
217,Tariq Garden,4,5,10.0,4.1
114,Bahria Town,4,5,10.0,3.93
