In [1]:
# Import necessary libraries
import pandas as pd # pandas (pd) is used for data manipulation
import numpy as np # numpy (np) is used for numerical operations
import matplotlib.pyplot as plt # matplotlib.pyplot (plt) is used for plotting


In [2]:
# Location of melb_data.csv, named once so it is easy to find and change.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
DATA_PATH = '/home/jagdish/Desktop/31305/Dsbdl/A1/melb_data.csv'

# Read the CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


# Data Preprocessing

In [10]:
# List the dtype pandas holds for every column of df.
# (For a single column, use e.g. df['Price'].dtype.)
# NOTE(review): this cell's execution count (In [10]) is later than the Method
# and Car conversion cells (In [8], In [9]), so the recorded output shows those
# two columns as int64 rather than the dtypes they had when loaded. Restart the
# kernel and run all cells to bring code order and outputs back in sync.
df.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method             int64
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car                int64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [11]:
# Count the missing (NaN/None) entries in each column.
# isna() is the same check as isnull(); notna() would give the complementary mask.
# NOTE(review): this cell ran as In [11], after the cell that fills missing 'Car'
# values, so the recorded output shows Car = 0 even though describe() reports only
# 13518 non-null Car values out of 13580 rows in the data as loaded.
df.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# The count row is the number of non-null values, so it also reveals missing data.
df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [6]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(13580, 21)

In [7]:
# Label printed for each dtype this cell recognises; every other dtype is
# reported as unknown. Only the exact names below are matched.
dtype_labels = {
    "object": "categorical column",
    "int64": "integer column",
    "float64": "float column",
}

# Report, one line per column, which of the broad kinds its dtype falls into.
for col in df.columns:
    dtype_name = str(df[col].dtype)
    label = dtype_labels.get(dtype_name, "unknown column type")
    print(f"{col}: {label}")

Suburb: categorical column
Address: categorical column
Rooms: integer column
Type: categorical column
Price: float column
Method: categorical column
SellerG: categorical column
Date: categorical column
Distance: float column
Postcode: float column
Bedroom2: float column
Bathroom: float column
Car: float column
Landsize: float column
BuildingArea: float column
YearBuilt: float column
CouncilArea: categorical column
Lattitude: float column
Longtitude: float column
Regionname: categorical column
Propertycount: float column


# Turn categorical variables into quantitative variables in Python

In [8]:
# Integer code assigned to each label found in the 'Method' column.
method_mapping = {
    'PI': 0,
    'S': 1,
    'SA': 2,
    'SP': 3,
    'VB': 4
}

# Only encode while the column still holds the raw text labels. Without this
# guard, re-running the cell would stringify the integer codes ('1', '3', ...),
# none of which are keys of method_mapping, and the whole column would silently
# collapse to -1.
if not pd.api.types.is_numeric_dtype(df['Method']):
    # Step 1: Clean up and standardize strings (trim whitespace, upper-case)
    method_labels = df['Method'].astype(str).str.strip().str.upper()

    # Step 2: Apply mapping; labels missing from method_mapping become NaN
    method_codes = method_labels.map(method_mapping)

    # Step 3: Handle unknowns (NaNs after mapping) with the sentinel -1
    df['Method'] = method_codes.fillna(-1).astype(int)

# Check output: the distinct codes now present in the column
print(df['Method'].unique())




[1 3 0 4 2]


# Data Formatting and Data Normalization

In [9]:
# Replace missing 'Car' values with 0, then store the column as integers.
# (The cast to int would fail while NaNs are still present, so fill first.)
df['Car'] = df['Car'].fillna(0).astype(int)

# Preview the first five rows to confirm the change.
df.head()


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,1,Biggin,3/12/2016,2.5,3067.0,...,1.0,1,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,1,Biggin,4/02/2016,2.5,3067.0,...,1.0,0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,3,Biggin,4/03/2017,2.5,3067.0,...,2.0,0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,0,Biggin,4/03/2017,2.5,3067.0,...,2.0,1,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,4,Nelson,4/06/2016,2.5,3067.0,...,1.0,2,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0
