In [1]:
import pandas as pd

# Load the CSV file
file_path = "missing data.csv"
df = pd.read_csv(file_path)

### a. Find out how many data is missing in each attribute

In [2]:
missing_data = df.isnull().sum()
print(missing_data)

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64


### b. Fill missing values in 'Car' attribute with the mode

In [3]:
car_mode = df['Car'].mode()[0]
df['Car'].fillna(car_mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Car'].fillna(car_mode, inplace=True)


### c. Fill missing values in 'BuildingArea' with linear and quadratic interpolation

In [4]:
df['BuildingArea_linear'] = df['BuildingArea'].interpolate(method='linear')
df['BuildingArea_quadratic'] = df['BuildingArea'].interpolate(method='quadratic')

### d. Fill missing values in 'YearBuilt' with forward fill

In [5]:
df['YearBuilt'].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['YearBuilt'].fillna(method='ffill', inplace=True)
  df['YearBuilt'].fillna(method='ffill', inplace=True)


### e. Remove all rows which don’t have 'CouncilArea' data

In [6]:
df_cleaned = df.dropna(subset=['CouncilArea'])
print(df_cleaned.head())

       Suburb           Address  Rooms Type    Price Method SellerG  \
0  Abbotsford      85 Turner St      2    h  1480000      S  Biggin   
1  Abbotsford   25 Bloomburg St      2    h  1035000      S  Biggin   
2  Abbotsford      5 Charles St      3    h  1465000     SP  Biggin   
3  Abbotsford  40 Federation La      3    h   850000     PI  Biggin   
4  Abbotsford       55a Park St      4    h  1600000     VB  Nelson   

         Date  Distance  Postcode  ...  Landsize  BuildingArea  YearBuilt  \
0  03-12-2016       2.5      3067  ...       202           NaN        NaN   
1  04-02-2016       2.5      3067  ...       156          79.0     1900.0   
2  04-03-2017       2.5      3067  ...       134         150.0     1900.0   
3  04-03-2017       2.5      3067  ...        94           NaN     1900.0   
4  04-06-2016       2.5      3067  ...       120         142.0     2014.0   

   CouncilArea  Lattitude  Longtitude             Regionname  Propertycount  \
0        Yarra   -37.7996    14