# Task 2: Data Cleaning & Preprocessing
Dataset: House Prediction

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import openpyxl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


Load Dataset

In [14]:
import pandas as pd

file_path = r"C:\Users\user\Desktop\level1_task2_data_cleaning_and_Preprocessing\house Prediction DataSet.csv"
df = pd.read_csv(file_path)

print("First 5 rows of dataset:")
print(df.head())

print("\n Dataset Info:")
print(df.info())

print("\n Missing Values:")
print(df.isnull().sum())


First 5 rows of dataset:
  0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00
0   0.02731   0.00   7.070  0  0.4690  6.4210  78...                                             
1   0.02729   0.00   7.070  0  0.4690  7.1850  61...                                             
2   0.03237   0.00   2.180  0  0.4580  6.9980  45...                                             
3   0.06905   0.00   2.180  0  0.4580  7.1470  54...                                             
4   0.02985   0.00   2.180  0  0.4580  6.4300  58...                                             

 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505 entries, 0 to 504
Data columns (total 1 columns):
 #   Column                                                                                            Non-Null Count  Dtype 
---  ------                                                                                            --------------  ----- 
 0    0.00632  

Handle Missing Data

In [16]:
# Strategy: 
# - Numeric → fill with median
# - Categorical → fill with mode (most frequent)

for col in df.columns:
    if df[col].dtype in ["int64", "float64"]:
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

print("\n Missing values handled!")
print(df.isnull().sum())



 Missing values handled!
0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Detect & Remove Outliers

In [17]:
# Let's use IQR method for numerical columns
num_cols = df.select_dtypes(include=["int64", "float64"]).columns

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

print("\n Outliers removed using IQR method")



 Outliers removed using IQR method


Encode Categorical Variables

In [18]:
# Convert categorical to numerical using One-Hot Encoding
cat_cols = df.select_dtypes(include=["object"]).columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("\n Categorical features encoded")
print(df.head())


 Categorical features encoded
   0.00632  18.00   2.310  0  0.5380  6.5750  65.20  4.0900   1  296.0  15.30 396.90   4.98  24.00_ 0.01096  55.00   2.250  0  0.3890  6.4530  31.90  7.3073   1  300.0  15.30 394.72   8.23  22.00  \
0                                              False                                                                                                                                                  
1                                              False                                                                                                                                                  
2                                              False                                                                                                                                                  
3                                              False                                                                                                                         

Normalize or Standardize Numerical Data

In [8]:
# Identify numeric columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
print("Numeric columns:", num_cols)

# Scale only if numeric columns exist
from sklearn.preprocessing import StandardScaler

if len(num_cols) > 0:
    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])
    print("\n Numerical features standardized")
else:
    print("\n No numeric columns found to scale")



Numeric columns: Index([], dtype='object')

 No numeric columns found to scale


Save Cleaned Dataset

In [22]:
# Save cleaned dataset
df.to_csv("house_prediction_cleaned.csv", index=False)
print("\n Cleaned dataset saved as 'house_prediction_cleaned.csv'")



 Cleaned dataset saved as 'house_prediction_cleaned.csv'
