In [None]:
import numpy as np
import pandas as pd
import sklearn


✅ Recommended Order in a Full Pipeline (for Skewed Data with Outliers):
🧼 Handle Missing Values

📏 Remove Outliers

🔀 Transform Skewed Features

📊 Scale the Data

🤖 Model Training

In [None]:
# Load the housing-market table; the path is relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='global_housing_market_extended.csv')

Table of Contents:
1. Dealing with Missing Data
2. Dealing with Duplicates
3. Outlier Detection
4. Encode Categorical Features
5. Transformation

#### 1. Dealing with Missing Data
Check missing data in each column of the dataset


In [None]:
# Per-column count of missing entries (isna is the canonical spelling of isnull).
df.isna().sum()

Drop rows in which every value is missing

In [None]:
# Returns a copy without the rows whose values are all missing; df itself is unchanged.
df.dropna(axis='index', how='all')

Drop columns that have missing values

In [None]:
# Drop every column that contains at least one missing value.
# `how` only accepts 'any' or 'all'; it is `axis` that selects columns over rows.
df.dropna(axis='columns')

Drop rows that have missing values in specific columns

In [None]:
# Drop rows that have a missing value in any of the listed columns.
# NOTE(review): 'Column_Name' looks like a placeholder; substitute real column names.
df.dropna(axis='index', subset=['Column_Name', 'Column_Name'])

Replace missing values with Mean/Median/Mode

In [None]:
# Each call returns a new Series and leaves df untouched; assign the variant you
# want back to df["Price"]. Only the last expression of the cell is rendered.
df["Price"].fillna(df['Price'].mean())
df["Price"].fillna(df['Price'].median())
# mode() returns a Series (ties are possible), so take its first value.
# Passing the Series itself makes fillna align on index labels and fill
# only the rows whose label happens to appear in the mode result.
df["Price"].fillna(df['Price'].mode().iloc[0])

Forward Fill - Fill missing values with values before them

In [None]:
# Propagate the last valid observation forward into each gap.
# ('ffil' was a typo for 'ffill'; DataFrame.ffill() also avoids the
# deprecated `method=` argument of fillna.)
df.ffill()

Backward Fill - Fill missing values with values after them

In [None]:
# Fill each gap with the next valid observation.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill()

Fill missing values using the interpolation method

In [None]:
# Fill gaps with a second-order polynomial fitted through the known points.
# The assignment must stay on one logical line; a bare `=` at the end of a line
# is a SyntaxError. The 'polynomial' method requires SciPy to be installed.
df['stockPrice'] = df['stockPrice'].interpolate(method='polynomial', order=2)





#### 2. Dealing with Duplicates
Check if there are duplicates


In [None]:
# Number of rows that repeat an earlier row; the first occurrence is not counted.
df.duplicated(keep='first').sum()

Remove duplicate rows from the dataframe

In [None]:
# Returns a copy with repeated rows removed, keeping the first occurrence of each.
df.drop_duplicates(keep='first')

#### 3. Outlier detection
Detect range of values for each column of the dataset


In [None]:
# Summary statistics with cut points at 0%, 10%, ..., 90% of each numeric column.
df.describe(percentiles=[0.1 * step for step in range(10)])

Draw a boxplot to show the distribution of a column

In [None]:
import seaborn as sns

# Horizontal box plot of the 'age' column; points beyond the whiskers are drawn individually.
sns.boxplot(data=df, x='age')

Draw a histogram to show the distribution of a column


In [None]:
# displot draws a histogram by default; kind='hist' just makes that explicit.
sns.displot(data=df['column1'], kind='hist')

Remove outliers (here: drop rows whose value is at or above the column's 90th percentile)

In [None]:
# Keep only rows whose 'age' is strictly below the 90th percentile of 'age'.
# This rebinds df, so re-running the cell trims the data again.
df = df.loc[df['age'].lt(df['age'].quantile(q=0.9))]

Outlier detection with machine learning models, like Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

# `if` is a reserved keyword and cannot be a variable name, so the model is
# bound to `iso_forest` instead.
iso_forest = IsolationForest(random_state=42)
# NOTE(review): X is not defined in the visible cells; it is presumably the
# numeric feature matrix. Define it before running this cell.
iso_forest.fit(X)
# predict() labels each sample +1 (inlier) or -1 (outlier).
y_pred = iso_forest.predict(X)

#### 4. Encode categorical features
Apply one-hot-encoding to a categorical column

In [None]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
# fit_transform returns a sparse matrix by default, hence toarray().
# Reusing df.index keeps the encoded rows aligned with df in the join below,
# even if df was filtered earlier and no longer has a 0..n-1 index.
encoded_data = pd.DataFrame(
    ohe.fit_transform(df[['type_building', 'color']]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=df.index,
)
new_df = df.join(encoded_data)


Apply label-encoding to a categorical column

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Map each distinct value of 'price_levels' to an integer code.
# (Straight quotes are required; typographic quotes are a SyntaxError.)
df['column1'] = le.fit_transform(df['price_levels'])


#### 5. Transformation
Standardize features by removing the mean and scaling to unit variance

In [None]:
from sklearn.preprocessing import StandardScaler

# A scaler must learn the mean and variance before it can transform;
# calling transform() on an unfitted scaler raises NotFittedError.
# NOTE(review): X is not defined in the visible cells; presumably the numeric feature matrix.
X_std = StandardScaler().fit_transform(X)


Rescale features into the range [0,1]


In [None]:
from sklearn.preprocessing import MinMaxScaler

# The scaler must learn each feature's min and max before it can transform;
# calling transform() on an unfitted scaler raises NotFittedError.
# NOTE(review): X is not defined in the visible cells; presumably the numeric feature matrix.
X_mms = MinMaxScaler().fit_transform(X)


Scale features exploiting statistics that are robust to outliers

In [None]:
from sklearn.preprocessing import RobustScaler

# The scaler must learn each feature's median and quantile range before it can
# transform; calling transform() on an unfitted scaler raises NotFittedError.
# NOTE(review): X is not defined in the visible cells; presumably the numeric feature matrix.
X_rs = RobustScaler().fit_transform(X)