In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import pandas as pd

In [0]:
# Location of the raw housing CSV that the load cell reads
input_path = "/Volumes/ml/default/raw/housing.csv"

In [0]:
# Explicit schema for the housing data: nine nullable double fields followed by
# one nullable string field (ocean_proximity).
users_schema = StructType(
    [
        StructField(column_name, DoubleType(), True)
        for column_name in (
            "longitude",
            "latitude",
            "housing_median_age",
            "total_rooms",
            "total_bedrooms",
            "population",
            "households",
            "median_income",
            "median_house_value",
        )
    ]
    + [StructField("ocean_proximity", StringType(), True)]
)

In [0]:
# Read the CSV through Spark using the explicit schema (the first row is treated as
# a header), then convert the result to a pandas DataFrame for the steps below.
df = spark.read.csv(input_path, header=True, schema=users_schema).toPandas()

### 1. Missing value imputation

In [0]:
# Missing-value count per column, before imputation
df.isna().sum()

In [0]:
# Impute missing numeric values with each column's mean. The fill values come from
# df.mean(numeric_only=True), so only the numeric columns receive a replacement.
# The result is rebound to `df` rather than using inplace=True, which keeps the
# step chainable and avoids mutating a frame that earlier cells displayed.
df = df.fillna(df.mean(numeric_only=True))

In [0]:
# Missing-value count per column again, to check the result of the mean imputation
df.isna().sum()

In [0]:
# Disabled: mode imputation for the categorical ocean_proximity column.
# NOTE(review): if re-enabled, fillna(..., inplace=True) on a column selection may
# not write back to df under pandas copy-on-write — confirm before use.
#df['ocean_proximity'].fillna(df['ocean_proximity'].mode()[0],inplace=True)

### 2. Outlier removal


In [0]:
### 🧠 What is Outlier Removal?

Outlier removal is the process of identifying and handling values in your dataset that are **unusually high or low** compared to the rest.
These **extreme values** can negatively affect the accuracy of your analysis or machine learning models.

---

### 📌 Common Methods to Remove Outliers

---

#### 1. **Z-Score Method**

* Calculates how far each value is from the mean using **z-scores**.
* Any value **more than 3 standard deviations** away from the mean is considered an outlier.
* ✅ Best for **normally distributed** data.

---

#### 2. **IQR (Interquartile Range) Method**

* Focuses on the **middle 50%** of the data.
* Outliers are values that fall **more than 1.5 × IQR below Q1 or more than 1.5 × IQR above Q3** (where IQR = Q3 − Q1).
* ✅ Works well for **non-normal distributions**.

---

#### 3. **Tukey’s Fences Method**

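* Essentially the same rule as the IQR method: the cutoffs Q1 − 1.5 × IQR and Q3 + 1.5 × IQR are what Tukey called the "fences".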
* Builds **"fences"** using Q1, Q3, and IQR.
* Values **outside the fences** are considered outliers.
* ✅ Robust for **skewed data**.

---

#### 4. **Standard Deviation Method**

* Computes the **mean and standard deviation**.
* Outliers are values **more than 3 standard deviations** from the mean (the same criterion as a Z-score with absolute value above 3).
* ✅ Suitable for **normal distributions**.

---

#### 5. **Percentile Method**

* Removes the **top and bottom X%** of values (e.g., 1st and 99th percentiles).
* ✅ Does **not require normal distribution**.
* ✅ Great for datasets with **extreme values** on either end.

---

### 📝 Comparison Table

| Method             | Good for Normal Data | Works on Any Data | Easy to Apply |
| ------------------ | -------------------- | ----------------- | ------------- |
| Z-Score            | ✅ Yes                | ❌ No              | ✅ Yes         |
| IQR                | ❌ No                 | ✅ Yes             | ✅ Yes         |
| Tukey’s Fences     | ❌ No                 | ✅ Yes             | ✅ Yes         |
| Standard Deviation | ✅ Yes                | ❌ No              | ✅ Yes         |
| Percentile         | ✅ Yes                | ✅ Yes             | ✅ Yes         |

---

### 🚀 Pro Tip:

There is **no single best method** for all situations.
Choose the one that fits **your data’s distribution** and **business need** — or try **a combination**!


In [0]:
#### IQR method

In [0]:
# Columns screened for outliers
numerical_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

# First and third quartile of every screened column
Q1, Q3 = (df[numerical_columns].quantile(q) for q in (0.25, 0.75))

# Interquartile range and the 1.5 * IQR cutoffs on either side of it
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only the rows where no screened column falls outside its cutoffs.
# The test is phrased as "not (below or above)" so that a missing value,
# which compares False both ways, does not by itself drop a row.
df_no_outliers = df.loc[
    ~(
        df[numerical_columns].lt(lower_bound)
        | df[numerical_columns].gt(upper_bound)
    ).any(axis=1)
]

In [0]:
# (rows, columns) of the full frame, which the outlier filter leaves unmodified
df.shape

In [0]:
# (rows, columns) of the frame that remains after the IQR-based filter
df_no_outliers.shape

In [0]:
# Rows dropped by the IQR filter: row count of the full frame minus the filtered one
outlier_removed_row = df.shape[0] - df_no_outliers.shape[0]
outlier_removed_row

In [0]:
from scipy import stats

# Define the numerical columns
numerical_columns = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                     'total_bedrooms', 'population', 'households', 'median_income',
                     'median_house_value']

# Signed z-scores for the numerical columns (scipy's default ddof=0)
z_scores = stats.zscore(df[numerical_columns])

# Number of standard deviations from the mean at which a value counts as an outlier
threshold = 3

# Keep only rows whose every column lies within the threshold on BOTH sides of the
# mean. Comparing the signed score alone (z < threshold) would never reject
# unusually low values, because any z <= -threshold also satisfies it.
# np.abs is spelled out because the star-import from pyspark.sql.functions
# replaces the builtin abs with a Spark column function.
df_no_outliers = df[(np.abs(z_scores) < threshold).all(axis=1)]

In [0]:
# (rows, columns) of the frame that remains after the z-score-based filter
df_no_outliers.shape

In [0]:
# Rows dropped by the z-score filter: row count of the full frame minus the filtered one
outlier_removed_row = df.shape[0] - df_no_outliers.shape[0]
outlier_removed_row

### 3. Feature Creation

In [0]:
# Preview the first rows before the derived column is added
df.head()

In [0]:
# Derived feature: housing_median_age scaled by 365 (presumably years -> days; confirm units).
# NOTE(review): the column is added to df, not df_no_outliers, so the outlier
# filtering above does not carry into this step — confirm that is intended.
df["housing_median_age_days"] = 365 * df["housing_median_age"]

In [0]:
# Preview the first rows again; housing_median_age_days should now appear as a column
df.head()

### 4. One-hot encoding (Feature Encoding)

In [0]:
# Expand ocean_proximity into one indicator column per category; every other
# column of df is carried over unchanged into the new frame.
df_encoded = pd.get_dummies(data=df, columns=["ocean_proximity"])