# Load Dataset

In [0]:
import numpy as np #for scientific computation
import pandas as pd #for data analysis and data manipulation

In [0]:
#Define the schema for dataset

from pyspark.sql.types import DoubleType, StringType, StructType, StructField

# Columns read as nullable doubles, listed in the order the fields are declared.
numeric_column_names = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
]

# One nullable DoubleType field per numeric column, followed by the single
# nullable string column.
schema = StructType(
    [StructField(column_name, DoubleType(), True) for column_name in numeric_column_names]
    + [StructField("ocean_proximity", StringType(), True)]
)

#Read the data with the explicit schema instead of letting Spark infer types
# NOTE(review): no `header` option is passed here — confirm the CSV file has no header row.
housing_df = spark.read.csv("dbfs:/FileStore/housing_data_tweaked.csv", schema=schema)

In [0]:
# Check which DataFrame implementation the read step produced.
type(housing_df)
#Output shows it is a spark dataframe

In [0]:
# Convert the Spark DataFrame to a pandas DataFrame.
# The isinstance guard keeps this cell safe to re-run: after the first run
# housing_df is already a pandas DataFrame, which has no .toPandas() method,
# so an unguarded second run would raise AttributeError.
if not isinstance(housing_df, pd.DataFrame):
    housing_df = housing_df.toPandas()

In [0]:
# Re-check the type after the toPandas() conversion.
type(housing_df)

#Now the dataframe type changed to pandas type

In [0]:
# Preview the first rows of the pandas DataFrame.
housing_df.head()

# 1. Missing value Imputation

Missing value imputation is the process of replacing missing data in a dataset with estimated values. It's a key step in data preprocessing. 

**Why is imputation important?**

Missing values can affect the accuracy of data analysis and machine learning models. 

If incomplete datasets are not imputed well, the results of data mining and analysis could be affected. 

**How is imputation done?**

**_Mean imputation_**
Replaces missing values with the average of the observed values for that variable. 

**_Median imputation_**
Replaces missing values with the median of the observed values for that variable. 

**_Mode imputation_**
Replaces missing values with the most frequent value (mode) of the variable. 

**_K-Nearest Neighbors (KNN) imputation_**
Estimates missing values by finding the K most similar samples in the dataset and using their values to impute the missing data. 

**When should imputation be used?**

Imputation should not be used if more than 50% of data are missing.

Imputation is not appropriate for imputing future data in a time series.

Use of imputation is suspect if it generates values outside valid ranges.

In [0]:
# Count the missing values in each column before imputation.
housing_df.isna().sum()

In [0]:
#Fill the missing values for numerical columns
# Per-column mean, computed over the numeric columns only.
numeric_column_means = housing_df.mean(numeric_only=True)
# Passing a Series to fillna fills each column with the value stored under its own label.
housing_df = housing_df.fillna(value=numeric_column_means)

In [0]:
# Recount missing values per column after the numeric mean fill.
housing_df.isna().sum()

In [0]:
#Fill the missing values for Categorical columns
# Collect the object-dtype (string) columns and print them. A bare expression
# in the middle of a cell is evaluated and thrown away, so it would show nothing.
categorical_cols = housing_df.select_dtypes('object').columns
print(categorical_cols)

#Replace the missing values with most frequent value; mode() can return several values, so take the first
for categorical_col in categorical_cols:
    most_frequent_values = housing_df[categorical_col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing sensible to impute in that case, so leave the column untouched.
    if not most_frequent_values.empty:
        housing_df[categorical_col] = housing_df[categorical_col].fillna(most_frequent_values.iloc[0])

# 2. Outlier removal

**What is an Outlier?**

An Outlier is a data item/object that deviates significantly from the rest of the (so-called normal) objects. Identifying outliers is important in statistics and data analysis because they can have a significant impact on the results of statistical analyses. The analysis for outlier detection is referred to as outlier mining.

**Methods to remove outliers:**

1. **Z score or standard score** 

$$z = \frac{x - \text{mean}}{\text{standard deviation}}$$

In a normal distribution, about 68% of the data points lie within ±1 standard deviation of the mean, 95% lie within ±2 standard deviations, and 99.7% lie within ±3 standard deviations. If the **_absolute value of a data point's z score is more than 3 (that is, z > 3 or z < -3), it indicates that the data point is quite different from the other data points._** Such a data point can be an outlier.

## IQR (Interquartile Range)

In [0]:
#Step 1: Define numerical columns (listed explicitly)
numerical_cols = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'median_house_value',
]

#Step 2: Calculate Q1 (first quartile) and Q3 (third quartile) for each numerical column
# Both quartiles come from one quantile call; each row of the result is one quantile level.
quartiles = housing_df[numerical_cols].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]

#Step 3: Calculate IQR for each numerical column
IQR = Q3 - Q1

#Step 4: Define lower and upper bounds for outlier detection (1.5 * IQR beyond the quartiles)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

#Step 5: Filter out outliers from Dataframe
numeric_values = housing_df[numerical_cols]
below_lower = numeric_values < lower_bound
above_upper = numeric_values > upper_bound
# Keep a row only when every numerical column is neither below its lower bound
# nor above its upper bound.
rows_within_bounds = (~below_lower & ~above_upper).all(axis=1)
housing_df_no_outliers = housing_df[rows_within_bounds]

# Row counts before and after filtering, and their difference.
initial_row_count = len(housing_df)
final_row_count = len(housing_df_no_outliers)
rows_removed = initial_row_count - final_row_count

print("Before outlier removal")
display(housing_df.shape)
print("Initial row count:",initial_row_count)

print("\nAfter outlier removal")
display(housing_df_no_outliers.shape)
print("Final row count:",final_row_count)

print("Rows removed:",rows_removed)



## Z-Score

In [0]:
#Step 1: Import package
from scipy import stats

#Step 2: Define numerical columns
numerical_cols =['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value']

#Step 3: Calculate z-scores for each numerical column
z_scores = stats.zscore(housing_df[numerical_cols])

#Step 4: Define threshold for outlier detection
threshold = 3

#Step 5: Filter out outliers from Dataframe
# Compare the ABSOLUTE z-score with the threshold. A plain `z_scores < threshold`
# only catches unusually high values and keeps every row with z < -3, so
# extreme low values would never be removed.
within_threshold = (np.abs(z_scores) < threshold).all(axis=1)
housing_df_no_outliers = housing_df[within_threshold]

print("Before outlier removal")
display(housing_df.shape)
initial_row_count = len(housing_df)
print("Initial row count:",initial_row_count)

print("\nAfter outlier removal")
display(housing_df_no_outliers.shape)
final_row_count = len(housing_df_no_outliers)
print("Final row count:",final_row_count)

rows_removed = initial_row_count - final_row_count
print("Rows removed:",rows_removed)



# 3. Feature Creation

Feature creation is the process of creating new variables from existing data to help improve machine learning models. It's a common step in data preprocessing. 

Examples include binning, splitting, calculated features, and creating new features from past values of time series data.

Examples:

In [0]:
# Derive housing_median_age_in_days from the existing housing_median_age column
# by multiplying every value by 365.
median_age_values = housing_df["housing_median_age"]
housing_df["housing_median_age_in_days"] = median_age_values.mul(365)

housing_df.head()

In [0]:
# Remove the demonstration feature again. drop() returns a new DataFrame, so the
# result must be assigned back; otherwise the column stays in housing_df and
# flows into every later step. `columns=` already selects the axis, so
# `axis=1` is not needed. errors="ignore" keeps the cell safe to re-run.
housing_df = housing_df.drop(columns="housing_median_age_in_days", errors="ignore")

housing_df.head()

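Do not run the next cell: it is an illustrative reference only and uses placeholder columns (`feature_1`, `feature_2`, `date`, `Revenue`, `Cost`) that do not exist in the housing dataset.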

In [0]:
# Reference snippets for common feature-creation techniques.
# 'feature_1', 'feature_2', 'date', 'Revenue' and 'Cost' are placeholder column
# names; substitute real columns before running any of these lines.

#Polynomial feature
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
# Select several columns with a LIST of names (double brackets); a bare
# 'a', 'b' inside single brackets is a tuple key and raises KeyError.
polynomial_features = poly.fit_transform(housing_df[['feature_1', 'feature_2']])

#Interaction features
housing_df['interaction'] = housing_df['feature_1'] * housing_df['feature_2']

#Binning/ discretization - To convert continuous numerical data into categories (bins)
# labels=False returns the integer bin index instead of an interval label.
housing_df['binned_feature'] = pd.cut(housing_df['feature_1'], bins=3, labels=False)
# Explicit bin edges with one label per interval (5 intervals -> 5 labels).
housing_df['age_group'] = pd.cut(housing_df['feature_1'], bins=[0, 20, 40, 60, 80, 100], labels=['Child','Young', 'Adult', 'Senior', 'Elderly'])

#Encoding categorical features
encode_df = pd.get_dummies(housing_df, columns= ['ocean_proximity'], prefix='ocean_proximity')

#Textual features extraction using CountVectorizer for bag of words - For Natural Language Processing
from sklearn.feature_extraction.text import CountVectorizer
text_data = ['text_1', 'text_2', 'text_3']
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(text_data)

#time based feature (the .dt accessor requires a datetime column)
housing_df['day_of_week'] = housing_df['date'].dt.dayofweek
housing_df['month'] = housing_df['date'].dt.month

#Domain specific features
housing_df['Profit'] = housing_df['Revenue'] - housing_df['Cost']

# 4. Feature Scaling

**Feature Scaling:** Feature scaling is a method used to standardize the range of independent variables or features of data. In data processing, it is also known as data normalization and is generally performed during the data pre-processing step.

2 common methods:

**Z-Score Normalization:** Z-score normalization, also known as standardization, rescales data so that it has a mean of 0 and a standard deviation of 1 (the shape of the distribution itself is not changed). This technique is useful when the data approximately follows a normal distribution.

**Example:** Suppose we have a dataset with a feature "Height" with a mean of 175 cm and a standard deviation of 10 cm. To normalize this feature using z-score normalization, we would subtract the mean from each height and then divide by the standard deviation. This would result in a normalized feature with a mean of 0 and a standard deviation of 1.

**Min-Max Normalization** Min-max scaling, also known as rescaling, is a popular normalization technique that rescales the data to a common range, usually between 0 and 1. This is achieved by subtracting the minimum value and then dividing by the range of the data.

**Example:** Suppose we have a dataset with a feature "Age" ranging from 18 to 80. To normalize this feature using min-max scaling, we would subtract 18 (the minimum value) from each age and then divide by 62 (the range of the data). This would result in a normalized feature with values between 0 and 1.

**Why Scale?**

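Comparable Feature Magnitudes: Scaling prevents features with large numeric ranges from dominating features with small ranges. Note that standard and min-max scaling are themselves sensitive to extreme values, so handle outliers before scaling.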

Algorithm Compatibility: Some algorithms, like Support Vector Machines and Principal Component Analysis, work best with scaled data.

In [0]:
# Show the unscaled values for comparison with the output after scaling.
print("Before applying feature scaling")
housing_df.head()

In [0]:
#Step 1: Import packages
from sklearn.preprocessing import StandardScaler

#Step 2: Define numerical columns to be standardized
numerical_features = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'median_house_value',
]

#Step 3: Fit the scaler on the numerical columns, then overwrite them with the scaled values
stdard_scaler = StandardScaler()
stdard_scaler.fit(housing_df[numerical_features])
housing_df[numerical_features] = stdard_scaler.transform(housing_df[numerical_features])

print("After applying feature scaling")
#The numerical columns now share a common scale
housing_df.head()
#Numerical values are now scaled to common scale

# 5. One Hot Encoding

**One Hot Encoding** is a method for converting categorical variables into a binary format. It creates new columns for each category where 1 means the category is present and 0 means it is not. The primary purpose of One Hot Encoding is to ensure that categorical data can be effectively used in machine learning models.

It is called one-hot encoding because only one column (or feature) corresponding to a particular category has the value 1, while all others are set to 0. For example, if the categories are Male and Female, the Male row will have [1, 0] and Female will have [0, 1]. This way, only one “hot” bit (1) is activated for each entry.

**Example:**

Suppose a `Fruit` column contains the values Apple, Mango and Orange. Wherever the fruit is “Apple,” the Apple column will have a value of 1 while the other fruit columns (like Mango or Orange) will contain 0.
This pattern ensures that each categorical value gets its own column represented with binary values (1 or 0) making it usable for machine learning models.

**Advantages of Using One Hot Encoding**

It allows the use of categorical variables in models that require numerical input.

It can improve model performance by providing more information to the model about the categorical variable.

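It avoids the problem of false ordinality: encoding categories as integers (e.g. 1, 2, 3) can lead a model to treat them as ranked even when they have no natural ordering, whereas one-hot columns imply no order. (For variables that do have a natural ordering, e.g. “small”, “medium”, “large”, an ordinal encoding may be preferable.)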

In [0]:
# List the object-dtype columns, i.e. the candidates for one-hot encoding.
housing_df.select_dtypes(include='object').columns

In [0]:
# Show the distinct values present in ocean_proximity.
housing_df['ocean_proximity'].unique()

In [0]:
# One-hot encode the categorical columns. drop_first=True omits the indicator
# column for the first category of each variable, so k categories yield k-1 columns.
housing_df = pd.get_dummies(housing_df, drop_first=True)

In [0]:
# Preview the frame after encoding.
housing_df.head() #new columns are added

In [0]:
# (rows, columns) of the frame after encoding.
housing_df.shape

# 6. Feature Selection

**Feature Selection Foundation**

Feature selection is an important step in machine learning. It involves selecting a subset of relevant features from the original feature set, which reduces the feature space, lowers the computational cost, and can improve the model’s performance. It is especially critical when dealing with high-dimensional data.

**There are various algorithms used for feature selection and are grouped into three main categories:**

Filter Methods

Wrapper Methods

Embedded Methods

In [0]:
# Name of the column the model should predict.
target_column = 'median_house_value'

#Store dependent variable in pandas series y
y = housing_df[target_column]

#Store all independent variables in a dataframe X (every column except the target)
X = housing_df.drop(columns=[target_column])

In [0]:
# Preview the independent variables.
X.head()

In [0]:
# Preview the dependent variable.
y.head()

In [0]:
# X was produced by DataFrame.drop, so this shows the container type of the features.
type(X)

In [0]:
# y is a single selected column, so this shows the container type of the target.
type(y)

In [0]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

#Recursive feature elimination around a linear regression, keeping 5 features
estimator = LinearRegression()
rfe = RFE(estimator=estimator, n_features_to_select=5)

#Fit the selector on the features and the target
rfe.fit(X, y)

#support_ is a boolean mask over the columns of X: True marks a selected feature
is_selected = rfe.support_
selected_features = X.columns[is_selected]
print(selected_features)


# 7. Feature transformation (Optional)

Feature transformation involves changing the original features in your dataset to new representations that may be more suitable for model building. This process can improve the performance of machine learning models by making the data more understandable and manageable for algorithms.

The important transformation techniques covered in this section:

Scaling- Normalization, Standardization

Log Transformation

Box Cox Transformation

Encoding Categorical Variables

Binning

Reciprocal Transformation

Polynomial Features

Interaction features

In [0]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import boxcox

# Small toy frame for the transformation examples: feature2 and feature3 are
# feature1 multiplied by 10 and by 100.
feature1_values = [1, 2, 3, 4, 5]
data = pd.DataFrame({
    'feature1': feature1_values,
    'feature2': [value * 10 for value in feature1_values],
    'feature3': [value * 100 for value in feature1_values],
})

In [0]:
# Show the toy frame used by the transformation examples.
print(data)

In [0]:
#Normalization (min-max scaling of every column)
# A separate name for each scaler so `scaler` does not silently change type mid-cell.
minmax_scaler = MinMaxScaler()
normalized_data = minmax_scaler.fit_transform(data)

#Standardization (zero mean, unit variance per column)
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data)

#Logarithmic transformation
log_data = np.log(data['feature1'])

#Power Transformation (square root)
power_data = np.sqrt(data['feature2'])

#Boxcox Transformation
# With no lmbda argument, boxcox returns a (transformed_values, fitted_lambda)
# pair; unpack it so boxcox_data holds only the transformed values.
boxcox_data, boxcox_lambda = boxcox(data['feature3'])

#Binning into three equal-width intervals with readable labels
binned_data = pd.cut(data['feature3'], bins=3, labels=['low', 'medium', 'high'])

#Polynomial Transformation
poly_data = pd.DataFrame({
    'feature_squared': data['feature1'] ** 2,
    'feature_cubed': data['feature1'] ** 3
})

#Interaction terms (element-wise product of two features)
interaction_data = data['feature1'] * data['feature2']

In [0]:
# Label / result pairs in display order; each label is handed to print exactly as written.
labelled_results = [
    ("Normalized data", normalized_data),
    ("\nStandardized data", standardized_data),
    ("\nLogarthmic data:", log_data),
    ("\nPower Transformation data:", power_data),
    ("\nBoxcox Transformation data:", boxcox_data),
    ("\nBinned data:", binned_data),
    ("\nPolynomial Transformation data:", poly_data),
    ("interaction terms:", interaction_data),
]

for label, result in labelled_results:
    print(label, result)

# 8. Dimensionality reduction

Dimensionality reduction is the process of reducing the number of features (or dimensions) in a dataset while retaining as much information as possible.

Two key approaches to reducing dimensionality:

**1. Feature Selection:**  This method chooses the most relevant features from the dataset without altering them. It helps remove redundant or irrelevant features, improving model efficiency. There are several methods for feature selection including filter methods, wrapper methods, and embedded methods.

**2. Feature Extraction:** This method creates new features by combining or transforming the original features. Several of the feature transformation techniques described earlier in this notebook can be used for this purpose. PCA is a popular technique that projects the original features onto a lower-dimensional space while preserving as much of the variance as possible.



In [0]:
from sklearn.decomposition import PCA

#Create PCA model that keeps two components
pca = PCA(n_components=2)

#Fit the PCA model on X, then project X onto the fitted components
# (fit returns the estimator itself, so the two calls can be chained)
X_reduced = pca.fit(X).transform(X)

#Print shape of reduced feature space
print(X_reduced.shape)

In [0]:
#Print number of principle components
# Number of principal components kept by the fitted PCA model.
print(pca.n_components_)

In [0]:
# Print the data projected onto the retained components.
print(X_reduced)