# Feature Scaling
1. Standardization
2. Normalization

## **Standardization**

In [45]:
# Import packages
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
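# StandardScaler performs standardization: it rescales the data to have zero mean (mean = 0) and unit standard deviation (standard deviation = 1).
# MinMaxScaler performs normalization: it rescales the data to lie between a specified minimum and maximum, usually [0, 1].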

In [46]:

# Read the housing dataset from disk into the `df_housing` DataFrame.
df_housing = pd.read_csv(filepath_or_buffer='housing_data.csv')

# Preview the first five rows.
df_housing.head(n=5)

Unnamed: 0,Price,Age,Bedrooms,Bathrooms,Square_Footage
0,201958,37,2,1,2231
1,226867,7,1,2,2095
2,211932,21,4,1,3904
3,445838,9,4,3,1736
4,339178,39,4,3,2453


#### Assume that we want to predict housing price by age, number of bedrooms, number of bathrooms and square footage. We need to standardize these four features.

### <span style= "color:darkred"> Standardize </span> dataframe features and **save the standardized features as new columns in the original DataFrame** -- using
`preprocessing.StandardScaler().fit_transform()`

In [49]:
# Standardize the four predictor columns and store the results in new
# `S_`-prefixed columns alongside the originals.
feature_cols = ['Age', 'Bedrooms', 'Bathrooms', 'Square_Footage']
standardized_cols = ['S_Age', 'S_Bedrooms', 'S_Bathrooms', 'S_Square_Footage']

# Create the scaler, then fit it on (and transform) the selected feature columns.
scaler = preprocessing.StandardScaler()
df_4columns = df_housing[feature_cols]
standardized_values = scaler.fit_transform(df_4columns)

# Assign the transformed values column-by-column to the new column names.
df_housing[standardized_cols] = standardized_values

df_housing.head()

Unnamed: 0,Price,Age,Bedrooms,Bathrooms,Square_Footage,S_Age,S_Bedrooms,S_Bathrooms,S_Square_Footage
0,201958,37,2,1,2231,0.948342,-0.514038,-1.171659,-0.049078
1,226867,7,1,2,2095,-1.152859,-1.208683,0.023911,-0.1867
2,211932,21,4,1,3904,-0.172298,0.875253,-1.171659,1.643872
3,445838,9,4,3,1736,-1.012779,0.875253,1.219482,-0.549981
4,339178,39,4,3,2453,1.088422,0.875253,1.219482,0.175569


### Drop the original columns

In [51]:
# Remove the unscaled predictor columns, keeping `Price` and the `S_*` columns.
# Reassign the result instead of using `inplace=True`: the reassignment form
# is the recommended pandas idiom and makes the data lineage explicit.
df_housing = df_housing.drop(columns=['Age', 'Bedrooms', 'Bathrooms', 'Square_Footage'])
df_housing.head()

Unnamed: 0,Price,S_Age,S_Bedrooms,S_Bathrooms,S_Square_Footage
0,201958,0.948342,-0.514038,-1.171659,-0.049078
1,226867,-1.152859,-1.208683,0.023911,-0.1867
2,211932,-0.172298,0.875253,-1.171659,1.643872
3,445838,-1.012779,0.875253,1.219482,-0.549981
4,339178,1.088422,0.875253,1.219482,0.175569


### Alternatively, we can directly overwrite the columns with standardized values.

In [53]:
# Re-read the CSV so `df_housing` holds the raw, unscaled values again.
df_housing = pd.read_csv(filepath_or_buffer='housing_data.csv')

# Preview the first five rows.
df_housing.head(n=5)

Unnamed: 0,Price,Age,Bedrooms,Bathrooms,Square_Footage
0,201958,37,2,1,2231
1,226867,7,1,2,2095
2,211932,21,4,1,3904
3,445838,9,4,3,1736
4,339178,39,4,3,2453


In [54]:
# Standardize the four predictor columns and write the results back over
# the original columns (no additional columns are created).
feature_cols = ['Age', 'Bedrooms', 'Bathrooms', 'Square_Footage']

scaler = StandardScaler()
standardized_values = scaler.fit_transform(df_housing[feature_cols])
df_housing[feature_cols] = standardized_values

df_housing.head()

Unnamed: 0,Price,Age,Bedrooms,Bathrooms,Square_Footage
0,201958,0.948342,-0.514038,-1.171659,-0.049078
1,226867,-1.152859,-1.208683,0.023911,-0.1867
2,211932,-0.172298,0.875253,-1.171659,1.643872
3,445838,-1.012779,0.875253,1.219482,-0.549981
4,339178,1.088422,0.875253,1.219482,0.175569


## **Normalization**

### <span style= "color:darkred"> Normalize </span> dataframe features and overwrite the original features/columns with the normalized values

In [57]:
# Re-read the CSV so normalization starts from the raw, unscaled values.
df_housing = pd.read_csv(filepath_or_buffer='housing_data.csv')

# Preview the first five rows.
df_housing.head(n=5)

Unnamed: 0,Price,Age,Bedrooms,Bathrooms,Square_Footage
0,201958,37,2,1,2231
1,226867,7,1,2,2095
2,211932,21,4,1,3904
3,445838,9,4,3,1736
4,339178,39,4,3,2453


In [58]:
# Min-max normalize the four predictor columns and overwrite the originals.
feature_cols = ['Age', 'Bedrooms', 'Bathrooms', 'Square_Footage']

# Initialize MinMaxScaler with its default settings.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(df_housing[feature_cols])

# Round the scaled values to two decimal places before storing them.
df_housing[feature_cols] = normalized_values.round(2)

df_housing.head()

Unnamed: 0,Price,Age,Bedrooms,Bathrooms,Square_Footage
0,201958,0.76,0.25,0.0,0.49
1,226867,0.11,0.0,0.5,0.45
2,211932,0.41,0.75,0.0,1.0
3,445838,0.15,0.75,1.0,0.34
4,339178,0.8,0.75,1.0,0.56


### <span style="color:orangered"> Practice </span>
Given the dataset `lifestyle_example_cleaned.csv`, which is stored in the `Data cleaning/` folder,
use the standard scaler to standardize the columns `sunshine_hours` and `obesity_levels`, and replace the original columns with the standardized columns.


In [99]:
# Read the cleaned lifestyle dataset from the sibling `Data cleaning/` folder.
df_clean = pd.read_csv(filepath_or_buffer='../Data cleaning/lifestyle_example_cleaned.csv')
# Preview the first five rows.
df_clean.head(n=5)

Unnamed: 0,sunshine_hours,cost_bottled_water,obesity_levels,life_expectancy
0,1858.0,1.92,20.4,81.2
1,1884.0,1.94,20.1,81.0
2,1821.0,1.72,20.6,81.8
3,1630.0,2.19,19.7,79.8
4,1662.0,1.6,22.2,80.4


In [95]:
# Standardize `sunshine_hours` and `obesity_levels` and overwrite the originals.
# df[column_name] selects a single column; to select several columns, pass a
# list of column names instead. The inner brackets below form that list.
target_cols = ['sunshine_hours', 'obesity_levels']

scaler = preprocessing.StandardScaler()
df_2columns = df_clean[target_cols]
standardized_values = scaler.fit_transform(df_2columns)
df_clean[target_cols] = standardized_values

df_clean.head()

Unnamed: 0,sunshine_hours,cost_bottled_water,obesity_levels,life_expectancy
0,-0.678115,1.92,0.882473,81.2
1,-0.632422,1.94,0.856705,81.0
2,-0.743141,1.72,0.899652,81.8
3,-1.078812,2.19,0.822347,79.8
4,-1.022574,1.6,1.037081,80.4
