# Lab Assignment 2
*Hariesh R - 23110344*

In [39]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
import warnings

In [40]:
warnings.filterwarnings("ignore")

# Robust Scaler

`RobustScaler` is a scaler used in machine learning to scale the numerical features of a dataset. It is part of the `sklearn.preprocessing` module of scikit-learn, a popular Python library for machine learning.

The RobustScaler is similar to the StandardScaler, but it is more robust to outliers in the data. While the StandardScaler uses the mean and standard deviation of the data to scale the features, the RobustScaler uses the median and interquartile range (IQR) instead.

The median is used as the central tendency measure instead of the mean, which makes it more robust to outliers.

The IQR (interquartile range) is used as the measure of spread instead of the standard deviation. The IQR is the difference between the 75th percentile and the 25th percentile; because it depends only on the middle half of the data, it is more resistant to outliers than the standard deviation.

By using the median and IQR, the RobustScaler is less affected by extreme values in the data, which can be useful when dealing with datasets that contain outliers.

In [41]:
# Toy 3x3 feature matrix: one sample per row, one feature per column.
X = [
    [1.0, -2.0, 2.0],
    [-2.0, 1.0, 3.0],
    [4.0, 1.0, -2.0],
]

In [42]:
# Create a RobustScaler with its default settings (no arguments are passed).
robustscaler = RobustScaler()

In [43]:
# Fit the scaler on X and transform X in one call; as the last expression of
# the cell, the scaled array is displayed as the cell output.
robustscaler.fit_transform(X)

array([[ 0. , -2. ,  0. ],
       [-1. ,  0. ,  0.4],
       [ 1. ,  0. , -1.6]])

# Standard Scaler

In [44]:
# Toy 3x3 feature matrix: one sample per row, one feature per column.
X = [
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
]

In [45]:
# Create a StandardScaler with its default settings (no arguments are passed).
standardscaler = StandardScaler()

In [46]:
# Fit the scaler on X and transform X in one call; as the last expression of
# the cell, the scaled array is displayed as the cell output.
standardscaler.fit_transform(X)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

# MaxAbsScaler

In [47]:
# Toy 3x3 feature matrix: one sample per row, one feature per column.
X = [
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
]

In [48]:
# Create a MaxAbsScaler with its default settings (no arguments are passed).
maxabsscaler = MaxAbsScaler()

In [49]:
# Fit the scaler on X and transform X in one call. In the displayed output
# each column has been divided by its largest absolute value.
maxabsscaler.fit_transform(X)

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

# OneHotEncoding

### Method 1: `pandas.get_dummies`

In [50]:
# Raw records: [gender label, numeric value].
data = [
    ["Male", 1],
    ["Female", 3],
    ["Female", 2],
]

In [51]:
# Wrap the raw records in a DataFrame with named columns, then display it.
df = pd.DataFrame(data, columns=["Gender", "Value"])
df

Unnamed: 0,Gender,Value
0,Male,1
1,Female,3
2,Female,2


In [52]:
# One-hot encode the DataFrame: the string column `Gender` is expanded into
# one indicator column per category, while `Value` is carried over as-is.
encoded_data = pd.get_dummies(df)
encoded_data

Unnamed: 0,Value,Gender_Female,Gender_Male
0,1,False,True
1,3,True,False
2,2,True,False


In [53]:
# Reorder the columns so the indicator columns come first and `Value` last.
encoded_data = encoded_data.loc[:, ["Gender_Male", "Gender_Female", "Value"]]
encoded_data

Unnamed: 0,Gender_Male,Gender_Female,Value
0,True,False,1
1,False,True,3
2,False,True,2


In [54]:
# Convert the encoded frame to a NumPy array. Because the frame mixes boolean
# indicator columns with the integer `Value` column, the displayed result has
# dtype=object rather than a numeric dtype.
encoded_data.to_numpy()

array([[True, False, 1],
       [False, True, 3],
       [False, True, 2]], dtype=object)

### Method 2: scikit-learn `OneHotEncoder`

In [55]:
# Ask for a dense NumPy array instead of a SciPy sparse matrix.
# `sparse_output` replaces the old `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (passing `sparse=` now raises TypeError).
encoder = OneHotEncoder(sparse_output=False)

In [56]:
# Fit the encoder on the `Gender` column and encode it in one call.
# The double brackets select a one-column DataFrame (2-D) rather than a Series.
encoded_gender = encoder.fit_transform(df[['Gender']])
encoded_gender

array([[0., 1.],
       [1., 0.],
       [1., 0.]])

In [57]:
# Label the encoded array with the feature names generated by the encoder.
encoded_gender_df = pd.DataFrame(
    data=encoded_gender,
    columns=encoder.get_feature_names_out(["Gender"]),
)
encoded_gender_df

Unnamed: 0,Gender_Female,Gender_Male
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0


In [58]:
# Place the encoded gender columns next to the original `Value` column.
result = pd.concat(objs=[encoded_gender_df, df["Value"]], axis="columns")
result

Unnamed: 0,Gender_Female,Gender_Male,Value
0,0.0,1.0,1
1,1.0,0.0,3
2,1.0,0.0,2


In [59]:
# Convert the combined frame to a NumPy array; the displayed result is a
# purely numeric (float) array.
result.to_numpy()

array([[0., 1., 1.],
       [1., 0., 3.],
       [1., 0., 2.]])

# MultiLabelBinarizer

In [60]:
# Each sample is a set of labels; a sample may carry more than one label.
data = [
    {"sci-fi", "thriller"},
    {"comedy"},
]

In [61]:
# Create a MultiLabelBinarizer with its default settings (no arguments are passed).
mlb = MultiLabelBinarizer()

In [62]:
# Fit on the label sets and encode them in one call. The displayed result has
# one row per sample and one 0/1 column per distinct label.
mlb.fit_transform(data)

array([[0, 1, 1],
       [1, 0, 0]])

In [63]:
# Show the labels found during fitting; their order corresponds to the columns
# of the binarized array displayed by the previous cell.
mlb.classes_

array(['comedy', 'sci-fi', 'thriller'], dtype=object)