In [25]:

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
warnings.filterwarnings("ignore")
hrdata=pd.read_csv('https://raw.githubusercontent.com/tkseneee/Dataset/dd7313e0c6487acd9ed7cd32b786782c7a1d3885/HR_data.csv')
hrdata.head(2)

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Gender,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


1. Checking for missing values:

In [None]:
hrdata.isnull().sum()

Age                  0
Workclass         2079
fnlwgt               0
Education            0
Education_Num        0
Martial_Status       0
Occupation        2087
Relationship         0
Race                 0
Gender               0
Capital_Gain         0
Capital_Loss         0
Hours_per_week       0
Country            656
Target               0
dtype: int64

2. Keep only the Numeric values in one variable

In [None]:
hrdata_num=hrdata.select_dtypes(include=np.number)
hrdata_num.isnull().sum()

Age               0
fnlwgt            0
Education_Num     0
Capital_Gain      0
Capital_Loss      0
Hours_per_week    0
dtype: int64

3. Compute the Range and comment about the need of scaling for this dataset

In [None]:
range = hrdata_num.max() - hrdata_num.min()
print(range)

Age                    73
fnlwgt            1472420
Education_Num          15
Capital_Gain        99999
Capital_Loss         4356
Hours_per_week         98
dtype: int64


In [None]:
need_scaling=range>1
print(need_scaling)

Age               True
fnlwgt            True
Education_Num     True
Capital_Gain      True
Capital_Loss      True
Hours_per_week    True
dtype: bool


4. Perform standard scaling on the 'Area' column without using sklearn function

In [None]:
capital_gain_col=hrdata_num['Capital_Gain']
scaled_data=(capital_gain_col-capital_gain_col.mean())/capital_gain_col.std()


In [None]:
print(scaled_data.head())

0    0.149029
1   -0.146175
2   -0.146175
3   -0.146175
4   -0.146175
Name: Capital_Gain, dtype: float64


5. What is the min,max, mean and standard deviation of standard scaled data. Check these value for the scaled area column computed in the previous question

In [23]:
min_scaled=scaled_data.min()
max_scaled=scaled_data.max()
mean_scaled=scaled_data.mean()
std_scaled=scaled_data.std()

print(f"Scaled Min: {min_scaled}, Scaled Max: {max_scaled}, Scaled Mean: {mean_scaled}, Scaled Std Dev: {std_scaled}")


Scaled Min: -0.14617527774434663, Scaled Max: 13.432521690368429, Scaled Mean: 2.382935503343069e-17, Scaled Std Dev: 1.0


6. Apply Standard Scaler to all the column of the data using sklearn function

In [26]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(hrdata_num)

scaled_data_df = pd.DataFrame(scaled_data, columns=hrdata_num.columns)
scaled_data_df.head()

Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,0.02905,-1.062152,1.134136,0.149031,-0.216957,-0.034641
1,0.834363,-1.007377,1.134136,-0.146177,-0.216957,-2.218354
2,-0.04416,0.243483,-0.420668,-0.146177,-0.216957,-0.034641
3,1.053994,0.423784,-1.19807,-0.146177,-0.216957,-0.034641
4,-0.776263,1.403866,1.134136,-0.146177,-0.216957,-0.034641


7. Check whether all the scaled data mean and standard deviation is 0 and 1 respectively

In [30]:
scaled_data_mean = scaled_data_df.mean()
scaled_data_std = scaled_data_df.std()

mean_check = np.allclose(scaled_data_mean, 0, atol=1e-2)
std_dev_check = np.allclose(scaled_data_std, 1, atol=1e-2)

(scaled_data_mean, scaled_data_std)

(Age               4.551893e-17
 fnlwgt            1.279977e-16
 Education_Num    -2.373209e-17
 Capital_Gain     -6.370705e-17
 Capital_Loss     -6.234537e-17
 Hours_per_week    8.996797e-17
 dtype: float64,
 Age               1.000014
 fnlwgt            1.000014
 Education_Num     1.000014
 Capital_Gain      1.000014
 Capital_Loss      1.000014
 Hours_per_week    1.000014
 dtype: float64)

In [35]:
mean_scaled_data = scaled_data_df.mean()
std_scaled_data = scaled_data_df.std()

(mean_scaled_data, std_scaled_data)

(Age               4.551893e-17
 fnlwgt            1.279977e-16
 Education_Num    -2.373209e-17
 Capital_Gain     -6.370705e-17
 Capital_Loss     -6.234537e-17
 Hours_per_week    8.996797e-17
 dtype: float64,
 Age               1.000014
 fnlwgt            1.000014
 Education_Num     1.000014
 Capital_Gain      1.000014
 Capital_Loss      1.000014
 Hours_per_week    1.000014
 dtype: float64)

8.Inverse the scaled data to the original form

In [28]:
inversed_data = scaler.inverse_transform(scaled_data)
inversed_data_df = pd.DataFrame(inversed_data, columns=hrdata_num.columns)
inversed_data_df.head()

Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,39.0,77516.0,13.0,2174.0,0.0,40.0
1,50.0,83311.0,13.0,0.0,0.0,13.0
2,38.0,215646.0,9.0,0.0,0.0,40.0
3,53.0,234721.0,7.0,0.0,0.0,40.0
4,28.0,338409.0,13.0,0.0,0.0,40.0


9.Apply Min-max scaling to the original numeric data and print its max and minimum values for all the columns

In [34]:
minmax_scaler = MinMaxScaler()
minmax_scaled_data = minmax_scaler.fit_transform(hrdata_num)

minmax_scaled_data_df = pd.DataFrame(minmax_scaled_data, columns=hrdata_num.columns)

(max_values, min_values) = (minmax_scaled_data_df.max(), minmax_scaled_data_df.min())
(max_values, min_values)

(Age               1.0
 fnlwgt            1.0
 Education_Num     1.0
 Capital_Gain      1.0
 Capital_Loss      1.0
 Hours_per_week    1.0
 dtype: float64,
 Age               0.0
 fnlwgt            0.0
 Education_Num     0.0
 Capital_Gain      0.0
 Capital_Loss      0.0
 Hours_per_week    0.0
 dtype: float64)

10. Apply Robust Scaler to scale the data

In [33]:
robust = RobustScaler()
robustdata = robust.fit_transform(hrdata_num)

robustdata_df = pd.DataFrame(robustdata, columns=hrdata_num.columns)
robustdata_df.head()

Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,0.1,-0.841482,1.0,2174.0,0.0,0.0
1,0.65,-0.793073,1.0,0.0,0.0,-5.4
2,0.05,0.312399,-0.333333,0.0,0.0,0.0
3,0.8,0.471744,-1.0,0.0,0.0,0.0
4,-0.45,1.337911,1.0,0.0,0.0,0.0
