In [52]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [53]:
# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Read the HR dataset from a commit-pinned raw GitHub URL and preview two rows.
hrdata = pd.read_csv(
    'https://raw.githubusercontent.com/tkseneee/Dataset/dd7313e0c6487acd9ed7cd32b786782c7a1d3885/HR_data.csv'
)
print(hrdata.head(2))

   Age          Workclass  fnlwgt   Education  Education_Num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   

        Martial_Status        Occupation    Relationship    Race Gender  \
0        Never-married      Adm-clerical   Not-in-family   White   Male   
1   Married-civ-spouse   Exec-managerial         Husband   White   Male   

   Capital_Gain  Capital_Loss  Hours_per_week         Country  Target  
0          2174             0              40   United-States   <=50K  
1             0             0              13   United-States   <=50K  


In [54]:
# 1. Count the missing values in each column

In [55]:
# Per-column tally of NaN entries in the raw frame.
missing_values = hrdata.isna().sum()
print("Missing Values in each column:\n", missing_values)

Missing Values in each column:
 Age                  0
Workclass         2079
fnlwgt               0
Education            0
Education_Num        0
Martial_Status       0
Occupation        2087
Relationship         0
Race                 0
Gender               0
Capital_Gain         0
Capital_Loss         0
Hours_per_week       0
Country            656
Target               0
dtype: int64


In [56]:
# 2. Select the numeric columns

In [57]:
# Subset of hrdata holding only the numeric-dtype columns.
numeric_data = hrdata.select_dtypes(include="number")
print("Numeric Data:\n", numeric_data.head(2))

Numeric Data:
    Age  fnlwgt  Education_Num  Capital_Gain  Capital_Loss  Hours_per_week
0   39   77516             13          2174             0              40
1   50   83311             13             0             0              13


In [58]:
# 3. Compute the range of each numeric column and check whether it needs scaling

In [59]:
# Spread of every numeric column: column maximum minus column minimum.
range_numeric = numeric_data.max().sub(numeric_data.min())
print("Range of Numeric Data:\n", range_numeric)

Range of Numeric Data:
 Age                    73
fnlwgt            1472420
Education_Num          15
Capital_Gain        99999
Capital_Loss         4356
Hours_per_week         98
dtype: int64


In [60]:
# Boolean flag per column: True when the column's spread is greater than 1.
needs_scaling = range_numeric.gt(1)
print("Need Scaling:\n", needs_scaling)

Need Scaling:
 Age               True
fnlwgt            True
Education_Num     True
Capital_Gain      True
Capital_Loss      True
Hours_per_week    True
dtype: bool


In [61]:
# 4. Standardize the 'Capital_Gain' column manually (z-score)

In [62]:
# List the numeric column names to pick one for manual scaling.
print(numeric_data.columns)

Index(['Age', 'fnlwgt', 'Education_Num', 'Capital_Gain', 'Capital_Loss',
       'Hours_per_week'],
      dtype='object')


In [63]:
# Hand-rolled z-score of 'Capital_Gain': subtract the column mean, then divide
# by the column standard deviation.
# Series.std() is called with pandas' default ddof=1 (sample standard deviation).
capital_gain_column = numeric_data['Capital_Gain']
mean_capital_gain, std_capital_gain = capital_gain_column.mean(), capital_gain_column.std()
scaled = capital_gain_column.sub(mean_capital_gain).div(std_capital_gain)

print("'Capital_Gain' Column:\n", scaled.head())

'Capital_Gain' Column:
 0    0.149029
1   -0.146175
2   -0.146175
3   -0.146175
4   -0.146175
Name: Capital_Gain, dtype: float64


In [64]:
# 5. Min, max, mean and standard deviation of the manually scaled column

In [65]:
# Summary statistics of the manually standardized 'Capital_Gain' series.
# After z-scoring, the mean should be ~0 and the standard deviation 1.
min_scaled = scaled.min()
max_scaled = scaled.max()
mean_scaled = scaled.mean()
std_scaled = scaled.std()

# The label names the column that was actually scaled ('Capital_Gain');
# it previously read 'Area', which is not a column of this dataset.
print(f"Scaled 'Capital_Gain' Column - Min: {min_scaled}, Max: {max_scaled}, Mean: {mean_scaled}, Std Dev: {std_scaled}")

Scaled 'Area' Column - Min: -0.14617527774434663, Max: 13.432521690368429, Mean: 2.382935503343069e-17, Std Dev: 1.0


In [66]:
# 6. Standardize all numeric columns with StandardScaler

In [67]:
# Fit a StandardScaler on the numeric columns, then apply it to the same data.
scaler = StandardScaler().fit(numeric_data)
scaled_data = scaler.transform(numeric_data)

# Wrap the returned array in a DataFrame so the column names are kept.
scaled_data_df = pd.DataFrame(data=scaled_data, columns=numeric_data.columns)
scaled_data_df.head(5)

Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,0.02905,-1.062152,1.134136,0.149031,-0.216957,-0.034641
1,0.834363,-1.007377,1.134136,-0.146177,-0.216957,-2.218354
2,-0.04416,0.243483,-0.420668,-0.146177,-0.216957,-0.034641
3,1.053994,0.423784,-1.19807,-0.146177,-0.216957,-0.034641
4,-0.776263,1.403866,1.134136,-0.146177,-0.216957,-0.034641


In [68]:
# 7. Mean and standard deviation of the StandardScaler output

In [69]:
# Column-wise mean and standard deviation of the standardized frame.
# DataFrame.std() defaults to ddof=1 while StandardScaler divides by the
# population standard deviation, so the values land slightly above 1.
scaled_data_mean, scaled_data_std = scaled_data_df.mean(), scaled_data_df.std()

scaled_data_mean, scaled_data_std

(Age               4.551893e-17
 fnlwgt            1.279977e-16
 Education_Num    -2.373209e-17
 Capital_Gain     -6.370705e-17
 Capital_Loss     -6.234537e-17
 Hours_per_week    8.996797e-17
 dtype: float64,
 Age               1.000014
 fnlwgt            1.000014
 Education_Num     1.000014
 Capital_Gain      1.000014
 Capital_Loss      1.000014
 Hours_per_week    1.000014
 dtype: float64)

In [70]:
# 8. Recover the original values with inverse_transform

In [71]:
# Undo the standardization with the already-fitted scaler; the preview should
# show the original numeric values again.
inversed_data = scaler.inverse_transform(scaled_data)
inversed_data_df = pd.DataFrame(
    data=inversed_data,
    columns=numeric_data.columns,
)
inversed_data_df.head(5)

Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,39.0,77516.0,13.0,2174.0,0.0,40.0
1,50.0,83311.0,13.0,0.0,0.0,13.0
2,38.0,215646.0,9.0,0.0,0.0,40.0
3,53.0,234721.0,7.0,0.0,0.0,40.0
4,28.0,338409.0,13.0,0.0,0.0,40.0


In [72]:
# 9. Min-max scale the numeric columns and check the resulting bounds

In [73]:
# Fit a MinMaxScaler on the numeric columns, then apply it to the same data.
min_max_scaler = MinMaxScaler().fit(numeric_data)
min_max_scaled_data = min_max_scaler.transform(numeric_data)

min_max_scaled_data_df = pd.DataFrame(data=min_max_scaled_data, columns=numeric_data.columns)

# Per-column extremes after scaling, displayed as a (max, min) pair.
max_values = min_max_scaled_data_df.max()
min_values = min_max_scaled_data_df.min()
max_values, min_values

(Age               1.0
 fnlwgt            1.0
 Education_Num     1.0
 Capital_Gain      1.0
 Capital_Loss      1.0
 Hours_per_week    1.0
 dtype: float64,
 Age               0.0
 fnlwgt            0.0
 Education_Num     0.0
 Capital_Gain      0.0
 Capital_Loss      0.0
 Hours_per_week    0.0
 dtype: float64)

In [74]:
# 10. Scale the numeric columns with RobustScaler

In [75]:
# Fit a RobustScaler on the numeric columns, then apply it to the same data.
# NOTE(review): in the recorded output 'Capital_Gain' keeps its raw magnitude
# (2174.0), presumably because that column's interquartile range is 0 - confirm.
robust_scaler = RobustScaler().fit(numeric_data)
robust_scaled_data = robust_scaler.transform(numeric_data)

robust_scaled_data_df = pd.DataFrame(data=robust_scaled_data, columns=numeric_data.columns)
robust_scaled_data_df.head(5)

Unnamed: 0,Age,fnlwgt,Education_Num,Capital_Gain,Capital_Loss,Hours_per_week
0,0.1,-0.841482,1.0,2174.0,0.0,0.0
1,0.65,-0.793073,1.0,0.0,0.0,-5.4
2,0.05,0.312399,-0.333333,0.0,0.0,0.0
3,0.8,0.471744,-1.0,0.0,0.0,0.0
4,-0.45,1.337911,1.0,0.0,0.0,0.0
