Handling Numerical Data - Gungun Jain | T085

3.1 Rescaling a Feature

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Read the dataset from disk.
df = pd.read_csv("/content/adult.csv")

# Columns that will be rescaled; every other column is left as-is.
numeric_cols = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]

# Fit a min-max scaler on the selected columns and compute their scaled values.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numeric_cols])

# Write the scaled values into a copy so the original frame `df` is not modified.
df_scaled = df.copy()
df_scaled[numeric_cols] = scaled_values

# Show the first rows of the rescaled frame.
print(df_scaled.head())

        age workclass    fnlwgt     education  education.num marital.status  \
0  1.000000         ?  0.043987       HS-grad       0.533333        Widowed   
1  0.890411   Private  0.081896       HS-grad       0.533333        Widowed   
2  0.671233         ?  0.118021  Some-college       0.600000        Widowed   
3  0.506849   Private  0.086982       7th-8th       0.200000       Divorced   
4  0.328767   Private  0.171404  Some-college       0.600000      Separated   

          occupation   relationship   race     sex  capital.gain  \
0                  ?  Not-in-family  White  Female           0.0   
1    Exec-managerial  Not-in-family  White  Female           0.0   
2                  ?      Unmarried  Black  Female           0.0   
3  Machine-op-inspct      Unmarried  White  Female           0.0   
4     Prof-specialty      Own-child  White  Female           0.0   

   capital.loss  hours.per.week native.country income  
0      1.000000        0.397959  United-States  <=50K  
1   

3.2 Standardizing a Feature

In [None]:
import pandas as pd
from sklearn import preprocessing
# Read the dataset from disk.
df = pd.read_csv("/content/adult.csv")

# Select `age` with double brackets so the result is a 2-D, single-column array.
feature = df[['age']].values

# Standardize the feature: fit the scaler and transform in one call.
scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(feature)

# Show the first 10 standardized values.
print(standardized[:10])

# Report the mean (rounded to the nearest integer) and the standard deviation
# of the standardized values.
print("Mean and Standard Deviation")
rounded_mean = round(standardized.mean())
print("Mean: {}".format(rounded_mean))
std_dev = standardized.std()
print("Standard Deviation: {}".format(std_dev))

# Repeat the scaling on the same feature with RobustScaler.
print("Using RobustScaler")
robust_scaler = preprocessing.RobustScaler()
robust_scaled = robust_scaler.fit_transform(feature)

# Show the first 10 robust-scaled values.
print(robust_scaled[:10])


[[ 3.76961234]
 [ 3.18311167]
 [ 2.01011032]
 [ 1.13035932]
 [ 0.17729573]
 [-0.33589236]
 [-0.04264203]
 [ 2.59661099]
 [ 2.15673549]
 [ 0.17729573]]
Mean and Standard Deviation
Mean: 0
Standard Deviation: 1.0
Using RobustScaler
[[ 2.65]
 [ 2.25]
 [ 1.45]
 [ 0.85]
 [ 0.2 ]
 [-0.15]
 [ 0.05]
 [ 1.85]
 [ 1.55]
 [ 0.2 ]]


3.3 Normalizing Observations

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer

# Read the dataset from disk.
df = pd.read_csv("/content/adult.csv")

# Build a two-column feature matrix from `age` and `fnlwgt`.
features = df[['age', 'fnlwgt']].values

# Normalizer configured to use the L2 norm.
normalizer = Normalizer(norm="l2")

# Normalize only the first 10 observations and print the result.
first_ten_rows = features[:10]
print(normalizer.transform(first_ten_rows))


[[1.16802641e-03 9.99999318e-01]
 [6.17144460e-04 9.99999810e-01]
 [3.54722354e-04 9.99999937e-01]
 [3.84727705e-04 9.99999926e-01]
 [1.54913983e-04 9.99999988e-01]
 [1.56780284e-04 9.99999988e-01]
 [2.52322354e-04 9.99999968e-01]
 [8.34856091e-04 9.99999652e-01]
 [1.61132475e-04 9.99999987e-01]
 [5.85404757e-04 9.99999829e-01]]


In [None]:
# L1 normalization of the full feature matrix.
# Relies on `features` and `Normalizer` from the previous cell.
l1_normalizer = Normalizer(norm="l1")
features_l1_norm = l1_normalizer.transform(features)

# Print the sum of the first observation's normalized values.
first_row_sum = features_l1_norm[0].sum()
print("Sum of the first observation's values: {}".format(first_row_sum))


Sum of the first observation's values: 1.0


3.4 Grouping Observations Using Clustering

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
# Read the dataset from disk.
df = pd.read_csv("/content/adult.csv")

# Feature matrix used for clustering: the `age` and `fnlwgt` columns.
features = df[['age', 'fnlwgt']].values

# Build a 3-cluster k-means model with a fixed random_state and fit it;
# `fit` returns the fitted estimator, so construction and fitting are chained.
clusterer = KMeans(n_clusters=3, random_state=0).fit(features)

# Store each observation's predicted cluster in a new `group` column on `df`.
df['group'] = clusterer.predict(features)

# Display the first six rows, including the new `group` column.
df.head(6)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,group
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,1
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,1
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,1
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K,0


3.5 Deleting Observations with Missing Values

In [None]:
import pandas as pd

# Read the dataset from disk.
df = pd.read_csv("/content/adult.csv")

# Keep only the `age` and `fnlwgt` columns.
df2 = df[['age', 'fnlwgt']]

# Drop rows that contain missing values and display the first 10 remaining rows.
# `dropna()` returns a new frame; `df2` itself is unchanged.
df2.dropna().head(10)

Unnamed: 0,age,fnlwgt
0,90,77053
1,82,132870
2,66,186061
3,54,140359
4,41,264663
5,34,216864
6,38,150601
7,74,88638
8,68,422013
9,41,70037


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset from the same location used by the other cells in this
# notebook, so this cell does not depend on the kernel's working directory.
df = pd.read_csv("/content/adult.csv")

# Build a two-column feature matrix from `age` and `fnlwgt`.
features = df[['age', 'fnlwgt']].values

# Standardize the features before imputing.
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Remember the real value of the first cell, then overwrite it with NaN to
# simulate a missing value.
true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

# Create an imputer that fills missing entries using the "mean" strategy.
mean_imputer = SimpleImputer(strategy="mean")

# Fit the imputer and fill in the missing value.
features_mean_imputed = mean_imputer.fit_transform(standardized_features)

# Compare the value that was removed with the value the imputer filled in.
print("True Value: {}".format(true_value))
print("Imputed Value: {}".format(features_mean_imputed[0, 0]))


True Value: 3.7696123367431658
Imputed Value: -0.00011577433466659666
