In [None]:
# What is FunctionTransformer in machine learning?

# FunctionTransformer is a class in scikit-learn, a popular Python
# library for machine learning, that wraps an arbitrary function so
# it can be applied to the input data like any other transformer.
# It is useful for performing custom transformations of input data
# inside a machine learning pipeline.

In [None]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

In [None]:
# Build a small 2x2 integer sample
X = np.asarray([[1, 2], [3, 4]])

# Wrap NumPy's log1p, i.e. log(1 + x), as a scikit-learn transformer
log_transform = FunctionTransformer(func=np.log1p)

# Run the wrapped function over the sample; no fit() call is made first
X_transformed = log_transform.transform(X)

# Show the element-wise result
print(X_transformed)

[[0.69314718 1.09861229]
 [1.38629436 1.60943791]]


In [None]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Sample input: two rows, two columns
X = np.asarray([[1, 2], [3, 4]])


def squ(X):
    """Return X horizontally stacked with its element-wise squares."""
    squared = X ** 2
    return np.hstack((X, squared))

# Expose squ through the scikit-learn transformer interface
custom_transformer = FunctionTransformer(func=squ)

# Run it on X directly; no fit() call is made first
X_transformed = custom_transformer.transform(X)

# Show the transformed array
print(X_transformed)

[[ 1  2  1  4]
 [ 3  4  9 16]]


In [None]:
# Exercise: array of the integers 21-50 --> keep only the even numbers --> using FunctionTransformer

In [None]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Integers 21 through 50 (np.arange excludes the stop value 51)
X = np.arange(21, 51)


def squ(X):
    """Return only the even-valued entries of X, selected with a boolean mask.

    NOTE: despite its name, this function filters values; it does not square them.
    """
    is_even = (X % 2) == 0
    return X[is_even]

# Expose the filter function through the scikit-learn transformer interface
custom_transformer = FunctionTransformer(func=squ)

# Run it on X directly; no fit() call is made first
X_transformed = custom_transformer.transform(X)

print(X_transformed)

[22 24 26 28 30 32 34 36 38 40 42 44 46 48 50]


In [None]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Sample input: two rows, two columns
X = np.asarray([[1, 2], [3, 4]])


def my_scaling(X):
    """Divide every entry of X by the largest entry found anywhere in X."""
    largest = np.max(X)
    return X / largest

# Expose my_scaling through the scikit-learn transformer interface
custom_transformer = FunctionTransformer(func=my_scaling)

# Run it on X directly; no fit() call is made first
X_transformed = custom_transformer.transform(X)

# Show the scaled array
print(X_transformed)

[[0.25 0.5 ]
 [0.75 1.  ]]


In [None]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Sample input containing one missing value (NaN)
X = np.array([[1, 2], [3, np.nan]])


def my_cleaning(X):
    """Return a copy of X in which every NaN entry is replaced by 0.

    The replacement is done on a copy, so the object passed in by the
    caller still holds its NaN entries afterwards.
    """
    cleaned = X.copy()
    cleaned[np.isnan(cleaned)] = 0
    return cleaned

# Expose my_cleaning through the scikit-learn transformer interface
custom_transformer = FunctionTransformer(func=my_cleaning)

# Run it on X directly; no fit() call is made first
X_transformed = custom_transformer.transform(X)

# Show the cleaned array
print(X_transformed)


[[1. 2.]
 [3. 0.]]


In [None]:
import pandas as pd

In [None]:
# Load the insurance CSV; the absolute /content path must exist in the runtime
df = pd.read_csv(filepath_or_buffer='/content/insurance - insurance.csv')

In [None]:
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [None]:
lb = LabelEncoder()

In [None]:
# Replace each text column with integer codes, in the same order as before.
# lb is refitted for every column, so afterwards it only reflects 'region'.
for column in ('smoker', 'sex', 'region'):
    df[column] = lb.fit_transform(df[column])

In [None]:
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523


In [None]:
df.shape

(1338, 7)

In [None]:
# Every column of df except 'charges'
x = df.drop(columns='charges')
x.shape

(1338, 6)

In [None]:
# The 'charges' column on its own, as a Series
y = df.loc[:, 'charges']
y.shape

(1338,)

In [None]:
# Wrap NumPy's log1p, i.e. log(1 + x), as a scikit-learn transformer
ft = FunctionTransformer(func=np.log1p)

# Apply it to every column of x; no fit() call is made first
a = ft.transform(x)

print(a)

           age       sex       bmi  children    smoker    region
0     2.995732  0.000000  3.363842  0.000000  0.693147  1.386294
1     2.944439  0.693147  3.548755  0.693147  0.000000  1.098612
2     3.367296  0.693147  3.526361  1.386294  0.000000  1.098612
3     3.526361  0.693147  3.165686  0.000000  0.000000  0.693147
4     3.496508  0.693147  3.397189  0.000000  0.000000  0.693147
...        ...       ...       ...       ...       ...       ...
1333  3.931826  0.693147  3.464798  1.386294  0.000000  0.693147
1334  2.944439  0.000000  3.494080  0.000000  0.000000  0.000000
1335  2.944439  0.000000  3.633631  0.000000  0.000000  1.098612
1336  3.091042  0.000000  3.288402  0.000000  0.000000  1.386294
1337  4.127134  0.000000  3.403528  0.000000  0.693147  0.693147

[1338 rows x 6 columns]


In [None]:
# -- covid data --> missing value --> categorical to numerical --> numpy.sqrt

In [None]:
import pandas as pd
# Load the covid toy CSV; this rebinds df, replacing the frame loaded earlier
df = pd.read_csv(filepath_or_buffer='/content/covid_toy - covid_toy.csv')

In [None]:
df.head(4)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace each text column with integer codes, in the same order as before.
# lb is the LabelEncoder instance created earlier in the notebook; it is
# refitted for every column, so afterwards it only reflects 'city'.
for column in ('cough', 'gender', 'has_covid', 'city'):
    df[column] = lb.fit_transform(df[column])

In [None]:
df.head(4)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,2,0
1,27,1,100.0,0,1,1
2,42,1,101.0,0,1,0
3,31,0,98.0,0,2,0


In [None]:
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


In [None]:
# Fill missing 'fever' values with the column mean. The result is assigned
# back rather than using fillna(..., inplace=True) on df['fever']: that form
# acts on an intermediate object, and pandas warns it will no longer update
# df starting with pandas 3.0.
df['fever'] = df['fever'].fillna(df['fever'].mean())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['fever'].fillna(df['fever'].mean(), inplace = True)


In [None]:
df.head(3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,2,0
1,27,1,100.0,0,1,1
2,42,1,101.0,0,1,0


In [None]:
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,0
cough,0
city,0
has_covid,0


In [None]:
# Wrap NumPy's element-wise square root as a scikit-learn transformer
ft = FunctionTransformer(np.sqrt)

# Transform df, the covid data prepared in this section. `x` must not be used
# here: it still refers to the insurance features built earlier in the notebook.
a = ft.transform(df)

print(a)

           age  sex       bmi  children  smoker    region
0     4.358899  0.0  5.282045  0.000000     1.0  1.732051
1     4.242641  1.0  5.811196  1.000000     0.0  1.414214
2     5.291503  1.0  5.744563  1.732051     0.0  1.414214
3     5.744563  1.0  4.764976  0.000000     0.0  1.000000
4     5.656854  1.0  5.374012  0.000000     0.0  1.000000
...        ...  ...       ...       ...     ...       ...
1333  7.071068  1.0  5.565070  1.732051     0.0  1.000000
1334  4.242641  0.0  5.649779  0.000000     0.0  0.000000
1335  4.242641  0.0  6.070420  0.000000     0.0  1.414214
1336  4.582576  0.0  5.079370  0.000000     0.0  1.732051
1337  7.810250  0.0  5.391660  0.000000     1.0  1.000000

[1338 rows x 6 columns]


In [None]:
np.sqrt(60)

np.float64(7.745966692414834)

In [None]:
# Apply the transformer held in ft to every column of df and display the result
b = ft.transform(X=df)
b

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,7.745967,1.0,10.148892,0.0,1.414214,0.0
1,5.196152,1.0,10.000000,0.0,1.000000,1.0
2,6.480741,1.0,10.049876,0.0,1.000000,0.0
3,5.567764,0.0,9.899495,0.0,1.414214,0.0
4,8.062258,0.0,10.049876,0.0,1.732051,0.0
...,...,...,...,...,...,...
95,3.464102,0.0,10.198039,0.0,0.000000,0.0
96,7.141428,0.0,10.049876,1.0,1.414214,1.0
97,4.472136,0.0,10.049876,0.0,0.000000,0.0
98,2.236068,0.0,9.899495,1.0,1.732051,0.0
