In [1]:
"""
importing libraries 
importing libary is done by using:
  - "import libname" or "import libname as lib"
then you can use lib.func to access the specific function you want. 
if you want to import just one function:
    - "from libname import func"
    - DO NOT USE "from libname import *" as this might have unwanted consequences

"""
# good 
import pandas as pd
# bad:
# from pandas import *
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import FunctionTransformer




In [2]:
#%%
# =============================================================================
# read data from csv and basic pandas commands
# =============================================================================

data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
                  delim_whitespace=True,
                  names = ['mpg',           # continuous
                          'cylinders',     # multi-valued discrete
                          'displacement',  # continuous
                          'horsepower',    # continuous
                          'weight',        # continuous
                          'acceleration',  # continuous
                          'model_year',    # multi-valued discrete
                          'origin',        # multi-valued discrete
                          'name',          #string (unique for each instance))
                          ],
                  na_values = '?',
                  )



In [3]:
#%%
# you can select one column such as 
data['mpg']
# or 
data.loc[:, 'mpg']
# several columns as
data[['mpg','horsepower']]

data.loc[:, ['mpg','horsepower']]




Unnamed: 0,mpg,horsepower
0,18.0,130.0
1,15.0,165.0
2,18.0,150.0
3,16.0,150.0
4,17.0,140.0
...,...,...
393,27.0,86.0
394,44.0,52.0
395,32.0,84.0
396,28.0,79.0


In [4]:
# select using boolean mask 
mask = np.random.choice([True, False], size=len(data), p=[0.05, 0.95])
data.loc[mask, ['mpg','horsepower']]



Unnamed: 0,mpg,horsepower
22,25.0,95.0
70,13.0,190.0
83,28.0,80.0
119,20.0,91.0
139,14.0,140.0
155,15.0,72.0
161,16.0,105.0
170,23.0,78.0
172,25.0,71.0
225,17.5,110.0


In [5]:
# or more useful
mask = data.isna().any(axis=1)
data.loc[mask, :]



Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
32,25.0,4,98.0,,2046.0,19.0,71,1,ford pinto
126,21.0,6,200.0,,2875.0,17.0,74,1,ford maverick
330,40.9,4,85.0,,1835.0,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,,2905.0,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,,2320.0,15.8,81,2,renault 18i
374,23.0,4,151.0,,3035.0,20.5,82,1,amc concord dl


In [6]:
#%%
# =============================================================================
# Vectorization in python (do not write for loops)
# https://www.oreilly.com/library/view/python-for-data/9781449323592/ch04.html
# =============================================================================
array = np.random.randint(1,10, size = 10000)
#%%
%%timeit -n 100 avg = 0
for i in array:
    avg += i
avg = avg/10000



UsageError: Line magic function `%%timeit` not found.


In [16]:


#%%
dum_data = pd.DataFrame([[-1, 2, 'a'], [-0.5, 6, 'a'], [0, 10, 'b'], [1, 18, 'b']], columns = ['a','b','c'])
#np.array([1,2,3,4])



Unnamed: 0,a,b,c
0,-1.0,2,a
1,-0.5,6,a
2,0.0,10,b
3,1.0,18,b


In [8]:
#%%
# =============================================================================
# data preparation/feature engineering 
# =============================================================================
# lets standartize the data https://en.wikipedia.org/wiki/Feature_scaling

X = dum_data[['a','b']].copy()
X = X - X.mean()
X = X/X.std()



In [9]:
#%% 
## or do this in a function 

def standartize(X):
    return (X - X.mean() )/ X.std()
X = standartize(dum_data[['a','b']].copy())
print(X)



          a         b
0 -1.024695 -1.024695
1 -0.439155 -0.439155
2  0.146385  0.146385
3  1.317465  1.317465


In [10]:
#%%
# Other basic feature engineering 

X = dum_data[['a','b']].copy()
skscaler = StandardScaler()
skscaler.fit_transform(X)
poly = PolynomialFeatures(include_bias = False)
poly.fit_transform(X)


array([[-1.00e+00,  2.00e+00,  1.00e+00, -2.00e+00,  4.00e+00],
       [-5.00e-01,  6.00e+00,  2.50e-01, -3.00e+00,  3.60e+01],
       [ 0.00e+00,  1.00e+01,  0.00e+00,  0.00e+00,  1.00e+02],
       [ 1.00e+00,  1.80e+01,  1.00e+00,  1.80e+01,  3.24e+02]])

In [11]:
#%%
# =============================================================================
# use custom function
# =============================================================================
# from sklearn.preprocessing import FunctionTransformer

funtran = FunctionTransformer(func = np.exp, inverse_func = np.log)

funtran.inverse_transform( funtran.fit_transform(X) )




Unnamed: 0,a,b
0,-1.0,2.0
1,-0.5,6.0
2,0.0,10.0
3,1.0,18.0


In [18]:
#%% Lets create transformers ourselves
# or what to do if we need something that is not in sklearn
from sklearn.utils import check_array
from sklearn.base import TransformerMixin, BaseEstimator

# classes and interfaces for more info:
# https://scikit-learn.org/stable/developers/develop.html
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
# Custom transformers

def checkNA(method):
    def wrapper(self, X):
        if np.isnan(X).any():
            raise Exception("There are missing values in the data")
        else:
            method(self, X)
    return wrapper




In [19]:
#%%
# Feature union use case 
baby_pipe = FeatureUnion([
                        ('numerical',
                        Pipeline([
                            ('select_num', FunctionTransformer(func = lambda X: X.loc[:, ['a','b']])),
                            ('poly',       PolynomialFeatures(include_bias = False)),
                            ('scaler',     StandardScaler()),
                                ])
                        ),
                         ('categorical', 
                          Pipeline([
                              ('pass_cat', FunctionTransformer(func = lambda X: X.loc[:, ['c']]) ),
                              ('onehot',   OneHotEncoder(sparse = False, handle_unknown='ignore') )
                        ] ) 
                         ),
                    ])
                        
baby_pipe.fit_transform(dum_data)

#%%
super_pipe = Pipeline([ ('baby_pipe', baby_pipe),
                        ('model', DummyRegressor() )
                        ])
super_pipe.fit(dum_data, y)
super_pipe.predict(dum_data)

# =============================================================================
# Big task 
# create train and test pipeline for your project
# create benchmark - w dummyregressor
# create some other model and 
# =============================================================================
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

NameError: name 'OneHotEncoder' is not defined

In [20]:
class CustomStandardScaler(TransformerMixin, BaseEstimator):
    
    # @checkNA
    def fit(self, X, y=None):
        X = self._validate_data(X, estimator = self)
        self.means = np.mean(X, axis = 0)
        self.vars  = np.var(X, axis=0)
        self.scale = np.sqrt(self.vars)
        
        return self
    
    def transform(self, X):
        X = self._validate_data(X, estimator = self)
        X = X - self.means
        X = X/self.scale
        return X


scaler = CustomStandardScaler()
scaler.fit_transform(X)
#%%
# =============================================================================
# crate custom minmaxscaler, custom mean imputer.. custom whatever you need.
# list of available preprocesing 
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
# =============================================================================

class CustomTranformer(): #(TransformerMixin, BaseEstimator):
    
    def fit(X, y=None):
        pass
    
    def transform(X):
        pass
    

#%%
# =============================================================================
# Sklearn transfomers as part of pipeline
# =============================================================================

pipe = Pipeline([ ('poly', PolynomialFeatures(include_bias = False)),
                  ('scaler', StandardScaler()),
                   ('model', DummyRegressor()),
                ]
                )
# pipe.fit_transform(X)
pipe.fit(X, y)
pipe.predict(X)

#%% 
# Onehot encoder transformer 
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder(sparse = False, handle_unknown='ignore')
onehot.fit_transform(dum_data[['c']])


AttributeError: 'CustomStandardScaler' object has no attribute '_validate_data'