## Importing libraries

Importing a library is done with:
  - "import libname" or "import libname as lib"
Then you can use lib.func to access the specific function you want. 
If you want to import just one function:
    - "from libname import func"
    - DO NOT USE "from libname import *", as this might have unwanted consequences (for example, silently overwriting names that are already defined)

In [1]:
# good 
import pandas as pd
# bad:
# from pandas import *
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import FunctionTransformer

import sklearn

import timeit
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## Read data from csv and basic pandas commands

In [3]:
# Load the UCI Auto MPG data. The file has no header row, so column names are
# supplied explicitly.
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
                  # Fields are separated by runs of whitespace. `sep=r'\s+'` is the
                  # documented equivalent of the deprecated `delim_whitespace=True`.
                  sep = r'\s+',
                  names = ['mpg',           # continuous
                          'cylinders',     # multi-valued discrete
                          'displacement',  # continuous
                          'horsepower',    # continuous
                          'weight',        # continuous
                          'acceleration',  # continuous
                          'model_year',    # multi-valued discrete
                          'origin',        # multi-valued discrete
                          'name',          # string (unique for each instance)
                          ],
                  # Missing values are written as '?' in the file; read them as NaN.
                  na_values = '?',
                  )

In [4]:
# you can select one column (the result is a Series) with plain bracket indexing
data['mpg']
# or, equivalently, with label-based .loc: all rows (:), column 'mpg'
data.loc[:, 'mpg']

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
5      15.0
6      14.0
7      14.0
8      14.0
9      15.0
10     15.0
11     14.0
12     15.0
13     14.0
14     24.0
15     22.0
16     18.0
17     21.0
18     27.0
19     26.0
20     25.0
21     24.0
22     25.0
23     26.0
24     21.0
25     10.0
26     10.0
27     11.0
28      9.0
29     27.0
       ... 
368    27.0
369    34.0
370    31.0
371    29.0
372    27.0
373    24.0
374    23.0
375    36.0
376    37.0
377    31.0
378    38.0
379    36.0
380    36.0
381    36.0
382    34.0
383    38.0
384    32.0
385    38.0
386    25.0
387    38.0
388    26.0
389    22.0
390    32.0
391    36.0
392    27.0
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
5      15.0
6      14.0
7      14.0
8      14.0
9      15.0
10     15.0
11     14.0
12     15.0
13     14.0
14     24.0
15     22.0
16     18.0
17     21.0
18     27.0
19     26.0
20     25.0
21     24.0
22     25.0
23     26.0
24     21.0
25     10.0
26     10.0
27     11.0
28      9.0
29     27.0
       ... 
368    27.0
369    34.0
370    31.0
371    29.0
372    27.0
373    24.0
374    23.0
375    36.0
376    37.0
377    31.0
378    38.0
379    36.0
380    36.0
381    36.0
382    34.0
383    38.0
384    32.0
385    38.0
386    25.0
387    38.0
388    26.0
389    22.0
390    32.0
391    36.0
392    27.0
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [5]:
# several columns at once: pass a list of column names (the result is a DataFrame)
data[['mpg','horsepower']]

# the same selection with label-based .loc: all rows (:), the listed columns
data.loc[:, ['mpg','horsepower']]

Unnamed: 0,mpg,horsepower
0,18.0,130.0
1,15.0,165.0
2,18.0,150.0
3,16.0,150.0
4,17.0,140.0
5,15.0,198.0
6,14.0,220.0
7,14.0,215.0
8,14.0,225.0
9,15.0,190.0


Unnamed: 0,mpg,horsepower
0,18.0,130.0
1,15.0,165.0
2,18.0,150.0
3,16.0,150.0
4,17.0,140.0
5,15.0,198.0
6,14.0,220.0
7,14.0,215.0
8,14.0,225.0
9,15.0,190.0


In [6]:
# select rows using a boolean mask
# A seeded generator draws the same ~5% sample of rows on every run, so the
# displayed table is reproducible after Restart & Run All.
rng = np.random.default_rng(42)
mask = rng.choice([True, False], size=len(data), p=[0.05, 0.95])
data.loc[mask, ['mpg','horsepower']]

Unnamed: 0,mpg,horsepower
4,17.0,140.0
12,15.0,150.0
19,26.0,46.0
38,14.0,165.0
78,21.0,87.0
91,13.0,150.0
96,13.0,175.0
119,20.0,91.0
128,15.0,100.0
130,26.0,80.0


In [7]:
# or, more usefully: flag every row that has at least one missing value
# and show only those rows (all columns)
mask = data.isna().any(axis='columns')
data.loc[mask]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
32,25.0,4,98.0,,2046.0,19.0,71,1,ford pinto
126,21.0,6,200.0,,2875.0,17.0,74,1,ford maverick
330,40.9,4,85.0,,1835.0,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,,2905.0,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,,2320.0,15.8,81,2,renault 18i
374,23.0,4,151.0,,3035.0,20.5,82,1,amc concord dl


## Vectorization in python (do not write for loops)
https://www.oreilly.com/library/view/python-for-data/9781449323592/ch04.html


In [8]:
# 10000 random integers drawn from 1..9 (the upper bound 10 is exclusive); used for the timing comparison
array = np.random.randint(low=1, high=10, size=10_000)


In [9]:
%%timeit -n 100 avg = 0
for i in array:
    avg += i
avg = avg/10000

2.48 ms ± 307 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# Vectorized version of the same average: a single NumPy call, no Python-level loop.
%timeit -n 100 np.mean(array)
    

20.3 µs ± 6.48 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# Small toy dataset: two numeric columns ('a', 'b') and one string column ('c').
dum_data = pd.DataFrame(
    {
        'a': [-1, -0.5, 0, 1],
        'b': [2, 6, 10, 18],
        'c': ['a', 'a', 'b', 'b'],
    }
)
# Target values, one per row of dum_data.
y = np.arange(1, 5)

## Data preparation/feature engineering 
Let's standardize the data: https://en.wikipedia.org/wiki/Feature_scaling

In [12]:
# Standardize by hand: subtract each column's mean, then divide the centred
# columns by their standard deviation (pandas .std()).
X = (
    dum_data[['a','b']]
    .pipe(lambda frame: frame - frame.mean())
    .pipe(lambda frame: frame / frame.std())
)

In [13]:
## or do this in a function
def standartize(X):
    """Centre X on its mean and scale it by its standard deviation.

    X must provide ``.mean()`` and ``.std()`` and support arithmetic with
    their results (e.g. a pandas DataFrame, where both are computed per
    column). Returns a new object; X itself is not modified.
    """
    centre = X.mean()
    spread = X.std()
    return (X - centre) / spread
# Apply the helper to a copy of the two numeric columns and print the result.
X = standartize(dum_data[['a','b']].copy())
print(X)

          a         b
0 -1.024695 -1.024695
1 -0.439155 -0.439155
2  0.146385  0.146385
3  1.317465  1.317465


In [14]:
# Other basic feature engineering 
X = dum_data[['a','b']].copy()
skscaler = StandardScaler()

skscaler.fit(X)
skscaler.transform(X)

skscaler.fit_transform(X)

StandardScaler()

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [15]:
# Polynomial feature expansion of the two numeric columns, without the constant (bias) column
X = dum_data[['a','b']].copy()
poly = PolynomialFeatures(include_bias = False)
# columns of the result, as seen in the output: a, b, a^2, a*b, b^2
poly.fit_transform(X)

array([[-1.00e+00,  2.00e+00,  1.00e+00, -2.00e+00,  4.00e+00],
       [-5.00e-01,  6.00e+00,  2.50e-01, -3.00e+00,  3.60e+01],
       [ 0.00e+00,  1.00e+01,  0.00e+00,  0.00e+00,  1.00e+02],
       [ 1.00e+00,  1.80e+01,  1.00e+00,  1.80e+01,  3.24e+02]])

## Use custom function

In [16]:
# from sklearn.preprocessing import FunctionTransformer

# Wrap a plain function and its inverse as a transformer: exp forward, log backward
funtran = FunctionTransformer(func = np.exp, inverse_func = np.log)

# forward direction: element-wise exp of X
funtran.transform(X)
# round trip: log(exp(X)) gives back the original values of X
funtran.inverse_transform( funtran.fit_transform(X) )

Unnamed: 0,a,b
0,0.367879,7.389056
1,0.606531,403.4288
2,1.0,22026.47
3,2.718282,65659970.0


Unnamed: 0,a,b
0,-1.0,2.0
1,-0.5,6.0
2,0.0,10.0
3,1.0,18.0


In [17]:
# Lets create transformers ourselves
# or what to do if we need something that is not in sklearn
from sklearn.utils import check_array
from sklearn.base import TransformerMixin, BaseEstimator

For more information on scikit-learn's classes and interfaces, see:  
https://scikit-learn.org/stable/developers/develop.html  
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing  

In [18]:
# Custom transformers

def checkNA(method):
    """Decorate an estimator method so that it rejects input with missing values.

    Parameters
    ----------
    method : callable
        A method taking ``(self, X, ...)``, such as ``fit`` or ``transform``.

    Returns
    -------
    callable
        A wrapper with the same name and docstring as ``method``. It raises
        ``ValueError`` if ``X`` contains a missing value; otherwise it calls
        ``method`` with all arguments unchanged and returns its result.
    """
    import functools  # local import: the notebook's import cell does not provide it

    @functools.wraps(method)
    def wrapper(self, X, *args, **kwargs):
        # Convert to an ndarray first: on a DataFrame `.any()` would return one
        # flag per column, which `if` cannot evaluate. pd.isna also copes with
        # non-float (e.g. object) data, where np.isnan would fail.
        if pd.isna(np.asarray(X)).any():
            raise ValueError("There are missing values in the data")
        # Hand the result back to the caller: fit() has to return `self` and
        # transform() the transformed data.
        return method(self, X, *args, **kwargs)
    return wrapper

In [19]:
class CustomStandardScaler(TransformerMixin, BaseEstimator):
    """Hand-written standard scaler that follows the scikit-learn transformer interface.

    ``fit`` learns the per-column mean and population standard deviation
    (``np.var`` with its default ddof=0); ``transform`` subtracts the mean and
    divides by the standard deviation.

    Attributes set by ``fit``
    -------------------------
    means : ndarray of shape (n_features,)
        Column means of the training data.
    vars : ndarray of shape (n_features,)
        Column variances of the training data.
    scale : ndarray of shape (n_features,)
        Column standard deviations, with zeros replaced by 1.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    """
    # @checkNA
    def fit(self, X, y=None):
        """Learn the column means and standard deviations of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; converted and validated with ``check_array``.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        # Use the public check_array helper instead of the private
        # BaseEstimator._validate_data, which is not part of the public API and
        # is not available on every scikit-learn version.
        X = check_array(X, estimator = self)
        self.n_features_in_ = X.shape[1]
        self.means = np.mean(X, axis = 0)
        self.vars  = np.var(X, axis=0)
        scale = np.sqrt(self.vars)
        # A constant column has zero spread; dividing by 1 instead keeps it at 0
        # after centring rather than producing NaN/inf.
        scale[scale == 0.0] = 1.0
        self.scale = scale

        return self

    def transform(self, X):
        """Centre and scale X with the statistics learned in ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to scale; must have the same number of columns as the data
            passed to ``fit``.

        Returns
        -------
        ndarray of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If the number of columns differs from the one seen in ``fit``.
        """
        X = check_array(X, estimator = self)
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                "X has %d features, but CustomStandardScaler was fitted with %d"
                % (X.shape[1], self.n_features_in_)
            )
        X = X - self.means
        X = X/self.scale
        return X

In [20]:
# Fit and apply the custom scaler in one call; the class itself only defines
# fit() and transform(), fit_transform() comes from TransformerMixin.
# The result matches the StandardScaler output shown earlier.
scaler = CustomStandardScaler()
scaler.fit_transform(X)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

## Create a custom MinMaxScaler, a custom mean imputer... a custom whatever you need.
List of available preprocessing tools:
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing


In [22]:
class CustomTranformer(): #(TransformerMixin, BaseEstimator):
    """Skeleton for your own transformer: fill in ``fit`` and ``transform``.

    As written it is a pass-through: ``fit`` learns nothing and ``transform``
    returns its input unchanged.
    """

    def fit(self, X, y=None):
        """Learn whatever the transformation needs from X and return ``self``."""
        # TODO: compute the fitted statistics here and store them on self.
        # Returning self allows chaining such as obj.fit(X).transform(X).
        return self

    def transform(self, X):
        """Return the transformed version of X (currently X itself)."""
        # TODO: replace the pass-through with the actual transformation.
        return X
    

## Sklearn transformers as part of a pipeline

In [23]:
# Chain the steps under names: the output of each transformer feeds the next
# step, and the last step is the model.
pipe = Pipeline([ ('poly', PolynomialFeatures(include_bias = False)),
                  ('scaler', StandardScaler()),
                   ('model', DummyRegressor()),
                ]
                )
# pipe.fit_transform(X)
# fit() runs every step in order; it returns the pipeline itself
pipe.fit(X, y)
# the predictions shown below are all 2.5, i.e. the mean of y = [1, 2, 3, 4]
pipe.predict(X)

Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('scaler', StandardScaler()), ('model', DummyRegressor())])

array([2.5, 2.5, 2.5, 2.5])

In [24]:
# Onehot encoder transformer
from sklearn.preprocessing import OneHotEncoder
# The keyword that switches to dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`). Keeping the default sparse result and
# converting it with .toarray() gives the same dense array on any version.
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit_transform(dum_data[['c']]).toarray()

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [25]:
# Feature union use case: run two sub-pipelines on the same input and stack
# their outputs side by side (numerical features first, then the one-hot columns).
baby_pipe = FeatureUnion([
                        ('numerical',
                        Pipeline([
                            # keep only the numeric columns
                            ('select_num', FunctionTransformer(func = lambda X: X.loc[:, ['a','b']])),
                            ('poly',       PolynomialFeatures(include_bias = False)),
                            ('scaler',     StandardScaler()),
                                ])
                        ),
                         ('categorical',
                          Pipeline([
                              # keep only the categorical column
                              ('pass_cat', FunctionTransformer(func = lambda X: X.loc[:, ['c']]) ),
                              # The dense-output keyword was renamed between scikit-learn
                              # releases (`sparse` -> `sparse_output`), so use the default
                              # sparse output and densify it in an explicit step; this
                              # runs on any version and yields the same dense array.
                              ('onehot',   OneHotEncoder(handle_unknown='ignore') ),
                              ('to_dense', FunctionTransformer(func = lambda M: M.toarray()) ),
                        ] )
                         ),
                    ])

baby_pipe.fit_transform(dum_data)

array([[-1.18321596, -1.18321596,  0.98019606, -0.61159284, -0.89625816,
         1.        ,  0.        ],
       [-0.50709255, -0.50709255, -0.70014004, -0.72808671, -0.6401844 ,
         1.        ,  0.        ],
       [ 0.16903085,  0.16903085, -1.26025208, -0.37860509, -0.12803688,
         0.        ,  1.        ],
       [ 1.52127766,  1.52127766,  0.98019606,  1.71828464,  1.66447944,
         0.        ,  1.        ]])

In [26]:
# Nest the feature-building union inside a full pipeline that ends in a model,
# so the raw DataFrame can be passed straight to fit() and predict().
super_pipe = Pipeline([ ('baby_pipe', baby_pipe),
                        ('model', DummyRegressor() )
                        ])
super_pipe.fit(dum_data, y)
super_pipe.predict(dum_data)

Pipeline(steps=[('baby_pipe',
                 FeatureUnion(transformer_list=[('numerical',
                                                 Pipeline(steps=[('select_num',
                                                                  FunctionTransformer(func=<function <lambda> at 0x000001C9CF78AAF8>)),
                                                                 ('poly',
                                                                  PolynomialFeatures(include_bias=False)),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('categorical',
                                                 Pipeline(steps=[('pass_cat',
                                                                  FunctionTransformer(func=<function <lambda> at 0x000001C9CF78A8B8>)),
                                                                 ('on

array([2.5, 2.5, 2.5, 2.5])

## Big task 
* Create a train and test pipeline for your project
* Create a benchmark with DummyRegressor
* Create some other model and compare it against the benchmark


In [27]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)