In [None]:
# Import from the public `IPython.display` module: pulling `display` and
# `clear_output` from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML, clear_output

# Inject a CSS rule that stretches the notebook container to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

<br />
<img src="nethone.png" />
<br />

Created by: damian.mironiuk@nethone.com

# JUPYTER NOTEBOOK
```
$ jupyter notebook
```

##### Basic commands
* `Esc` - enter the "command mode" (notice: the margin turns blue)
* `Enter` - enter the "edit mode" (notice: the margin turns green)
* `Ctrl-Enter` - execute current code cell
* `Shift-Enter` - execute current cell + (go to / create next) code cell 
* `dd` - delete cell (in command mode)
* `Z` - undo delete cell (in command mode)
* `Tab` - code completion
* `a` - new code cell **a**bove
* `b` - new code cell **b**elow
* `Shift-Tab` - show signature + first line of the docstring
* `Shift-Tab-Tab` - show full docstring
* `<function>?` - docstring of the `<function>`
* `<function>??` - full code (!) of the `<function>`

For more, check out: `Help -> Keyboard shortcuts`

In [None]:
import math


math.sqrt(5)

# PANDAS
<br />
<img src="https://images-na.ssl-images-amazon.com/images/I/81S-3ziyseL.jpg" style="height: 400px" />
<br />

In [None]:
import pandas as pd

In [None]:
pd.DataFrame({
    'a': [1,2,3],
    'b': [4,5,6]   
})

### But first, a few words about `class`-es

In [None]:
# This is the smallest definition of a class possible:
# just a name and an empty body (`pass`), with no attributes and no methods
class Example:
    pass


e = Example()

In [None]:
# It's not very useful (we can add attributes and print them)
e.name = 'first example'
e.value = 1
print('name: "{}"'.format(e.name))
print('value: {}'.format(e.value))
# ... but it works

In [None]:
# Let's use a cleaner way of setting the attributes
# at initialization, i.e. let's add the __init__ method
class Example:
    """Example object whose attributes are provided at construction time."""

    def __init__(self, name, value):
        # Keep both constructor arguments as instance attributes.
        self.name, self.value = name, value
        
        
e = Example('second example', 2)
print('name: "{}", value: {}'.format(e.name, e.value))

In [None]:
# OK, we're making some progress; let's add a method
# for printing out the attributes
class Example:
    """Example object that can print its own attributes."""

    def __init__(self, name, value):
        # Keep both constructor arguments as instance attributes.
        self.name, self.value = name, value

    def print_out(self):
        """Write the instance's name and value to standard output."""
        template = 'name: "{}", value: {}'
        line = template.format(self.name, self.value)
        print(line)
        
        
e = Example('third example', 3)
e.print_out()

In [None]:
# And the last example, with inheritance
class Example(pd.DataFrame):
    """A pd.DataFrame subclass extended with one extra method."""

    def say_hi(self):
        """Print a short greeting identifying this subclass."""
        greeting = "Hi, I'm an instance of a class inheriting from pd.DataFrame"
        print(greeting)
        

e = Example({
    'a': [1,2,3],
    'b': [4,5,6],
})
e.say_hi()
e

## `pd.DataFrame`
<img src="dataframe.jpg" /><br />

Source: https://stackoverflow.com/questions/25773245/ambiguity-in-pandas-dataframe-numpy-array-axis-definition

In [None]:
pd.DataFrame({
    'letter': list('abcd'),
    'number': [1, 2, 3, 4]
})

In [None]:
df = pd.DataFrame(
    [
        ['PL',  0.1,    11,   'user@gmail.com'],
        ['PL',  '0.2',  None, 'user@gmail.pl'],
        ['PL',  None,   33,   ''],
        ['RU',  1.5,    44,   'user@gmail.ru'],
    ],
    columns = ['country', 'mixed', 'age', 'email']
)
df

In [None]:
from datetime import date


df['date'] = date.today()
df

In [None]:
df['colors'] = ['red', 'green', 'blue', 'white']
df

In [None]:
# Column type
# NOTE: None vs NaN
print(df.dtypes)
df

In [None]:
import numpy as np


df.select_dtypes(include=[np.number])

In [None]:
df.select_dtypes(exclude=[np.number])

In [None]:
def print_type(obj):
    """Print the type of `obj`, prefixed with the label 'Type:'."""
    obj_type = type(obj)
    print('Type:', obj_type)

In [None]:
# Column selection
# __getitem__ with ONE column name --> pd.Series
letters = df['country']
print_type(letters)
letters

In [None]:
# Column selection
# __getitem__ with a LIST of column names --> pd.DataFrame
columns = ['country']  # There can be more elements in the list
letters = df[columns]
print_type(letters)
letters

In [None]:
# Row selection
selector = (df['country'] == 'PL')
print_type(selector)
selector

In [None]:
# __getitem__ with a boolean array or pd.Series
df[selector]

## `loc[index, column(s)]`

In [None]:
# ":" - in this context, this means "whole range" (or: "all elements")
row_1 = df.loc[1, :]
print_type(row_1)
row_1

In [None]:
# NOTE: if the `index` is a list, the returned type is different
row_1 = df.loc[[1], :]
print_type(row_1)
row_1

In [None]:
df.loc[:, 'age']
df.loc[0, 'age']
df.loc[:, ['age']]
df.loc[[0, 1], ['age']]
df.loc[0:2, :]  # WATCH OUT!

## `iloc[index(es), column index(es)]`

In [None]:
df.index = range(10, 14)
df

In [None]:
# This works fine ...
df.iloc[0, :]

In [None]:
# ... but this doesn't: `loc` looks rows up by LABEL and, after the index was
# replaced with 10..13, there is no row labelled `0` any more.
# The KeyError is caught and printed so that "Run All" is not interrupted.
try:
    df.loc[0, :]
except KeyError as error:
    print('KeyError:', error)

In [None]:
df.iloc[[0], :]

In [None]:
df.iloc[:, 1]

In [None]:
df.iloc[:, [1]]

In [None]:
# Range: loc vs iloc
print('loc')
print(df.loc[10:12])  # INCLUDING row with index 12
print()

print('iloc')
print(df.iloc[0:2])  # EXCLUDING row with index 12
print()

## One more note about row selection

In [None]:
# You might use boolean selections but sometimes it's not very "clean"
df[(df['country'] == 'PL') & (df['age'] == 33)]

In [None]:
# Queries seem better (but...)
df.query('country == "PL" and age == 33')

In [None]:
# Queries with "external" variables
country = 'RU'
age = 44
df.query('country == @country and age == @age')

## Operations on columns containing strings

In [None]:
df['email'].str.contains('@')

In [None]:
# Negation
~df['email'].str.contains('@')

In [None]:
df['country'].str.replace('L', 'T')

In [None]:
df['email'].str.split('@')

In [None]:
df['email'].str.split('@', expand=True)

In [None]:
# df['email'].str.<Tab>  -->  this won't work
# Instead, type a dot right after the accessor below and press <Tab> to browse
# the available string methods. The cell itself must stay a valid expression:
# a dangling `pd.Series.str.` is a SyntaxError and stops "Run All".
pd.Series.str

In [None]:
# The `apply` method - by default it does not modify the original frame
counter = 0
def add_values(column):
    """Return `column` added to itself; every call bumps the global `counter`."""
    global counter
    counter = counter + 1
    doubled = column + column
    return doubled


print('Wynik apply:')
print(df.select_dtypes(include=[np.number]).apply(add_values))
print()
print('Oryginalny df:')
df

In [None]:
print(counter)

In [None]:
df.select_dtypes(include=[np.number]) * 2

In [None]:
def compare_country_and_domain(row):
    """Check whether the e-mail's top-level domain matches the row's country.

    Parameters
    ----------
    row : mapping or pd.Series
        Must provide the keys 'email' and 'country'.

    Returns
    -------
    bool
        True when the last label of the e-mail domain equals the country code
        (compared case-insensitively). False otherwise, including for missing
        values, addresses without an '@' and addresses without a domain part.
    """
    email, country = row['email'], row['country']
    # Missing values (None / NaN) are not strings and cannot match anything.
    if not isinstance(email, str) or not isinstance(country, str):
        return False

    # `str.split` never returns an empty list, so the presence of '@' has to be
    # checked explicitly; `rpartition` yields an empty separator when it is absent.
    _, separator, email_domain = email.rpartition('@')
    if not separator or not email_domain or not country:
        return False

    # Compare only the last domain label, so that e.g. 'example.compl'
    # is not mistaken for a '.pl' address by a plain suffix match.
    top_level_domain = email_domain.rsplit('.', 1)[-1]
    return top_level_domain.lower() == country.lower()


df.apply(compare_country_and_domain, axis=1)

In [None]:
# `apply` on pd.Series
countries = df['country']

# `str.lower` is called on every element of the Series.
# NOTE(review): despite its name, `upper_countries` therefore holds the
# LOWER-cased values - consider renaming it (check later cells for usages first).
upper_countries = countries.apply(str.lower)
upper_countries

In [None]:
# `applymap` on pd.DataFrame
def double_it(value):
    """Return `2 * value`, leaving null values (None / NaN) untouched."""
    if pd.notnull(value):
        return 2 * value
    return value


ddf = df[['mixed', 'age']].applymap(double_it)
ddf

# SCIKIT-LEARN

### Transformers


```python
class SomeTransformer(TransformerMixin):

    def __init__(self, argument, another_one):
        # Do what you have to do to create an "instance"
        # of this class (a particular transformer)

    def fit(self, X, y=None, **fit_params):
        # ...
        # look at the data and prepare to transform
        # ...
        return self
        
    def transform(self, X, y=None, **fit_params):
        # ...
        # modify X and/or return modified columns
        # but either way:
        return X_transformed
```

In [None]:
df

In [None]:
from sklearn.base import TransformerMixin


class EmailDomainExtractor(TransformerMixin):
    """Stateless transformer that replaces every cell with its e-mail domain.

    Cells that are null, are not strings, or contain no '@' become None.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return `self` to allow chaining."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return `X` with `_extract_domain` applied to every cell.

        `X` must support element-wise `applymap` (i.e. be a pd.DataFrame).
        """
        return X.applymap(self._extract_domain)

    def _extract_domain(self, email):
        """Return the part of `email` after the last '@', or None if there is none."""
        # Non-string cells (None, NaN, numbers, ...) cannot hold an address; the
        # isinstance check also avoids the TypeError of `'@' in <number>`.
        if not isinstance(email, str) or '@' not in email:
            return None
        return email.split('@')[-1]


trans = EmailDomainExtractor()
trans.transform(df[['email', 'country']])

In [None]:
# Wrapper on functions
from sklearn.preprocessing import FunctionTransformer


temp_df = pd.DataFrame({
    'temperature': [-10, -11, 13, 24, 0]
})

transformer = FunctionTransformer(lambda x: x + 273)
result = transformer.fit_transform(temp_df)

In [None]:
# Transformer (with "state")
from collections import Counter
from sklearn.base import TransformerMixin


class CounterTransformer(TransformerMixin):
    """Replace each value with the number of times it occurred in the fitted data."""

    def __init__(self):
        # Result of `X.value_counts()` (value -> count); None until `fit` is called.
        self._counts = None

    def fit(self, X, y=None, **fit_params):
        """Count the occurrences of every distinct value in `X`.

        `X` must provide `value_counts()` (e.g. a pd.Series). `y` and
        `fit_params` are accepted and ignored, so the signature matches the
        other transformers in this notebook and `fit(X, y)` calls do not fail.
        Returns `self`.
        """
        self._counts = X.value_counts()
        return self

    def transform(self, X, y=None):
        """Map every element of `X` to the count learned in `fit`.

        Values that were not seen during `fit` ("new levels") are mapped to 0
        instead of raising a KeyError.
        """
        return X.apply(lambda v: self._counts.get(v, 0))

In [None]:
print('Train:')
df_train = pd.DataFrame({
    'animal': 'cat dog racoon cat cat'.split(),
})
print(df_train)
print()

transformer = CounterTransformer()
# `fit_transform` is defined in TransformerMixin
result = transformer.fit_transform(df_train['animal'])
print(result)
print()


print('Test:')
df_test = pd.DataFrame({
    'animal': 'cat dog'.split(),
})
print(df_test)
print()

result = transformer.transform(df_test['animal'])
print(result)

In [None]:
class DFCounterTransformer(TransformerMixin):
    """Apply a separate `CounterTransformer` to every column of a DataFrame."""

    def __init__(self):
        # Column name -> fitted CounterTransformer; populated by `fit`.
        self._transformers = {}

    def fit(self, X, y=None, **fit_params):
        """Fit one `CounterTransformer` per column of `X`.

        `y` and `fit_params` are accepted and ignored, so the signature matches
        the other transformers in this notebook. Returns `self`.
        """
        self._transformers = {
            col: CounterTransformer().fit(X[col])
            for col in X.columns
        }
        return self

    def transform(self, X, y=None):
        """Return a frame in which every column of `X` is replaced by its counts.

        Raises KeyError when `X` contains a column that was not present in `fit`.
        """
        transformed_columns = [
            self._transformers[col].transform(X[col]) for col in X.columns
        ]
        # `pd.concat` raises ValueError on an empty list, so a frame without
        # columns is answered with an empty frame that keeps the same index.
        if not transformed_columns:
            return pd.DataFrame(index=X.index)
        return pd.concat(transformed_columns, axis=1)


print('Train:')
df_train = pd.DataFrame({
    'animal': 'cat dog racoon cat'.split(),
    'number': [1, 1, 1, 2],
})
print(df_train)
print()

transformer = DFCounterTransformer()
result = transformer.fit_transform(df_train)
print(result)


print('Test:')
df_test = pd.DataFrame({
    'animal': 'cat dog'.split(),
    'number': [1, 2],
})
print(df_test)
print()

result = transformer.transform(df_test)
result

# Pipeline

In [None]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder


def fill_na(v):
    """Return 0 for a null value (None / NaN); return any other value unchanged."""
    if pd.isnull(v):
        return 0
    return v


fill_na_trans = FunctionTransformer(
    lambda df: df.applymap(fill_na),
    validate=False
)
ohe_trans = OneHotEncoder(handle_unknown='ignore')


print('Train:')
df_train = pd.DataFrame({
    'animal': [1, 2, 3, 1, None]
})
print(df_train)
print()


x = fill_na_trans.fit_transform(df_train)
ohe_trans.fit_transform(x)


print('Test:')
df_test = pd.DataFrame({
    'animal': [1, 4, None]
})
print(df_test)
print()


x = fill_na_trans.transform(df_test)
x = ohe_trans.transform(x)
print(type(x))

x.toarray()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder


def fill_na(v):
    """Substitute 0 for a missing value (None / NaN) and pass anything else through."""
    is_missing = pd.isnull(v)
    if is_missing:
        return 0
    return v


pipeline = Pipeline([
    ('FillNA', FunctionTransformer(lambda df: df.applymap(fill_na), validate=False)),
    ('OHE', OneHotEncoder(handle_unknown='ignore')),
])


df_train = pd.DataFrame({
    'animal': [1, 2, 3, 1, None]
})
pipeline.fit(df_train)


df_test = pd.DataFrame({
    'animal': [1, 4, None]
})
pipeline.transform(df_test).toarray()

In [None]:
# Sample frame for the pipeline examples: one text column, one numeric column
# with a missing value, and two domain columns with a missing value each.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = pd.DataFrame({
    'animal': 'cat dog frog'.split(),
    'level': [-10, np.nan, 13],
    'user_domain': ['www.google.pl', 'www.www2.wp.com.pl', None],
    'server_domain': ['www.google.pl', 'www.www2.wp.com.pl', None],
})

df

In [None]:
numeric_pipeline = Pipeline([
    ('SelectNumeric', FunctionTransformer(lambda df: df.select_dtypes(include=[np.number]), validate=False)),
    ('FillNA', FunctionTransformer(lambda df: df.fillna(0), validate=False)),
    ('Abs', FunctionTransformer(lambda df: df.applymap(abs), validate=False)),
])

numeric_pipeline.fit_transform(df)