# Libraries

In [11]:
# Every day
import pandas as pd
import numpy as np
import os
os.chdir('C:/Users/istrazov/Documents')
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns

# String
import re # RegEx

# Time series
import time
import datetime

# File format
import csv
import json

# Model
import statsmodels
import sklearn

# Neural Network
import torch

# Computer vision
import skimage
import pylab

# Specific
import sympy as sp
import pandas_profiling # Cool pandas data report
import pymssql # MS SQL
import pipenv # Virtual enviroment
from tqdm import tqdm # Progress bar
import import_ipynb # Import .ipynb

'0.9.0'

# Session settings

In [None]:
# Default size of graph
from matplotlib import rcParams
rcParams['figure.figsize'] = 8, 5

# Stopping Warnings
import warnings
warnings.simplefilter('ignore')

# Pandas display options applied for the whole session
display_settings = dict(
    max_columns=500,
    expand_frame_repr=True,
    max_rows=500,
    width=1000,
    precision=2,
    show_dimensions=True,
)

# Each key is an option name without its "display." prefix
for op in display_settings:
    value = display_settings[op]
    pd.set_option("display.{}".format(op), value)

# PIP

- pip help - помощь по доступным командам.

- pip install package_name - установка пакета(ов).

- pip uninstall package_name - удаление пакета(ов).

- pip list - список установленных пакетов.

- pip show package_name - показывает информацию об установленном пакете.

- pip search - поиск пакетов по имени.

- pip --proxy user:passwd@proxy.server:port - использование с прокси.

- pip install -U - обновление пакета(ов).

- pip install --force-reinstall - при обновлении, переустановить пакет, даже если он последней версии.

# Downloading data

### Classic

In [None]:
# Template for reading a file; the context manager closes the handle on exit.
# The handle is bound to `f`, the name the body actually uses.
with open('file_name', 'r') as f:
    f.read()
    # f.write(text) needs a writable mode ('w', 'a', 'r+', ...) and the text
    # to write; it cannot be called on a handle opened with 'r'.

- r (read) - открыть для чтения (по умолчанию)
- w (write) - открыть для записи, содержимое файла стирается
- a (append) - открыть для записи, запись ведется в конец
- b (binary) - открыть в бинарном виде
- t (text) - открыть в текстовом режиме (по умолчанию)
- r+ - открыть для чтения и записи
- w+ - открыть для чтения и записи, содержимое файла стирается

### Pandas functions

In [None]:
pd.read_csv() # Delimiter ','

pd.read_table() # Delimiter '\t'

pd.read_fwf() # Fixed-width columns

pd.read_clipboard() # Data from the clipboard

pd.read_excel() # Excel file



# NOTE(review): the names below look like a list of reader keyword arguments
# with example values for each, not values to pass as-is — confirm.

# Column delimiter
sep = {',', ';', '\t', '\n', '', '|', '\s+'}

# Decimal and thousands separators
decimal = {'.', ','}
thousands = {'.', ','}

# Encoding
encoding = {'utf-8', 'cp1251', 'cp855', 'cp866', 'koi8-r', 'iso8859_5', 'mac_cyrillic'}
#https://docs.python.org/3/library/codecs.html#standard-encodings

# Column headers
header = {True, False}

# Column names
names = ['col1', 'col2']

# Index column
index_col = {0, "name"}

# Rows to skip
skiprows = []

# Markers treated as NA in the file
na_values = ['NULL']

# Comment markers in the file
comment = ['--', '#']

# Per-column converter functions
converters = {'foo': 'f'}

# Number of rows to read from the file
nrows = 20

# List of columns to use
usecols = [0, 1, 5]

### Custom functions

In [None]:
import pandas as pd, numpy as np

def reduce_mem_usage(df):
    """
    Downcast the columns of a dataframe to smaller dtypes to reduce memory usage.

    The dataframe is modified in place and the same object is returned.
    Memory usage before and after, and the relative saving, are printed.

    Rules applied per column:
    - object columns are converted to 'category';
    - signed / unsigned NumPy integer columns are cast to the smallest
      signed / unsigned integer type whose range contains the column's
      min and max (bounds inclusive);
    - NumPy float columns are cast to float16 or float32 when the column's
      min and max lie strictly inside that type's range, else float64;
    - every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes, ...) is left untouched.

    NOTE: the float downcast looks only at the value range, so casting to
    float16/float32 can round the stored values.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same dataframe, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # range to compare against iinfo/finfo; anything else is skipped
        # instead of being pushed through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        else:
            # Unsigned columns stay unsigned so large values are not mangled.
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column has NaN min/max; both comparisons are then
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the percentage is always defined.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


def import_data(file, sep=',', decimal=',', header='infer', index_col=None, report=False):
    """
    Read a CSV file into a dataframe, shrink its memory footprint and
    optionally show a short report.

    Parameters
    ----------
    file : path or file-like object handed to ``pd.read_csv``.
    sep, decimal, header, index_col : forwarded unchanged to ``pd.read_csv``.
        NOTE(review): ``sep`` and ``decimal`` both default to ',' — with the
        same character for both, a decimal comma can never survive field
        splitting; callers probably want to override one of them.
    report : when truthy, show the head, shape, dtypes and per-column NaN
        counts. The report uses ``display``, which this function does not
        import — it is expected to be available as in a notebook session.

    Returns
    -------
    pandas.DataFrame
        The dataframe after ``reduce_mem_usage``.
    """
    # keep_date_col only matters when parse_dates combines several columns;
    # with parse_dates=True it had no effect and is deprecated in recent
    # pandas, so it is no longer passed.
    df = pd.read_csv(file, parse_dates=True, sep=sep, decimal=decimal,
                     header=header, index_col=index_col)
    df = reduce_mem_usage(df)

    if report:
        display(df.head())
        print('Shape', end='\n\n')
        display(df.shape)
        print('\nTypes', end='\n\n')
        display(df.dtypes)

        nan_counts = df.isnull().sum()
        if nan_counts.sum() > 0:
            print('\nNaN', end='\n\n')
            display(nan_counts)
        else:
            print('\nNaN\n-')

    return df

# Реорганизация данных

### Sorting

In [None]:
df.sort_values()

# Name or list of names to sort by
by = ['col1', 'col2'] 

# Choice of sorting algorithm
kind = {'quicksort', 'mergesort', 'heapsort'} 

# Perform operation in-place
inplace = {False, True} 

# Axis to be sorted
axis = {0, 1} 

# Sort ascending vs. descending
ascending = {True, False} 

# Puts NaNs at the beginning or at the end
na_position = {'first', 'last'} 

# Transforming data

### NaN

In [None]:
df.isnull() # Boolean mask: True where the value is NaN

df.isnull().sum(axis=0) # Number of NaN per column (axis=1 gives the count per row)

df.isnull().sum().sum() # Total number of NaN in the whole frame

df.notnull() # Inverse mask: True where the value is not NaN

In [None]:
df.col1[df.col1.notnull()]

# One call per option; `axis` can be combined with either `how` or `thresh`.
df.dropna(how='all')  # Drop a row/column only when all of its values are NaN
df.dropna(how='any')  # Drop a row/column when at least one value is NaN
df.dropna(thresh=5)   # Keep only rows/columns that have at least 5 non-NaN values
df.dropna(axis=1)     # Drop columns instead of rows

### Indices

In [None]:
df.set_index('col_name')  # Column to index
df.reset_index()  # Index to column

df.to_records()  # DataFrame to NumPy record array

# wide_to_long is a module-level pandas function, not a DataFrame method.
pd.wide_to_long(df, stubnames, i, j)  # Wide format to long format

### Duplicates

In [None]:
df.duplicated()

df.drop_duplicates()

# Columns that identify a duplicate
['col1', 'col2']

# Which duplicate to keep
keep = {'last'}

# Replace the dataframe in place
inplace = {True, False}

### Digits to intervals

In [4]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

#### pandas

In [5]:
pd.cut(ages, bins)

#### numpy

In [None]:
np.digitize(x, bins, right=False)

# NumPy

In [None]:
import numpy as np

### Creation arrays

In [None]:
np.ones() # array of ones
np.zeros() # array of zeros
np.eye() # identity matrix

In [None]:
np.arange() # function returns evenly spaced values within a given interval, by step
np.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0) # function returns evenly spaced numbers over a specified interval
# NOTE: this line describes np.digitize(x, bins) from the "Digits to intervals" section, not a call in this cell:
# it takes an array x and an array of bins and returns, for each value in x, the index of the bin it falls into
np.repeat(a, repeats, axis=None) # function repeats the elements of an array

### np.random

In [None]:
np.random.randint(low, high=None, size=None, dtype='l')
np.random.choice(a, size=None, replace=True, p=None)
np.random.binomial(n, p, size=None) # binomial distribution

### Polynomial

In [None]:
np.polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False) # function outputs a polynomial of degree deg that fits the points (x,y), minimizing the square error

In [None]:
np.polyval(p, x) # function evaluates a polynomial at specific values

### Arg-functions

In [None]:
np.argmax(a, axis=None, out=None) # return the index of max element
np.argmin(a, axis=None, out=None) # return the index of min element
np.argsort() # return the indices that would sort an array

### Histogram

In [None]:
# The `normed` argument was removed from NumPy; `density` is the supported way to normalise.
np.histogram(a, bins=10, range=None, density=None, weights=None) # computes the histogram of a set of data; returns frequency count and bin edges

# YouTube

In [None]:
from IPython.display import YouTubeVideo

YouTubeVideo('xe_ATRmw0KM')

# Profile (pandas report)

In [9]:
# !pip install pandas_profiling

import pandas as pd
import pandas_profiling

pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/planets.csv').profile_report()

# Matplotlib

### color

In [None]:
'k' - black
'g' - green
'b' - blue
'r' - red

### linestyle

In [None]:
'-' - solid line ('--' - dashes)
's' - squares
'^' - triangles

### marker

In [None]:
'o'
'+'
','
'.'
'1'

### drawstyle

In [None]:
steps-post

# SimpleTable

In [48]:
from statsmodels.iolib.table import SimpleTable

row =  ['JB', 'p-value', 'skew', 'kurtosis']
a = [['1.5', '2.5', '3.5', '4.5']]

print(SimpleTable(a, row))

 JB p-value skew kurtosis
-------------------------
1.5     2.5  3.5      4.5
-------------------------


# Regular Expression

. ^ $ * + ? { } [ ] \ | ( ) — метасимволы

[ ] — можно указать множество подходящих символов

^ - карет, обозначает либо начало строки, либо инвертирование группы символов. (например: "^[^0-9]" — не-цифра в начале строки).

\d ~ [0-9] — цифры

\D ~ [^0-9]

\s ~ [ \t\n\r\f\v] — пробельные символы

\S ~ [^ \t\n\r\f\v]

\w ~ [a-zA-Z0-9_] — буквы + цифры + _

\W ~ [^a-zA-Z0-9_]

In [None]:
re.IGNORECASE
re.DEBUG

# Pipenv (virtual environment)

In [None]:
Command line:
> '~/python.exe' -m venv env_name
> cd C:\env_name\Scripts
> activate

# Telegram Bot

In [None]:
# Installing
pip install telegram-send

# Connecting
telegram-send --configure
<BOT_TOKEN>  # paste the token issued by @BotFather; never store a real token in notes or version control

# Sending a message
!telegram-send 'Hello_world!'

# Datetime

In [1]:
from datetime import datetime

### Now time

In [2]:
datetime.now()

datetime.datetime(2019, 11, 29, 18, 41, 4, 387667)

### Object timedelta

In [3]:
datetime.now() - datetime(2019, 11, 10)

datetime.timedelta(days=19, seconds=67265, microseconds=235849)

In [5]:
# Перевод из строки
print(datetime.strptime('2019-11-22', '%Y-%m-%d'))

# Перевод в строковый формат

print(str(datetime.now()))
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

2019-11-22 00:00:00
2019-11-29 18:41:11.707317
2019-11-29 18:41:11


## Спецификаторы datetime

- %Y - год(4)
- %y - год(2)
- %m - месяц
- %d - день
- %H - час(24)
- %I - час(12)
- %M - минута
- %S - секунда
- %w - номер дня недели
- %U - номер недели (с воскресенья)
- %W - номер недели (с понедельника)
- %z - часовой пояс
- %F - %Y-%m-%d
- %D - %m/%d/%y

## Спецификаторы, зависящие от локали

- %a - сокращенное название дня недели
- %A - полное название дня недели
- %b - сокращенное название месяца
- %B - полное название месяца
- %c - полная дата и время
- %p - AM/PM
- %x - дата в формате локали
- %X - время в формате локали

# Computer Vision

### Downloading image

In [None]:
from skimage import img_as_float
from skimage.io import imread
import pylab

image = imread('image.jpg') # Real image to numpy array with shape=(m, n, 3)
pylab.imshow(image); # Show image

# `sk` was never defined in this notebook; img_as_float is imported by name instead.
image = img_as_float(image) # RGB to [0,1]

In [None]:
# Image-array (m,n,3) to X-vector (m*n,3) and y-vector (m*n)

X = image.reshape(image.shape[0]*image.shape[1], 3)
y = np.arange(1, image.shape[0]*image.shape[1]+1)

### Decrease amount of colours with k-Means

In [None]:
from sklearn.cluster import KMeans
# compare_psnr was used below without ever being imported; the current
# scikit-image name is peak_signal_noise_ratio, aliased so the calls stay as written.
from skimage.metrics import peak_signal_noise_ratio as compare_psnr

# Cluster the pixels (rows of X) into 10 colour groups
km = KMeans(n_clusters=10, init='k-means++', random_state=241).fit(X)

data = pd.DataFrame(X, columns=['x','y','z'])
data['cluster'] = km.predict(X)

# Mean cluster pixel
# The left merge adds each pixel's cluster statistic as the '_y'-suffixed columns.
data_mean = data.groupby('cluster', as_index=False).mean()
image_mean = np.array(data.merge(data_mean, on='cluster', how='left')[['x_y','y_y','z_y']])
image_mean = image_mean.reshape(image.shape[0], image.shape[1], 3)
print('PSNR_mean:', compare_psnr(image, image_mean))
pylab.imshow(image_mean);

# Median cluster pixel
data_med = data.groupby('cluster', as_index=False).median()
image_med = np.array(data.merge(data_med, on='cluster', how='left')[['x_y','y_y','z_y']])
image_med = image_med.reshape(image.shape[0], image.shape[1], 3)
print('PSNR_med:', compare_psnr(image, image_med))
pylab.imshow(image_med);

# Classification models

## Binary Models

#### SGD

In [None]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss='log')
sgd.fit(X, y)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X, y)

## Binary to Multiclass

#### All-vs-All

In [None]:
from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier())
ovo_clf.fit(X, y)
ovo_clf.predict(X)

#### One-vs-All

In [None]:
from sklearn.multiclass import OneVsRestClassifier

ovr_clf = OneVsRestClassifier(SGDClassifier())
ovr_clf.fit(X, y)
ovr_clf.predict(X)

## Multiclass Models

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

clf_tree = DecisionTreeClassifier(criterion='entropy', max_depth=3)
clf_tree.fit(X, y)

In [None]:
from sklearn.tree import plot_tree

plt.figure(figsize=(100, 50))
plot_tree(clf_tree, fontsize=80, feature_names=list(X_train), filled=True);

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf_forest = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=5)
clf_forest.fit(X, y)
clf_forest.predict(X)
clf_forest.predict_proba(X)

#### k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)

## Polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

## Optimisation of C (reg. parameter)

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

c_values = np.logspace(-2, 3, 500)

logit_searcher = LogisticRegressionCV(Cs=c_values, cv=skf, verbose=1, n_jobs=-1)
logit_searcher.fit(X_poly, y)

# Cross-validation

### Cross-val score

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()

### Cross-val predict

In [None]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(model, X_train, y_train, cv=5)

# Quality metrics

$accuracy = \frac{TP+TN}{TP+TN+FP+FN}$

$precision = \frac{TP}{TP+FP}$

$recall = \frac{TP}{TP+FN}$

$F1 = \frac{2}{\frac{1}{precision}+\frac{1}{recall}}= 2*\frac{precision * recall}{precision + recall} = \frac{TP}{TP+\frac{FN+FP}{2}}$

### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y, y_pred)

In [86]:
import numpy as np
from statsmodels.iolib.table import SimpleTable
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def quality_metrics(y, y_pred):
    """
    Print classification quality metrics and the confusion matrix.

    Three tables are printed: accuracy / precision / recall / F1 rounded to
    two decimals, the confusion matrix as counts, and the confusion matrix
    as percentages of all samples.

    The confusion-matrix tables are built with exactly two model labels
    ('Negative_Model', 'Positive_Model'), so a 2x2 matrix is required.
    NOTE(review): presumably meant for binary 0/1 labels — confirm.

    Parameters
    ----------
    y : true labels.
    y_pred : predicted labels.

    Returns
    -------
    None
    """
    # Metrics
    mv = [
        ['Accuracy', round(accuracy_score(y, y_pred), 2)],
        ['Precision', round(precision_score(y, y_pred), 2)],
        ['Recall', round(recall_score(y, y_pred), 2)],
        ['F1', round(f1_score(y, y_pred), 2)],
    ]
    print(SimpleTable(mv, ['Metric', 'Value']))

    # Confusion matrix
    cm = confusion_matrix(y, y_pred)
    # np.round replaces np.round_, an alias that was removed in NumPy 2.0.
    cmp = np.round(cm * 100 / cm.sum(), 2)

    # Prepending the label row and transposing turns each table row into
    # [model label, value for real class 0, value for real class 1].
    model_labels = [['Negative_Model', 'Positive_Model']]
    print(SimpleTable(np.append(model_labels, cm, axis=0).T,
                      ['Amount', 'Negative_Real', 'Positive_Real']))
    print(SimpleTable(np.append(model_labels, cmp, axis=0).T,
                      ['Percent', 'Negative_Real', 'Positive_Real']))

In [87]:
y = [1,1,1,0,0]
y_pred = [1,1,1,0,0]

quality_metrics(y,y_pred)

  Metric  Value
---------------
 Accuracy   1.0
Precision   1.0
   Recall   1.0
       F1   1.0
---------------
    Amount     Negative_Real Positive_Real
------------------------------------------
Negative_Model             2             0
Positive_Model             0             3
------------------------------------------
   Percent     Negative_Real Positive_Real
------------------------------------------
Negative_Model          40.0           0.0
Positive_Model           0.0          60.0
------------------------------------------


# ROC & PRC

In [63]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc

def plot_roc_curve(model, X, y):
    """
    Draw the ROC curve of `model` on (X, y) with ROC-AUC in the title.

    Scores are taken from column 1 of ``model.predict_proba(X)`` and label 1
    is treated as the positive class. The figure is shown immediately;
    nothing is returned.
    """
    scores = model.predict_proba(X)[:, 1]

    # Global seaborn look for this and later figures
    sns.set(font_scale=1.5)
    sns.set_color_codes("muted")

    plt.figure(figsize=(10, 8))
    fpr, tpr, _ = roc_curve(y, scores, pos_label=1)
    roc_auc = round(auc(fpr, tpr), 2)

    plt.plot(fpr, tpr, lw=2, label='ROC curve ')
    # Diagonal reference line
    plt.plot([0, 1], [0, 1])
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC-AUC = {}'.format(roc_auc))
    plt.show()
    
def plot_precision_recall_curve(model, X, y):
    """
    Draw the precision-recall curve of `model` on (X, y) with PRC-AUC in the title.

    Scores are taken from column 1 of ``model.predict_proba(X)`` and label 1
    is treated as the positive class. Recall is on the x axis and precision
    on the y axis. The figure is shown immediately; nothing is returned.
    """
    y_pred = model.predict_proba(X)[:, 1]

    sns.set(font_scale=1.5)
    sns.set_color_codes("muted")

    plt.figure(figsize=(10, 8))
    precisions, recalls, thresholds = precision_recall_curve(y, y_pred, pos_label=1)

    plt.plot(recalls, precisions, lw=2, label='PRC curve ')
    # The ROC-style diagonal has no meaning here: a no-skill classifier sits
    # on a horizontal line at the share of positive samples.
    baseline = np.mean(np.asarray(y) == 1)
    plt.plot([0, 1], [baseline, baseline])
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    # Axis labels follow the plotted data: x = recalls, y = precisions.
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PRC-AUC = {}'.format(round(auc(recalls, precisions), 2)))
    plt.show()

# Neural Networks

## keras

In [None]:
from keras.models import Model, Sequential
from keras.layers import Input, Dense, LSTM, GRU, Dropout
from keras.utils import np_utils, plot_model

In [None]:
class _KerasProbaAdapter:
    """Give a single-output Keras model the `predict_proba` method plot_roc_curve calls."""

    def __init__(self, keras_model):
        self._keras_model = keras_model

    def predict_proba(self, X):
        # Column 0 of predict() is used as the positive-class score, the same
        # column model_RNN thresholds at 0.5; plot_roc_curve reads column 1
        # of the returned array.
        positive = np.asarray(self._keras_model.predict(X))[:, 0]
        return np.column_stack([1 - positive, positive])


def model_RNN(X_train, y_train, X_test, y_test,
              batch_size=100, num_epochs=10,
              class_w=None,
              continue_train=False,
              model=0,
              plot_m=False, test_res=True, roc_plot=True):
    '''
    Create (or continue training) a stacked-LSTM binary classifier and fit it.

    Parameters
    ----------
    X_train, y_train : training data. The input layer is built with shape
        (X_train.shape[1], 1) — assumes X_train is (samples, timesteps, 1);
        TODO confirm against callers.
    X_test, y_test : data used for the test metrics and the ROC plot.
    batch_size, num_epochs : forwarded to ``model.fit``.
    class_w : class weights forwarded to ``model.fit``. When None, weights
        are computed as n_samples / class_count, keyed by class label.
    continue_train : when False a new model is built and compiled; when True
        the object passed as `model` is trained further.
    model : an existing model, used only when continue_train is True.
    plot_m : when truthy, call ``plot_model`` on the model.
    test_res : when truthy, print ``quality_metrics`` for predictions on
        X_test thresholded at 0.5.
    roc_plot : when truthy, draw the ROC curve on the test data.

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If continue_train is True but `model` has no ``fit`` method
        (for example the default placeholder 0).
    '''

    # New model (continue_train=False) or continue training (continue_train=True, model=model)
    if not continue_train:
        # Layers
        inp = Input(shape=(X_train.shape[1], 1))
        hidden_0 = LSTM(256, return_sequences=True)(inp)
        hidden_1 = LSTM(128, return_sequences=True)(hidden_0)
        hidden_2 = LSTM(64, return_sequences=True)(hidden_1)
        hidden_3 = LSTM(32, return_sequences=False)(hidden_2)
        hidden_4 = Dense(32, activation='relu')(hidden_3)
        drop_1 = Dropout(0.5)(hidden_4)
        hidden_5 = Dense(16, activation='relu')(drop_1)
        drop_2 = Dropout(0.5)(hidden_5)
        out = Dense(1, activation='sigmoid')(drop_2)
        model = Model(inputs=inp, outputs=out)

        # Compile
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['binary_accuracy'])
    elif not hasattr(model, 'fit'):
        # Fail early with a clear message instead of an AttributeError later.
        raise ValueError('continue_train=True requires an existing model passed as `model`')

    # Plot model
    if plot_m:
        plot_model(model, show_shapes=True)
        # NOTE(review): plot_model is not a matplotlib call, so plt.show()
        # presumably has nothing to draw here — confirm.
        plt.show()

    # Fit
    if class_w is None:
        # value_counts() is ordered by frequency, not by label, so the
        # weights are keyed by label explicitly rather than kept as a list.
        counts = pd.Series(y_train.flatten()).value_counts()
        class_w = {int(label): float(y_train.shape[0] / count)
                   for label, count in counts.items()}

    model.fit(X_train,
              y_train,
              class_weight=class_w,
              batch_size=batch_size,
              epochs=num_epochs,
              verbose=1,
              validation_split=0.1)

    # Test score
    if test_res:
        y_pred = model.predict(X_test)
        y_pred = (y_pred[:, 0] > 0.5).astype('int8')
        quality_metrics(y_test, y_pred)

    # ROC
    if roc_plot:
        # plot_roc_curve reads predict_proba(X)[:, 1]; the model built above
        # has a single output column, so it is wrapped in an adapter.
        plot_roc_curve(_KerasProbaAdapter(model), X_test, y_test)

    return model

## pytorch