<center><font size="7">Modules et fonctions</font></center>
<div align="right"><font size="4"><i>par Jean Vallée</i></font></div>

<hr color="green">

# Import de modules

In [1]:
! pip --quiet install pandas matplotlib scikit-learn imblearn seaborn 
! pip --quiet install xgboost lightgbm catboost mlflow evidently sweetviz

In [2]:
! pip --quiet install -U imbalanced-learn

In [3]:
import glob                                            # access to local file system
import itertools                                       # combinations of column pairs
import os                                              # paths & directory listings
import subprocess                                      # run multi-line bash script
import sys                                             # global variables
import zipfile                                         # ZIP archive extraction

import matplotlib.pyplot as plt                        # graph visualisation
import mlflow                                          # track model optimisation
import numpy as np                                     # numerical arrays
import pandas as pd                                    # data manipulation
from sklearn.metrics import confusion_matrix, \
  r2_score, accuracy_score, precision_score, f1_score  # Scorers
from sklearn.model_selection import train_test_split   # Data partitioning
from sklearn.preprocessing import StandardScaler       # standardizes features : mean=0 & variance=1

## Modèles

In [4]:
from sklearn.dummy      import DummyClassifier
from xgboost            import XGBClassifier
from lightgbm           import LGBMClassifier
from catboost import CatBoostClassifier

In [5]:
# sudo apt-get install libgomp1 # run on VM for LGBM

# Variables de configuration

In [6]:
import json
# Load the JSON configuration file into the module-level dict `dict_config`.
# The path is relative, so it is resolved against the kernel's current working directory.
with open('../config.json') as file_object:
    dict_config = json.load(file_object)

# Déclaration des fonctions

## list_functions()

In [7]:
import types
def list_functions(search_keyword='') :
  ''' Lists, in alphabetical order, the names of the plain Python functions found in globals().
      When search_keyword is not empty, only the names containing it (case-insensitive) are kept. '''
  # __qualname__ is the name shown in a function's repr ('<function NAME at 0x...>')
  li_names = sorted(obj.__qualname__ for obj in globals().values()
                    if isinstance(obj, types.FunctionType))
  if len(search_keyword) == 0 :
    return li_names
  needle = search_keyword.lower()
  return [name for name in li_names if needle in name.lower()]

## Divers

### format_1000(n)
Appelée par get_size()

In [8]:
def format_1000(value_in) :
    ''' Formats the integer part of a number with a space as thousands separator '''
    grouped = f'{int(value_in):,}'      # e.g. 1234567 -> '1,234,567'
    return grouped.replace(',', ' ')

### get_size(df)

In [9]:
def get_size(df_in, name_in='DataFrame') :
    ''' Prints the number of rows & columns of a Pandas (a 1-dimensional input counts as 1 column).
        Nothing is returned. '''
    shape = df_in.shape
    if len(shape) == 1 :
        nb_rows, nb_cols = shape[0], 1
    else :
        nb_rows, nb_cols = shape
    print(name_in, 'has', format_1000(nb_rows), 'rows and', nb_cols, 'columns')

### count_distinct(df)
Count distinct values per column

In [10]:
def count_distinct(df_in):
    ''' Builds a df with one row per column of the input df: the column name and its number
        of distinct values, as given by Series.unique() '''
    li_records = []
    for col in df_in.columns :
        nb_values = len(df_in[col].unique())
        li_records.append((col, nb_values))
    # intermediary df: returned as is, without styling
    return pd.DataFrame(li_records, columns=['column_name', 'nb_distinct_values'])

### substract_lists(li, li)

In [11]:
def substract_lists(li_A, li_B) :
    ''' Returns the items of li_A that are not in li_B (order and duplicates of li_A are kept) '''
    li_kept = []
    for item in li_A :
        if item not in li_B :
            li_kept.append(item)
    return li_kept

### get_1_type_cols_list(df, str)

In [12]:
def get_1_type_cols_list(df_in, type_in) :
    ''' Returns the list of the columns of a Pandas whose dtype equals type_in '''
    ser_dtypes = df_in.dtypes                 # index: column names, values: dtypes
    is_wanted = ser_dtypes == type_in
    return list(ser_dtypes.loc[is_wanted].index)

### append_1_row(df, li)
Appelée par get_NDU_cols()

In [13]:
def append_1_row(df_in, li_values_in) :
    ''' Returns a new Pandas made of df_in plus 1 extra row holding the given list of values.
        The new row is labelled len(df_in); df_in itself is not modified. '''
    new_label = len(df_in)
    # 1-row frame with the same columns, created empty then filled with the values
    df_new_row = pd.DataFrame(index=[new_label], columns=df_in.columns)
    df_new_row.loc[new_label] = li_values_in
    return pd.concat([df_in, df_new_row])
#append_1_row(df_temp, ['col_name', 'object', 250, 100, 5]) # sample call

### get_NDU_cols(df)

In [14]:
def get_1_col_type        (df_in, col_name) :
    ''' Returns the dtype of 1 column '''
    return df_in[col_name].dtypes

def get_1_col_not_null_nb (df_in, col_name) :
    ''' Returns the number of non-null values of 1 column '''
    return df_in[col_name].count()

def get_1_col_distinct_nb (df_in, col_name) :
    ''' Returns the number of values left in 1 column once duplicates are dropped '''
    return len(df_in[col_name].drop_duplicates())

def get_1_col_unique_nb   (df_in, col_name) :
    ''' Returns Series.nunique() for 1 column '''
    return df_in[col_name].nunique()

def get_NDU_cols(df_in) :
    ''' Gets a DF with, per column of the input DF, counts of Not-null, Distinct and Unique values.

    Output columns
        column_name : name of the input column
        type        : dtype of the column
        nb_not_null : number of non-null values
        %_not_null  : nb_not_null / number of rows * 100 (NaN when the input has no rows)
        nb_distinct : number of values left after drop_duplicates()
        %_distinct  : nb_distinct / nb_not_null * 100 (NaN when the column has no non-null value)
        unique      : Series.nunique() of the column
    '''
    li_out_cols = ['column_name', 'type', 'nb_not_null', '%_not_null', 'nb_distinct', '%_distinct', 'unique']
    nb_records  = len(df_in)        # loop invariant
    undefined   = float('nan')      # a percentage with a null denominator is undefined
    li_rows = []
    for col_i in df_in.columns :
        col_type     = get_1_col_type             (df_in, col_i)
        nb_not_null  = int(get_1_col_not_null_nb  (df_in, col_i))
        nb_distinct  = int(get_1_col_distinct_nb  (df_in, col_i))
        nb_unique    = int(get_1_col_unique_nb    (df_in, col_i))
        pct_not_null = round(nb_not_null / nb_records * 100, 2) if nb_records > 0 else undefined
        # computed afresh for every column: no value is carried over from the previous column
        pct_distinct = round(nb_distinct / nb_not_null * 100, 2) if nb_not_null > 0 else undefined
        li_rows.append([col_i, col_type, nb_not_null, pct_not_null, nb_distinct, pct_distinct, nb_unique])
    # Build the output once instead of concatenating 1 row at a time
    return pd.DataFrame(li_rows, columns=li_out_cols)

### get_categories(df)

In [15]:
def get_categories(df_in):
    ''' Builds a df with one row per column of the input df: the column name and the array
        of its distinct values, as given by Series.unique() '''
    li_records = []
    for col in df_in.columns :
        li_records.append((col, df_in[col].unique()))
    # intermediary df: returned as is, without styling
    return pd.DataFrame(li_records, columns=['column_name', 'distinct_values'])

### get_nb_outliers (df, col_name)

In [16]:
def get_nb_outliers (df_in, column_name, threshold=100, verbose=False):
    ''' Counts the rows whose value in column_name lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        The threshold and verbose parameters are accepted but not used. '''
    ser_values = df_in[column_name]
    q1, q3 = ser_values.quantile([0.25, 0.75])
    margin = 1.5 * (q3 - q1)                    # 1.5 x inter-quartile range
    is_too_low  = ser_values < q1 - margin
    is_too_high = ser_values > q3 + margin
    return len(df_in.loc[is_too_low | is_too_high])

### encode_category_cols(df, col_name)
Encode a category column into N columns

In [17]:
# Small sample frame: 2 plain fields plus 'field_3', which holds 3 distinct category labels
df_sample = pd.DataFrame(
        { 'field_1' : ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'eeeee'],
          'field_2' : [ 111111,  222222,  333333,  444444,  555555],
          'field_3' : ['cat_1', 'cat_2', 'cat_3', 'cat_2', 'cat_1']  } )
df_sample

Unnamed: 0,field_1,field_2,field_3
0,aaaaa,111111,cat_1
1,bbbbb,222222,cat_2
2,ccccc,333333,cat_3
3,ddddd,444444,cat_2
4,eeeee,555555,cat_1


In [18]:
def encode_category_cols(df_in, cat_field) :
  ''' Transforms 1 column of a Pandas in N columns of 0/1: 1 new column per unique value.

  Parameters
      df_in     : input DataFrame; it is left untouched
      cat_field : name of the category column to encode
  Returns
      a new DataFrame without cat_field, followed by 1 int64 column per unique value of
      cat_field (in order of first appearance), set to 1 on the rows holding that value
  '''
  ser_cats = df_in[cat_field]
  # drop() returns a new frame: the indicator columns are added to it, not to the caller's frame
  df_out = df_in.drop(cat_field, axis='columns')
  for cat_i in list(ser_cats.unique()) :
    df_out[cat_i] = (ser_cats == cat_i).astype('int64')
  return df_out
# Demo: encode the 'field_3' category column of the sample frame
encode_category_cols(df_sample, 'field_3')

Unnamed: 0,field_1,field_2,cat_1,cat_2,cat_3
0,aaaaa,111111,1,0,0
1,bbbbb,222222,0,1,0
2,ccccc,333333,0,0,1
3,ddddd,444444,0,1,0
4,eeeee,555555,1,0,0


## mask_upper_triangle(df)
Pour des matrices de corrélation

In [19]:
def mask_upper_triangle(df_in) :
    ''' Overwrites the diagonal and the upper half of a Pandas with NaNs.
        The input is modified in place and also returned. '''
    # True on the diagonal and above it, False below
    is_upper_or_diag = np.triu(np.ones(df_in.shape, dtype=bool))
    df_in[is_upper_or_diag] = np.nan
    return df_in

## Graphes

### pie_cols_ratios(df, li)

In [20]:
def pie_cols_ratios(df_in, li_labels=['Filled', 'Empty'], shown_value='null', title='', nb_pies_max=5):
    ''' Plots N pie charts of NaN or Zero-values, 1 per column of a Pandas.

    Parameters
        df_in       : DataFrame; its first nb_pies_max columns are charted
        li_labels   : 2 legend labels: [label of the other values, label of the counted values]
                      (the default list is only read, never modified)
        shown_value : 'null' counts the NaN values, 'zero' counts the values equal to 0
        title       : title of the whole figure
        nb_pies_max : maximum number of pies
    Raises
        ValueError when shown_value is neither 'null' nor 'zero'
    '''
    # Fail early: an unsupported value would otherwise leave the counter undefined
    if shown_value not in ('null', 'zero') :
        raise ValueError("shown_value must be 'null' or 'zero', got " + repr(shown_value))

    # Configure chart's display
    nb_cols = min(nb_pies_max, len(df_in.columns))
    if nb_cols < 1 : return     # nothing to plot
    plt.figure(figsize=(3 * nb_cols, 3), facecolor='lightblue')
    plt.suptitle(title, color='blue', y=0.1, fontsize=12, fontweight='bold')

    nb_A_u_B  = len(df_in)
    li_colors = ['mediumseagreen', 'tomato']

    # Displays a Pie in a SubPlot per DF's column
    for idx, col_name in enumerate(df_in.columns[:nb_cols]) :

        # Get list of 2 values
        if shown_value == 'null' : nb_B = int(df_in[col_name].isna().sum())
        else                     : nb_B = int((df_in[col_name] == 0).sum())
        nb_A = nb_A_u_B - nb_B
        li_values = [nb_A, nb_B]

        # Get list of 2 legend entries
        li_legend = [   li_labels[0] + ' ' + format_1000(nb_A),
                        li_labels[1] + ' ' + format_1000(nb_B) ]

        # Render titled chart in a SubPlot
        plt.subplot(1, nb_cols, idx + 1)
        plt.title(str(col_name).title(), fontsize=10, y=.9)    # str(): column labels may not be strings
        plt.pie(    li_values,
                    colors=li_colors,
                    wedgeprops = { 'linewidth' : 3, 'edgecolor' : 'white' },  # slice border
                    autopct='%1.0f%%',       # Percent calculation
                    pctdistance=.5,
                    labeldistance=1.0,
                    textprops={'fontsize': 12}
               )
        plt.legend(li_legend, loc='lower left', fontsize=12)

    plt.show()

### barh_cols_ratios(df)

In [21]:
def barh_cols_ratios(df_in, nb_cols_max=6) :
    ''' Plots 2 rows of horizontal-bar charts (1 bar per column of a Pandas):
        first the 'distinct' counts (rows left after drop_duplicates), then the 'unique'
        counts (Series.nunique), each compared with the total number of rows.

    Parameters
        df_in       : DataFrame; its first nb_cols_max columns are charted
        nb_cols_max : maximum number of columns shown
    '''
    # Limit the nb of columns shown on plot
    nb_cols = min(nb_cols_max, len(df_in.columns))
    if nb_cols < 1 : return     # nothing to plot
    df_in = df_in.iloc[:, :nb_cols]
    nb_A_u_B = len(df_in)

    def to_pct(nb) :
        ''' Integer percentage of the row count; 0 when the frame has no rows '''
        return int(nb / nb_A_u_B * 100) if nb_A_u_B > 0 else 0

    color_B = 'silver'
    for quality, color_A in [('distinct', 'yellow'), ('unique', 'gold')] :
        plt.figure(figsize=(3 * nb_cols, 0.5), facecolor='lightblue')
        plt.suptitle((quality + ' values').title(), color='blue', y=-.1, fontsize=12, fontweight='bold')
        for idx, col_name in enumerate(df_in.columns) :
            if quality == 'distinct' : nb_A = len(df_in[col_name].drop_duplicates())
            else                     : nb_A = df_in[col_name].nunique()
            nb_B = nb_A_u_B - nb_A

            plt.subplot(1, nb_cols, idx + 1)
            plt.title(str(col_name).title())     # str(): column labels may not be strings
            plt.axis('off')

            plt.barh(0, nb_A, color=color_A, edgecolor='black')
            label_A = str(to_pct(nb_A)) + '%\n' +  str(nb_A) + ' ' + quality
            plt.text(x=0, y=-0.3, s=label_A)

            plt.barh(0, nb_B, left=nb_A, color=color_B, edgecolor='black')
            label_B = str(to_pct(nb_B)) + '%\n' +  str(nb_B)
            if nb_B > 0 : plt.text(x=(nb_A+nb_B)*0.8, y=-0.3, s=label_B)
        plt.show()

## plot_clusters_xy(df, np_labels, str_title, nb_rows)

In [22]:
def plot_clusters_xy(df_in, np_labels, title='', nb_rows=2, np_legend_labels='', opacity=.2) :
  ''' Plots clusters in 2D-scatter charts: 1 chart per pair of columns of the Pandas.

  Parameters
      df_in            : DataFrame of coordinates; every pair of columns gives 1 X-Y chart
      np_labels        : cluster label of each row, used to color the points
      title            : title of the whole figure
      nb_rows          : number of rows of the grid of charts
      np_legend_labels : legend entries; '' (default) means the unique values of np_labels
      opacity          : alpha of the points
  Raises
      ValueError when df_in has fewer than 2 columns
  '''
  # '' is the "not provided" marker. The type is tested first because comparing
  # an array with '' is elementwise and cannot be used as a condition.
  if isinstance(np_legend_labels, str) and np_legend_labels == '' :
    np_legend_labels = np.unique(np_labels)
  li_tup_idx = list(itertools.combinations(df_in.columns, 2))
  if len(li_tup_idx) == 0 :
    raise ValueError('plot_clusters_xy needs at least 2 columns to build an X-Y pair')
  nb_cols = int((len(li_tup_idx)-1)/nb_rows) + 1
  # squeeze=False: always a 2D array of axes, whatever the grid size
  fig, np_axes = plt.subplots(nb_rows, nb_cols, figsize=(20, 3*nb_rows), edgecolor='black', linewidth=2, squeeze=False)
  li_axes = np_axes.ravel()
  for idx, (str_x, str_y) in enumerate(li_tup_idx) :
    ax_i = li_axes[idx]
    scatter = ax_i.scatter(df_in[str_x], df_in[str_y], c=np_labels, s=3, alpha=opacity, cmap='brg')
    ax_i.set_xlabel(str_x)
    ax_i.set_ylabel(str_y)
  # The legend is attached to the last chart drawn
  ax_i.legend(scatter.legend_elements()[0], np_legend_labels, title='cluster', markerscale=2)
  fig.suptitle(title)
  plt.show()

## plot_correlation_circle(axis, np_in, li_variables, li_colors)

In [23]:
def plot_correlation_circle(ax, np_in, li_variables, li_colors, idx_pc_A=0, idx_pc_B=1, x_min=-1, x_max=1, y_min=-1, y_max=1, title_prefix='', np_variance_pct=None) :
    ''' Used in PCA to show correlation between attributes and principal components.

    Parameters
        ax                 : axes-like object the circle is drawn on
        np_in              : 2D array indexed as [principal component, feature]
        li_variables       : label of each feature
        li_colors          : arrow color of each feature
        idx_pc_A, idx_pc_B : indexes of the components shown on the x and y axes
        x_min, x_max, y_min, y_max : visible window; arrows ending outside it are skipped
        title_prefix       : text put in front of the chart's title
        np_variance_pct    : variance ratio of each component, shown in the axis labels.
                             When None, a module-level variable of the same name is used
                             if it exists; otherwise the labels show no percentage.
    '''
    # Backward compatibility: the variance ratios used to be read from a global variable
    if np_variance_pct is None :
        np_variance_pct = globals().get('np_variance_pct')

    def get_axis_label(idx_pc) :
        ''' Axis label of 1 principal component, with its variance percent when known '''
        if np_variance_pct is None : return 'PC_{}'.format(idx_pc+1)
        return 'PC_{} ({}%)'.format(idx_pc+1, round(100*np_variance_pct[idx_pc],1))

    x_origin, y_origin = 0, 0
    for idx_feature in range(len(np_in[idx_pc_B])) :
        x_arrow_head, y_arrow_head = np_in[idx_pc_A, idx_feature], np_in[idx_pc_B, idx_feature]
        if (x_min < x_arrow_head < x_max) and (y_min < y_arrow_head < y_max) :
            # plot 1 arrow
            ax.arrow(x_origin, y_origin, x_arrow_head, y_arrow_head,
                     head_width=0.05, head_length=0.03, width=0.02, # arrow's dimensions
                     color=li_colors[idx_feature], alpha=.3) # alpha: opacity
            # plot 1 arrow's label, shifted slightly away from the origin along x
            arrow_label = li_variables[idx_feature]
            x_label, y_label = x_arrow_head + (np.sign(x_arrow_head))*0.03, y_arrow_head
            ax.text(x_label, y_label, arrow_label, size=10)

    # Axis x,y
    ax.plot([-1, 1], [0, 0], color='black', ls='dotted')
    ax.plot([0, 0], [-1, 1], color='black', ls='dotted')
    ax.set_xlim([x_min, x_max])
    ax.set_ylim([y_min, y_max])
    if (x_min==-1) and (x_max==1) and (y_min==-1) and (y_max==1) : ax.axis('equal') # sets proportion x,y
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel(get_axis_label(idx_pc_A)) # Axis labels & variance percent
    ax.set_ylabel(get_axis_label(idx_pc_B))

    angle = np.linspace(0, 2 * np.pi, 100)   # ~float range(0, 2pi) with 100 steps
    ax.plot(np.cos(angle), np.sin(angle))  # plots circle
    ax.set_title(title_prefix + 'Correlation Circle (PC_{} et PC_{})'.format(idx_pc_A+1, idx_pc_B+1), size=9)

## plot_2d_umap()

In [24]:
def plot_2d_umap(plt, nb_rows, nb_cols, idx, np_tuple_xy, df_target, li_labels, title='Photo Features 2D (embedded via UMAP)', size=10) :
  ''' Draws 1 scatter chart of 2D points in a grid of subplots, with a colorbar as legend.

  Parameters
      plt              : plotting interface, presumably matplotlib.pyplot (it shadows the module-level plt)
      nb_rows, nb_cols : dimensions of the grid of subplots
      idx              : 0-based position of this chart in the grid
      np_tuple_xy      : array whose transpose unpacks into the x and y coordinates
      df_target        : value of each point, mapped to a color
      li_labels        : tick labels of the colorbar, 1 per class
      title            : chart's title
      size             : marker size
  '''
  nb_labels = len(li_labels)
  # Scatter plot: keep the axes returned by subplot() so that its ticks can be removed
  ax = plt.subplot(nb_rows, nb_cols, idx + 1)
  plt.scatter(*np_tuple_xy.T, s=size, c=df_target, cmap='Spectral', alpha=.8, edgecolors='none')
  plt.setp(ax, xticks=[], yticks=[])
  # Legend: 1 colorbar segment per label, the ticks being centered on the integers 0..nb_labels-1
  color_bar = plt.colorbar(boundaries=np.arange(nb_labels+1)-0.5)
  color_bar.set_ticks(range(nb_labels))
  color_bar.set_ticklabels(li_labels)
  plt.title(title)

## plot_distribution(target)

In [25]:
from collections import Counter
def plot_distribution(target) : # target: array or 1-column Pandas
    ''' Plots a small pie chart of the distribution of the values of a target and
        returns a Counter of these values.

    Parameters
        target : Series, 1-column DataFrame, array or list of values
    Returns
        collections.Counter mapping each value to its number of occurrences
    '''
    if isinstance(target, pd.DataFrame) :
        # Iterating a DataFrame yields its column labels, not its values:
        # a 1-column frame is reduced to that column before counting
        if target.shape[1] == 1 : target = target.iloc[:, 0]
    elif not isinstance(target, pd.Series) :
        target = pd.Series(np.asarray(target).ravel())   # arrays & lists
    count_values = Counter(target)
    plt.figure(figsize=(2, 2))
    plt.pie(target.value_counts(), autopct='%1.0f%%', textprops={'color': 'white'})
    plt.show()
    return count_values

## show_boxplot(df_single_col)

In [26]:
def show_boxplot(df_single_col) :
  ''' Shows a flat horizontal box plot of the input, the outliers being drawn as faint green dots '''
  outlier_style = {
      'marker'          : '.',
      'markerfacecolor' : 'darkgreen',
      'markersize'      : 7,
      'markeredgecolor' : 'none',
      'alpha'           : .2,
  }
  df_single_col.plot.box(figsize=(10, .5), vert=False, flierprops=outlier_style)
  plt.yticks(rotation=90)
  plt.show()

# Files

## unzip_to_dir(zip_file, target_dir)

In [27]:
def unzip_to_dir(zip_file, target_dir) :
  ''' Extracts a ZIP file into a directory and prints a short report.

  Parameters
      zip_file   : path of the ZIP archive
      target_dir : directory the archive is extracted to (created when missing)
  Prints
      the archive's name and size, then, per extension among jpg, csv and txt, the number
      of entries at the top level of target_dir whose name contains that extension.
      A missing or unreadable archive is reported with a message instead of an exception.
  '''
  if not os.path.isfile(zip_file) :
    print(f'{zip_file} not found in current folder')
    return
  print(f'{zip_file} ({os.path.getsize(zip_file)} bytes)')
  print(f'Unzipping ZIP file to {target_dir} ...')
  try :
    # Pure-Python extraction: no dependency on an external unzip command
    with zipfile.ZipFile(zip_file) as archive :
      archive.extractall(target_dir)
  except (zipfile.BadZipFile, OSError) as error :
    print(f'error: {error}')
    return
  li_names = os.listdir(target_dir)
  for ext in ('jpg', 'csv', 'txt') :
    nb_files = sum(1 for name in li_names if ext in name)
    print(f'{nb_files} {ext} files')

## wget_file(url, file_extension, dir_target)

In [28]:
def wget_file(url, file_extension, dir_target) :
  ''' Downloads a file with wget into a directory, unless a file with the same extension is already there.

  Parameters
      url            : address of the file to download
      file_extension : extension (without the dot) looked for in dir_target before downloading
      dir_target     : directory the file is downloaded to (created when missing)
  '''
  suffix = '.' + file_extension
  # Look in the directory the download lands in, and at all of its entries
  li_dir_contents = os.listdir(dir_target) if os.path.isdir(dir_target) else []
  if any(name.endswith(suffix) for name in li_dir_contents) :
    print('Target file already downloaded')
    return
  os.makedirs(dir_target, exist_ok=True)
  try :
    # Arguments are passed as a list: the URL is never interpreted by a shell
    result = subprocess.run(['wget', url], cwd=dir_target, capture_output=True, text=True)
  except FileNotFoundError :
    print('wget unavailable')
    return
  print(result.stdout + result.stderr)

## get_filenames_in_dir(path, file_extension)

In [29]:
def get_filenames_in_dir(path, file_extension='*') :
  ''' Returns the sorted list of the paths in a directory that match *.<file_extension>.

  Parameters
      path           : directory to list
      file_extension : extension without the dot; '*' (default) matches any extension
  Returns
      list of paths, each starting with the given path; an empty list when nothing matches
  '''
  # escape(): wildcard characters in the directory name itself are taken literally
  pattern = f'{glob.escape(str(path))}/*.{file_extension}'
  return sorted(glob.glob(pattern))