In [None]:
import io
from IPython.display import display
# from bbmagic import Hdfs

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patches as mpatches
from matplotlib.ticker import PercentFormatter
import seaborn as sns

import scipy.stats as stats

In [None]:
# Load the dataset, report how many observations it holds and preview the first rows.
df = pd.read_csv('df.csv')
print('Number of observations:', df.shape[0]) # XX.XXX
df.head()

In [None]:
# Verify data types: print the per-column summary pandas produced for the loaded frame.
df.info()

In [None]:
# Cast the columns whose inferred types are not the ones we want, then re-check the result.
df = df.astype({'col_1': 'str', 'col_2': 'float'})
df.info()

In [None]:
# Share of missing values in each column, highest first (at most 60 columns are shown).
percentage_null_features = (df.isnull().sum() / len(df)).to_frame('Proportion of nulls in column')
display(
    percentage_null_features
    .sort_values(by = 'Proportion of nulls in column', ascending = False)
    .head(60)
)

In [None]:
# Fill nulls.
# Results are assigned back to the column instead of calling fillna(inplace = True) on a
# column selection: the in-place form may act on a temporary copy and leave df unchanged.

# Constant fill: a good approach with tree models when we want to signal that missing values
# are valid information. Can be done before splitting between train and test.
# NOTE(review): col_1 was cast to str in an earlier cell; if that cast already turned the
# missing values into text there is nothing left for this fill to replace - confirm the order.
df['col_1'] = df['col_1'].fillna(0)

# Median fill: useful when we want to minimize the impact of missing values in the model.
# Must be done after splitting between train and test.
df['col_2'] = df['col_2'].fillna(df['col_2'].median())

# Same median strategy for the remaining columns. numeric_only = True restricts the median
# to numeric columns, so text columns do not make the computation fail.
df = df.fillna(df.median(numeric_only = True))

In [None]:
# Summary statistics, displayed in two tables so each one stays readable.
display(df.loc[:, ['target', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5']].describe())

print()
display(df.loc[:, ['col_6', 'col_7', 'col_8', 'col_9', 'col_10']].describe())

In [None]:
# Count how many observations fall in each year-month.
df_month = df.groupby('year_month').size().reset_index(name = 'count')

# Line chart of the number of observations per year-month.
title = 'Observations per month.'
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams['xtick.labelsize'] = 6    # fontsize of the x tick labels
plt.rcParams['ytick.labelsize'] = 6    # fontsize of the y tick labels
df_month.plot.line(x = 'year_month', y = 'count', legend = None, figsize = (10, 6))

# Put one vertical tick label on every month.
plt.xticks(range(len(df_month)), df_month['year_month'].values, size = 'small', rotation = 'vertical')

plt.title(title, fontsize = 10)
plt.show()
plt.close()

In [None]:
# Verify how many observations per month we have when target = 1.
# np.isin is used for the row filter because np.in1d is deprecated since NumPy 2.0.
df_month = df[np.isin(df['target'], 1)].groupby(['year_month']).size().reset_index(name='count')

# Line chart of the number of target = 1 observations per year-month.
title = 'Observations per month when target = 1.'
plt.rcParams.update(plt.rcParamsDefault)
plt.rc('xtick', labelsize = 6)    # fontsize of the tick labels
plt.rc('ytick', labelsize = 6)    # fontsize of the tick labels
df_month[['year_month', 'count']].plot(x = 'year_month', y = 'count', kind = 'line', legend = None, figsize = (10, 6))

# Put one vertical tick label on every month.
plt.xticks(range(len(df_month['year_month'].values)), df_month['year_month'].values, size = 'small', rotation='vertical')

plt.title(title, fontsize = 10)
plt.show()
plt.close()

In [None]:
# Mean of the target in each year-month, i.e. the monthly proportion of target = 1
# when the target is coded 0/1.
df_month = df.groupby('year_month')['target'].mean().reset_index()

title = '% target per month.'
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams['xtick.labelsize'] = 6    # fontsize of the x tick labels
plt.rcParams['ytick.labelsize'] = 6    # fontsize of the y tick labels
df_month.plot.line(x = 'year_month', y = 'target', legend = None, figsize = (10, 6))
plt.xticks(range(len(df_month)), df_month['year_month'].values, size = 'small', rotation = 'vertical')
plt.title(title, fontsize = 10)
# Show the y axis as percentages (1.0 -> 100%).
plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
plt.show()
plt.close()

In [None]:
# Chi-squared for categorical variables.
def chi_squared_test(df, variable, target):
    """Print a chi-squared test of independence between two categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both columns.
    variable : str
        Name of the categorical column under study.
    target : str
        Name of the target column.

    Returns
    -------
    None
        The statistic, the p-value and the conclusion (at a 0.05 significance
        level) are printed.
    """
    # significance level
    alpha = 0.05

    # Observed counts. The table's own rows/columns define the categories, so
    # missing values (which the crosstab leaves out) cannot cause a lookup
    # error or inflate the degrees of freedom.
    observed = pd.crosstab(df[variable], df[target])
    counts = observed.to_numpy(dtype = float)
    row_totals = counts.sum(axis = 1)
    column_totals = counts.sum(axis = 0)

    # Expected counts under independence: row total * column total / grand total.
    expected = np.outer(row_totals, column_totals) / counts.sum()
    chi_square = float(((counts - expected) ** 2 / expected).sum())
    degrees_of_freedom = (observed.shape[0] - 1) * (observed.shape[1] - 1)

    # The p-value approach
    print("Approach 1: The p-value approach to hypothesis testing in the decision rule")
    # The survival function keeps precision for very small p-values, where
    # 1 - cdf would round to exactly 0.0.
    p_value = float(stats.chi2.sf(chi_square, degrees_of_freedom))
    conclusion = "Failed to reject the null hypothesis. Variable and target are independent"
    if p_value <= alpha:
        conclusion = "Null Hypothesis is rejected. Variable and target are dependent!"

    print("chisquare-score is:", chi_square, " and p value is:", p_value)
    print(conclusion)

# Histogram with proportion for continuous variable
def bar_graf(x_limits, top_size_y, df_variable, title, bins = 10):
    """Draw a histogram whose bar heights are proportions of the observations.

    Parameters
    ----------
    x_limits : passed to ``plt.xlim`` to set the x-axis range.
    top_size_y : upper limit of the y axis, as a fraction (1 means 100%).
    df_variable : pandas object with the values to plot (its ``hist`` method is used).
    title : chart title.
    bins : forwarded to ``hist``; defaults to 10.

    Returns
    -------
    None
    """
    plt.rcParams.update(plt.rcParamsDefault)
    color = 'cornflowerblue'

    # Drop missing values up front so the weights vector has exactly one entry
    # per plotted value.
    values = df_variable.dropna()
    # Each observation weighs 1 / n, so the bar heights add up to 1.
    weights = np.ones(len(values)) / len(values)

    # grid = False switches the grid off through pandas itself; it replaces the
    # former plt.grid(b = None) call, whose `b` keyword newer Matplotlib rejects.
    values.hist(density = False
            , weights = weights
            , histtype = 'bar', bins = bins, align = 'mid'
            , orientation = 'vertical'
            , color = color, edgecolor = None
            , grid = False)

    plt.title(title, fontsize = 10)
    plt.xlim(x_limits)
    # Show the y axis as percentages (1.0 -> 100%).
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.ylim(top = top_size_y)
    return

# Bar chart with the proportion of each category for a categorical variable.
def bar_graph_categoric_var(df_variable, title, rotation = 0, limite_categorias = 50, alphabetic_order = False):
    """Draw a bar chart of category counts, labelling each bar with its share.

    Parameters
    ----------
    df_variable : pandas.Series with the categorical values to count.
    title : chart title.
    rotation : rotation of the x tick labels.
    limite_categorias : maximum number of categories drawn.
    alphabetic_order : when true, categories are sorted by label before the
        limit is applied; otherwise they stay in ``value_counts`` order.

    Notes
    -----
    The percentages are relative to the sum of the bars that are drawn, so
    categories cut off by ``limite_categorias`` are not part of the total.
    """
    plt.rcParams.update(plt.rcParamsDefault)
    color = 'cornflowerblue'

    counts = df_variable.value_counts()
    if alphabetic_order:
        counts = counts.sort_index()
    df_bar_graph = counts.head(limite_categorias).to_frame()

    # legend = False avoids creating a legend only to remove it afterwards.
    ax = df_bar_graph.plot(kind = 'bar', width = 0.5, rot = rotation
                          , align = 'center', color = color, edgecolor = None
                          , legend = False)

    total = sum(bar.get_height() for bar in ax.patches)

    for bar in ax.patches:
        x, y = bar.get_xy()
        height = bar.get_height()
        # Anchor the label at the top of the bar and lift it by a few points.
        # An offset in data units depends on the count scale and can move the
        # anchor outside the axes, where Matplotlib does not draw the label.
        ax.annotate(f'{height/total:.1%}'
                    , (x + bar.get_width() / 2, y + height)
                    , xytext = (0, 3), textcoords = 'offset points'
                    , ha = 'center', va = 'bottom')

    # Set once, after the loop, instead of once per bar.
    plt.title(title, fontsize = 10)
   
# Boxplot for continuous variables.
def box_plot_graph(df, variable, title, y_lim):
    """Draw a single boxplot of `variable`, with the mean shown as a line.

    `df` is the data passed to seaborn, `variable` the column placed on the
    y axis, `title` the chart title and `y_lim` the y-axis limits.
    """
    plt.rcParams.update(plt.rcParamsDefault)

    # White plotting area with a black frame on top of the darkgrid style.
    sns.set_style("darkgrid", {'axes.facecolor': "white", 'axes.edgecolor': 'black'})

    # The mean is drawn as a black line across the box.
    mean_line_style = {'color': 'black', 'linewidth': 1.5}
    axes = sns.boxplot(data = df, y = variable
                      , showmeans = True, meanline = True
                      , meanprops = mean_line_style)

    axes.set_title(title, fontsize = 10)
    axes.set_ylim(y_lim)
    
# Show proportion of categories per month.
def monthly_bar_graph(df, variable):
    """Draw a stacked bar chart with the share of each category per month.

    Parameters
    ----------
    df : pandas.DataFrame containing a 'year_month' column and `variable`.
        It is only read; no helper column is written into it.
    variable : str, name of the categorical column whose monthly distribution
        is plotted.

    Each bar is one 'year_month' value and sums to 100%; every segment is
    labelled with its percentage.
    """
    title = 'Monthly distribution: ' + variable

    # Row-normalised crosstab: one row per month, one column per category.
    df_count_mounth = pd.crosstab(index = df['year_month'], columns = df[variable], normalize = "index")
    df_count_mounth.columns = df_count_mounth.columns.astype(str)

    plt.rcParams.update(plt.rcParamsDefault)
    plt.rc('xtick', labelsize = 8)    # fontsize of the tick labels
    plt.rc('ytick', labelsize = 8)    # fontsize of the tick labels

    # The 'year_month' index provides the x tick labels.
    df_count_mounth.plot(kind = 'bar',
                        stacked = True,
                        colormap = 'tab10',
                        figsize = (20, 8))

    # Write each percentage in the vertical middle of its segment.
    for position, (_, month_proportions) in enumerate(df_count_mounth.iterrows()):
        segment_bottom = 0
        for proportion in month_proportions:
            segment_top = segment_bottom + proportion
            plt.text(x = position,
                     y = (segment_top + segment_bottom)/2,
                     s = f'{np.round(proportion * 100, 1)}%',
                     color = "black",
                     va = 'center', ha = 'center',
                     fontsize = 8)
            segment_bottom = segment_top

    plt.legend(loc="lower left", ncol=2)
    plt.xlabel("Year month")
    plt.ylabel("Proportion")
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.title(title, fontsize = 10)
    plt.show()
    
# Proportion of each target label per category.
def _positive_rate_per_category(df_variable, variable, target_variable, limite_categorias):
    """Return one row per category with its share of rows where the target equals 1."""
    categories = df_variable[variable]
    target_values = df_variable[target_variable]

    # Non-null target rows and target == 1 rows, both keyed by category so the
    # division below pairs every category with its own total.
    n_observations = target_values.notna().groupby(categories).sum()
    n_positive = target_values.eq(1).groupby(categories).sum()

    # Keep only the most frequent categories.
    n_observations = n_observations.sort_values(ascending = False).head(limite_categorias)

    rate = n_positive.reindex(n_observations.index) / n_observations
    rate = rate.sort_values(ascending = False)
    return rate.rename(target_variable).rename_axis(variable).reset_index()


def bar_graph_stacked_categoric_var(df_variable, variable, target_variable, title, rotation = 90, limite_categorias = 500):
    """Plot, for each category, the proportion of rows with target = 1 over a 100% bar.

    Parameters
    ----------
    df_variable : pandas.DataFrame holding `variable` and `target_variable`.
    variable : str, name of the categorical column.
    target_variable : str, name of the target column; rows equal to 1 count as "Yes".
    title : chart title.
    rotation : rotation of the x tick labels.
    limite_categorias : keep only this many categories, chosen by number of rows.

    Categories are ordered by descending proportion of "Yes"; categories without
    any "Yes" row are kept with a proportion of 0. The table behind the chart is
    displayed as well. Returns None.
    """
    plt.rcParams.update(plt.rcParamsDefault)
    plt.rc('xtick', labelsize = 8)    # fontsize of the tick labels
    plt.rc('ytick', labelsize = 8)    # fontsize of the tick labels
    color_1 = 'cornflowerblue'
    color_2 = 'lightblue'

    target = _positive_rate_per_category(df_variable, variable, target_variable, limite_categorias)
    # Both bar layers share this order so each "Yes" bar sits on its own category.
    category_order = target[variable].tolist()
    total = target.assign(**{target_variable: 1})

    # bar chart 1 -> full-height bars (group of 'target = 0' shows above the overlay)
    bar1 = sns.barplot(x = variable, y = target_variable, data = total
                      , color = color_1, order = category_order)

    # bar chart 2 -> overlay bars (group of 'target = 1')
    bar2 = sns.barplot(x = variable, y = target_variable, data = target
                      , color = color_2, order = category_order)

    # add legend
    top_bar = mpatches.Patch(color = color_1, label = target_variable + ' = No')
    bottom_bar = mpatches.Patch(color = color_2, label = target_variable + ' = Yes')

    bar2.tick_params(axis = 'x', labelrotation = rotation)
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
    plt.legend(handles=[top_bar, bottom_bar], loc = 'lower left', fontsize = 8)

    plt.title(title, fontsize = 10)

    # Flat column labels; a nested list would build MultiIndex columns.
    display(target.rename(columns = {target_variable: '% of Yes'}))
    return

# Boxplot for each target.
def box_plot_graph_hue(df, variable, title, y_lim, hue_var = 'target', hue_order = [1, 0]):
    """Draw one boxplot of `variable` per value of `hue_var`, side by side.

    `hue_var` is placed on the x axis and `hue_order` fixes the left-to-right
    order of its values. `y_lim` sets the y-axis limits and `title` the chart
    title. The mean of each group is drawn as a line.
    """
    plt.rcParams.update(plt.rcParamsDefault)

    # White plotting area with a black frame on top of the darkgrid style.
    sns.set_style("darkgrid", {'axes.facecolor': "white", 'axes.edgecolor': 'black'})

    # The mean is drawn as a black line across each box.
    mean_line_style = {'color': 'black', 'linewidth': 1.5}
    axes = sns.boxplot(data = df, x = hue_var, y = variable, order = hue_order
                      , showmeans = True, meanline = True
                      , meanprops = mean_line_style)

    axes.set_title(title, fontsize = 10)
    axes.set_ylim(y_lim)
    
    
# Boxplot of a variable per month when target = 1.
def _boxplot_target_ano_mes(df, variable, target, title, target_value, month_order = None):
    """Draw one boxplot of `variable` per 'year_month' for rows where `target` equals `target_value`."""
    if month_order is None:
        # Default x-axis order: '202001' ... '202112'.
        # NOTE(review): these labels are strings; if 'year_month' holds integers
        # they will not match the data - confirm the column type.
        month_order = [f'{year}{month:02d}' for year in (2020, 2021) for month in range(1, 13)]

    df_target = df[np.isin(df[target], target_value)]

    plt.rcParams.update(plt.rcParamsDefault)
    sns.set(rc = {'figure.figsize': (10, 6)})

    bg_color = "white"
    contorno = 'black'
    color = 'black'
    sns.set_style("darkgrid", {'axes.facecolor': bg_color, 'axes.edgecolor': contorno})

    # The mean of each month is drawn as a black line across the box.
    meanpointprops = dict(color = color, linewidth = 1.5)
    ax = sns.boxplot(y = variable, data = df_target, x = 'year_month', order = month_order
                    , showmeans = True, meanline = True, meanprops = meanpointprops)
    ax.tick_params(axis = 'x', labelrotation = 90)
    ax.set_title(title, fontsize = 10)

    plt.show()
    plt.close()


def boxplot_target_1_ano_mes(df, variable, target, title, month_order = None):
    """Boxplot of `variable` per month, restricted to rows where `target` equals 1.

    Parameters
    ----------
    df : pandas.DataFrame with 'year_month', `variable` and `target` columns.
    variable : str, column drawn on the y axis.
    target : str, name of the target column used to filter the rows.
    title : chart title.
    month_order : optional list of 'year_month' values fixing the x-axis order;
        defaults to '202001' through '202112'.
    """
    _boxplot_target_ano_mes(df, variable, target, title, target_value = 1, month_order = month_order)


# Boxplot of a variable per month when target = 0.
def boxplot_target_0_ano_mes(df, variable, target, title, month_order = None):
    """Boxplot of `variable` per month, restricted to rows where `target` equals 0.

    Parameters
    ----------
    df : pandas.DataFrame with 'year_month', `variable` and `target` columns.
    variable : str, column drawn on the y axis.
    target : str, name of the target column used to filter the rows.
    title : chart title.
    month_order : optional list of 'year_month' values fixing the x-axis order;
        defaults to '202001' through '202112'.
    """
    _boxplot_target_ano_mes(df, variable, target, title, target_value = 0, month_order = month_order)

In [None]:
# Study the correlation among variables.
def correlation_heatmap(df, title):
    """Draw an annotated heatmap of the pairwise correlations in `df`.

    Parameters
    ----------
    df : pandas.DataFrame; only its numeric and boolean columns are correlated.
    title : chart title.
    """
    size_x = 10
    size_y = 10
    plt.figure(figsize = (size_x, size_y))
    sns.set(font_scale = 1)
    # Restrict to numeric/boolean columns so a text column (for example one
    # cast to str) cannot make the correlation computation raise.
    corr_matrix = df.select_dtypes(include = ['number', 'bool']).corr()
    with sns.axes_style('white'):
        ax = sns.heatmap(corr_matrix
                        , linewidth = 0.2
                        , annot = True, fmt = '.1f'
                        , cmap = 'seismic'
                        , vmin = -1, vmax = 1)
    plt.title(title)
    # Widen the y limits by half a cell at each end.
    # NOTE(review): presumably a workaround for Matplotlib releases that cropped
    # the first and last heatmap rows - confirm it is still needed.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.show()
    plt.close()
    
# Leave out the variables that were discarded in the previous study.
drop_variables_2 = ['col_1', 'col_7']
df_corr = df.drop(columns = drop_variables_2).copy()

title = 'HeatMap: Correlation among variables.'
correlation_heatmap(df_corr, title)

In [None]:
# Drop some variables because of high correlation.
drop_variaveis_3 = ['col_2', 'col_8']
df_pairplot = df_corr.drop(drop_variaveis_3, axis = 1).copy()
# Duplicate the target: 'target_hue' colors the points while 'target' itself
# stays in the grid as one of the plotted variables.
df_pairplot['target_hue'] = df_pairplot['target']

# Pairplot. Too many variables may crash Jupyter.
df_pairplot_aux = df_pairplot[['col_3', 'col_4', 'col_5', 'col_6', 'target', 'target_hue']]

# No plt.figure() call here: sns.pairplot creates its own figure, so a figure
# opened beforehand would only show up as an extra empty output.
sns.set(font_scale = 0.8)
plt.rc('xtick', labelsize = 6)    # fontsize of the tick labels
plt.rc('ytick', labelsize = 6)    # fontsize of the tick labels
with sns.axes_style('white'):
    ax = sns.pairplot(df_pairplot_aux, hue = "target_hue", hue_order = [0, 1]
                      , plot_kws = {'alpha': 0.4, 's': 40})
plt.show()
plt.close()