In [None]:
# Import the library
import matplotlib
matplotlib.use('PDF')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from scipy.stats import gaussian_kde
from scipy import stats
import statsmodels.api as sm
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Load the two CSV files used by the plotting cells below.
# NOTE(review): presumably the raw BitcoinHeist data and its balanced/preprocessed
# counterpart (judging by file and variable names) — confirm.
# NOTE(review): absolute paths under /root; adjust when running on another machine.
original_df = pd.read_csv('/root/data/BitcoinHeistData.csv') 
Processed_df = pd.read_csv('/root/BTC_BalancedData_Inversed_2.csv')

In [None]:
# Global matplotlib font settings applied to every figure created afterwards.
font = dict(size=30)
matplotlib.rc('font', **font)

In [None]:
# Helper functions used by the main plotting routine
# Pie chart labels display the percentage and the absolute count together
def PercentageCal(df,HeadNum):
    """Split a series into its ``HeadNum`` largest entries plus one combined 'Rest' entry.

    Parameters
    ----------
    df : pandas.Series
        Numeric values indexed by category (the callers in this file pass
        ``value_counts()`` results).
    HeadNum : int
        Number of largest entries to keep as individual entries.

    Returns
    -------
    tuple
        ``(Percentage, Select, Rest, Combined, Org_Combined)``:

        * ``Percentage``   - every entry as a percentage of the total, sorted descending.
        * ``Select``       - the first ``HeadNum`` rows of ``Percentage``.
        * ``Rest``         - one-element series labelled ``'Rest'`` holding the summed
          percentage of all remaining rows.
        * ``Combined``     - ``Select`` followed by ``Rest``.
        * ``Org_Combined`` - same layout as ``Combined`` but in the original units of
          ``df`` (top ``HeadNum`` values followed by the sum of the remainder).
    """
    # Sort once so that percentages and original values share exactly the same row order.
    ordered = df.sort_values(ascending=False)

    Percentage = 100*(ordered/ordered.sum())
    Select = Percentage.head(HeadNum)
    Rest = pd.Series(Percentage.iloc[HeadNum:].sum(), index = ['Rest'])
    Combined = pd.concat([Select,Rest])

    # The individual entries must be the TOP HeadNum values (not the tail), so that
    # Org_Combined lines up label-for-label with Combined and sums to df.sum().
    Org_Select = ordered.head(HeadNum)
    Org_Rest = pd.Series(ordered.iloc[HeadNum:].sum(),index = ['Rest'])
    Org_Combined = pd.concat([Org_Select,Org_Rest])
    return Percentage, Select, Rest, Combined, Org_Combined

# Build the pie-chart label formatter (percentage + absolute count)
def pct_value(val):
    """Return an ``autopct``-style callback that labels a slice with percent and count.

    Parameters
    ----------
    val : array-like
        The values of all slices; their sum is the total that a percentage refers to.

    Returns
    -------
    callable
        ``calc_pct(pct) -> str`` producing ``"<pct with 1 decimal>%\\n<absolute count>"``.
    """
    total = np.sum(val)

    def calc_pct(pct):
        # Round instead of truncating: the percentage handed to the callback is a
        # float and can sit a hair below the exact value (e.g. 69.9999988 for 7/10),
        # in which case int() alone would report 6 instead of 7.
        absolute = int(round(pct/100.*total))
        return "{:.1f}%\n{:d}".format(pct, absolute)
    return calc_pct
# Histogram with a fitted normal density curve (used for the 'QQ Plot' PDFs)
def normalization_plot(df,ax):
    """Draw a density histogram of ``df`` on ``ax`` and overlay the fitted normal PDF.

    Parameters
    ----------
    df : pandas.Series
        Values to plot; must provide ``hist``, ``min`` and ``max``.
    ax : matplotlib axes
        Axes receiving both the histogram and the red fitted curve.
    """
    # 30-bin, half-transparent histogram normalised to a density.
    df.hist(ax=ax, bins=30, density=True, alpha=0.5)

    # Fit a normal distribution and evaluate it on 100 evenly spaced points
    # spanning the observed range.
    loc, scale = stats.norm.fit(df)
    grid = np.linspace(df.min(), df.max(), 100)
    ax.plot(grid, stats.norm.pdf(grid, loc, scale), 'r-', linewidth=2)
# Pie plot function
def pie_plot(ax, df, PieHeadNum=5, HeadSelect=False, Plotstitle=''):
    """Draw a pie chart of ``df`` on ``ax`` with percent-and-count slice labels.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    df : pandas.Series
        Slice values; the index supplies the slice names.
    PieHeadNum : int
        How many of the largest entries stay separate when ``HeadSelect`` is on.
    HeadSelect : bool
        When ``True``, plot the grouped series from ``PercentageCal`` (top
        ``PieHeadNum`` entries plus 'Rest') instead of ``df`` itself.
    Plotstitle : str
        Title placed on the axes.
    """
    _, _, _, grouped_pct, grouped_raw = PercentageCal(df, PieHeadNum)

    if HeadSelect == True:
        # Slices come from the grouped percentages, labels from the grouped raw values.
        slices, label_source = grouped_pct, grouped_raw
    else:
        slices, label_source = df, df

    ax.pie(slices, labels=slices.index, autopct=pct_value(label_source))
    ax.set_title(Plotstitle)

In [None]:
# Main Function
def _save_single_page_pdf(fig, pdf_name):
    """Write ``fig`` as the only page of the PDF ``pdf_name``, then close the figure."""
    try:
        with PdfPages(pdf_name) as pdf:
            pdf.savefig(fig)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly; these
        # figures are very large, so release them as soon as they are on disk.
        plt.close(fig)


def ShowBalaPlots(df,tar_label,Processed = False,figsizeX = 50,figsizeY = 50,Kdedensity = True,ViolinVert = False):
    """Plot the distribution of one column, overall and per label group, into PDF files.

    Five PDF files are written to the current working directory, one per plot
    type (pie, KDE, histogram, violin, histogram with fitted normal curve).
    Each PDF holds a single page with five panels: the whole column followed by
    the four label groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to plot; must contain a ``'label'`` column and ``tar_label``.
    tar_label : str
        Name of the column whose distribution is plotted.
    Processed : bool
        Selects how groups are read from ``df['label']``: numeric codes 0-3
        when ``True``, string prefixes ``"m"``, ``"pa"``, ``"pr"``, ``"w"``
        otherwise. Also selects the ``'preprocessed '`` / ``'original '`` file
        name prefix.
    figsizeX, figsizeY : number
        Figure size used for every plot type except the pie charts, which use
        a fixed (70, 70).
    Kdedensity : bool
        Passed as ``density`` to the histogram panels.
    ViolinVert : bool
        Passed as ``vert`` to the violin panels.
    """
    tar_df = df[tar_label]

    # Acquire M, PA, PR, W data
    if Processed:
        m_df = df.loc[df['label'] == 0,tar_label]
        pa_df = df.loc[df['label'] == 1,tar_label]
        pr_df = df.loc[df['label'] == 2,tar_label]
        w_df = df.loc[df['label'] == 3,tar_label]
    else:
        m_df = df.loc[df['label'].str.startswith("m"),tar_label]
        pa_df = df.loc[df['label'].str.startswith("pa"),tar_label]
        pr_df = df.loc[df['label'].str.startswith("pr"),tar_label]
        w_df = df.loc[df['label'].str.startswith("w"),tar_label]

    tar_df_cnt = tar_df.value_counts()
    m_df_cnt = m_df.value_counts()
    pa_df_cnt = pa_df.value_counts()
    pr_df_cnt = pr_df.value_counts()
    w_df_cnt = w_df.value_counts()
    print(tar_df_cnt)
    print(m_df_cnt)
    print(pa_df_cnt)
    print(pr_df_cnt)
    print(w_df_cnt)

    # Base file name shared by all five PDFs; each plot type adds its own prefix.
    filename = 'original ' + tar_label + ' distribution.pdf'
    if Processed:
        filename = 'preprocessed ' + tar_label + ' distribution.pdf'

    # Per-panel metadata, in panel order (total first, then the four groups).
    # NOTE(review): 'Panuda' is possibly a misspelling of the group whose labels
    # start with "pa" — confirm the intended name before changing these strings.
    names = ['Total ', 'Montreal ', 'Panuda ', 'Princeton ', 'White ']
    titles = ['Total Distribution',
              'Montreal Data Distribution',
              'Panuda Data Distribution',
              'Princeton Data Distribution',
              'White Data Distribution']
    raw_series = [tar_df, m_df, pa_df, pr_df, w_df]
    count_series = [tar_df_cnt, m_df_cnt, pa_df_cnt, pr_df_cnt, w_df_cnt]
    pie_head_nums = [3, 3, 2, 2, 2]
    # NOTE(review): the KDE and histogram pages plot the RAW values for the total
    # panel but the VALUE COUNTS for the group panels. Kept as-is; confirm whether
    # all five panels were meant to use the same kind of data.
    kde_hist_series = [tar_df] + count_series[1:]

    # Pie Chart
    fig, ax = plt.subplots(1,5,figsize = (70,70))
    for axis, name, counts, head_num in zip(ax, names, count_series, pie_head_nums):
        pie_plot(axis, counts, HeadSelect = True, PieHeadNum = head_num,
                 Plotstitle = name + tar_label + ' Distribution')
    _save_single_page_pdf(fig, 'Pie Chart ' + filename)

    # KDE
    fig, ax = plt.subplots(5,1,figsize = (figsizeX,figsizeY))
    for axis, name, title, data in zip(ax, names, titles, kde_hist_series):
        data.plot.kde(ax=axis, legend = True, label = name)
        # Title goes on the panel that was just drawn.
        axis.set_title(title)
    _save_single_page_pdf(fig, 'KDE Chart ' + filename)

    # Hist
    fig, ax = plt.subplots(5,1,figsize = (figsizeX,figsizeY))
    for axis, name, title, data in zip(ax, names, titles, kde_hist_series):
        data.plot.hist(ax=axis, legend = True, density=Kdedensity, label = name)
        axis.set_title(title)
    _save_single_page_pdf(fig, 'Hist Plot ' + filename)

    # Violin Plot
    fig, ax = plt.subplots(5,1,figsize = (figsizeX,figsizeY))
    for axis, title, data in zip(ax, titles, raw_series):
        axis.violinplot(data, vert = ViolinVert)
        axis.set_title(title)
    _save_single_page_pdf(fig, 'Violin Plot ' + filename)

    # "QQ Plot" page: histogram of the value counts with a fitted normal curve
    fig, ax = plt.subplots(5,1,figsize = (figsizeX,figsizeY))
    for axis, title, counts in zip(ax, titles, count_series):
        normalization_plot(counts, axis)
        # Each panel carries its own group's title.
        axis.set_title(title)
    _save_single_page_pdf(fig, 'QQ Plot ' + filename)

In [None]:
# Example: plot the 'transaction_frequency' column of the preprocessed dataset
ShowBalaPlots(df=Processed_df, tar_label='transaction_frequency', Processed=True)