# Data Cleaning Function
#### def data_cleaning(df, replace_missing_value)
#### returns the cleaned dataframe

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

def _report_basic_data_info(df):
    """Print the frame's ``info()`` summary and the per-column NaN counts."""
    print('\nData Cleaning Report - Basic Data Informations\n')
    # DataFrame.info() prints on its own and returns None, so wrapping it in
    # print() would only add a stray "None" line to the report.
    df.info()

    print('\nData Cleaning Report - Summary of total "NAN" rows\n')
    print(df.isna().sum())


def _drop_single_value_features(df):
    """Drop, in place, columns holding at most one distinct non-NaN value."""
    if len(df) == 0:
        # With zero rows every column trivially has no distinct value; keep
        # the schema instead of dropping everything.
        constant_columns = []
    else:
        # nunique(dropna=True) ignores NaN, so "one value plus NaN" and
        # "all NaN" columns are both recognised as carrying no information.
        distinct_counts = df.nunique(dropna=True)
        constant_columns = list(distinct_counts[distinct_counts <= 1].index)

    if constant_columns:
        print('\nData Cleaning Report - Individual Value feature removed : ', constant_columns)
        df.drop(constant_columns, axis=1, inplace=True)
    else:
        print("\nData Cleaning Report - No Individual Value feature Found !!\n")

    return df


def _replace_missing_data(df, replace_missing_value):
    """Overwrite, in place, every ``' '`` cell and every NaN cell with the given value."""
    blank_rows = {}  # column name -> index labels of cells equal to ' '
    nan_rows = {}    # column name -> index labels of NaN cells

    for column in df.columns:
        series = df[column]
        # fillna/astype guard against nullable dtypes whose comparison
        # result may itself contain missing values.
        blank_mask = (series == ' ').fillna(False).astype(bool)
        # NaN never compares equal to anything (including itself), so it has
        # to be detected with isna() rather than "== np.nan".
        nan_mask = series.isna()

        if blank_mask.any():
            blank_rows[column] = df.index[blank_mask]
        if nan_mask.any():
            nan_rows[column] = df.index[nan_mask]

        missing_mask = blank_mask | nan_mask
        if missing_mask.any():
            # A boolean mask targets exactly the missing rows even when the
            # index contains duplicate labels.
            df.loc[missing_mask, column] = replace_missing_value

    # Both kinds of missing data are reported independently; a frame can
    # contain blanks and NaNs at the same time.
    if blank_rows:
        print('\nData Cleaning Report - Have Missing "Empty Space" Data :\n', blank_rows)
    if nan_rows:
        print('\nData Cleaning Report - Have Missing "NaN" Data :\n', nan_rows)
    if not blank_rows and not nan_rows:
        print('\nData Cleaning Report - No Missing Data or "Nan" row Found !!\n')

    return df


def _replace_blank_spaces(df):
    """Replace spaces with underscores in column names and in string cells, in place."""
    df.columns = df.columns.str.replace(' ', '_')
    # regex=True makes the replacement apply inside strings, not only to
    # cells that are exactly ' '.
    df.replace(' ', '_', regex=True, inplace=True)
    return df


def data_cleaning(df, replace_missing_value):
    """Run the data cleaning pipeline on ``df`` and return the cleaned frame.

    Steps, in order:
      1. Print ``df.info()`` and the per-column NaN counts.
      2. Drop columns that hold at most one distinct non-NaN value.
      3. Replace cells that are ``' '`` or NaN with ``replace_missing_value``.
      4. Replace spaces with underscores in column names and string cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    replace_missing_value : scalar
        Value written into missing cells. NOTE(review): it should be
        compatible with the dtype of the columns that contain missing data
        (e.g. a number for float columns) - confirm against callers.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after cleaning.
    """
    _report_basic_data_info(df)
    df = _drop_single_value_features(df)
    df = _replace_missing_data(df, replace_missing_value)
    df = _replace_blank_spaces(df)

    return df

In [2]:
def imbalance_data_check(df,label_name):
    """Report and plot the class distribution of a categorical label column.

    Suitable for categorical class labels (ordinal or nominal).

    Prints a frequency table of ``label_name``, prints the percentage share
    of each class, and draws a seaborn bar plot of the class counts where
    each bar is annotated with its percentage share.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the label column.
    label_name : str
        Name of the label column to analyse.

    Returns
    -------
    None
    """
    ## Frequency Table of Label data ##
    # Group by the requested label column (not a hard-coded column name) so
    # the function works for any label.
    freq_table = df.groupby([label_name]).size().reset_index(name='Count')

    print('\n Imbalance Data Check - Frequency Table of Label Data :\n',freq_table,'\n')

    ## % of label's class distribution data Summary ##
    total_count = freq_table['Count'].sum()
    percentages = freq_table['Count'] / total_count * 100

    # One single-row column per class, holding that class's percentage share.
    summary = {
        label: [share]
        for label, share in zip(freq_table[label_name], percentages)
    }
    summary_df = pd.DataFrame(data=summary)

    print(f'Imbalance Data Check - Label Data Distribution % :\n{summary_df}')

    ##  Plot Figure of Label class data distribution ##
    print(f'\nImbalance Data Check - Data Distribution % Summary Plot :\n') # Imbalance data check summary

    sns.set_theme(style="whitegrid")
    ax = sns.barplot(x=label_name, y="Count", data=freq_table)

    # Bars are drawn at categorical positions 0..k-1 with height == Count, so
    # the annotation is anchored at (position, count) and shows the
    # percentage share as its text.
    for position, (count, share) in enumerate(zip(freq_table['Count'], percentages)):
        ax.text(position, count, round(share, 2), color='black', ha="center", va="bottom")

    return

In [None]:
def numeric_histogram_plot_for_classification_model(df,label_feature,save_photo=False):
    """Draw one seaborn ``displot`` per numeric feature, coloured by the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to plot.
    label_feature : str
        Name of the label column; used as ``hue`` and excluded from the
        plotted columns.
    save_photo : bool, default False
        When True, each plot is also saved as ``"<column>.png"`` in the
        current working directory.

    Returns
    -------
    None
    """
    # subplot link : https://datavizpyr.com/seaborn-join-two-plots-with-shared-y-axis/
    import seaborn as sns

    numeric_columns = df.select_dtypes(
        include=['int32', 'int64', 'float64', 'float32']
    ).columns
    plot_columns = [column for column in numeric_columns if column != label_feature]

    for column in plot_columns:
        # displot is figure-level and creates its own figure; opening another
        # figure beforehand would only leave an empty one behind.
        grid = sns.displot(data=df, x=column, hue=label_feature, element="step", kde=True)
        grid.ax.set_title(f"{column} - Displot Plot")

        # Explicit comparison kept so that only True (or values equal to it)
        # triggers saving, exactly as before.
        if save_photo == True:  # noqa: E712
            grid.savefig(f"{column}.png")     # auto Saved figure file

    return
