In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, rankdata

# Ch2. Summary Statistics

The presentation of summary statistics represents a trade-off between showing enough results to give the reader a good understanding of the data and not presenting so much that the reader is overwhelmed.

This part shows the typical summary statistics for the cross-sectional distribution of a variable.

The summary statistics procedure consists of two steps:
  1. For each time period $t$, certain characteristics of the cross-sectional distribution of the given variable, $X$, are calculated.   
  2. The time-series properties of the periodic cross-sectional characteristics are calculated.    
  The most important time-series property is the mean.   
  Here, the mean is the average value of the cross-sectional characteristic over time.

# 1.1 Periodic Cross-Sectional Summary Statistics

First step for cross-sectional summary statistics.   

The calculation is performed separately for each period: $t-n$, $t-(n-1)$, ..., $t$.   

Calculate the mean, standard deviation, skewness, excess kurtosis, minimum, median, maximum, and selected additional percentile of the distribution of values of $X$, where each of these statistics is calculated over all available values of $X$ in time $t$.

In this chapter, we denote the statistics as:
* $Mean_t$   : mean
* $SD_t$     : standard deviation
* $Skew_t$   : skewness
* $Kurt_t$   : excess kurtosis
* $Min_t$    : minimum value
* $Median_t$ : median value
* $Max_t$    : maximum value of $X$ in period $t$

Percentile as:
* $P5_t$     : 5th percentile
* $P25_t$    : 25th percentile
* $P75_t$    : 75th percentile
* $P95_t$    : 95th percentile

Extreme values:
* $P1_t$     : 1st percentile
* $P2_t$     : 2nd percentile
* $P3_t$     : 3rd percentile
* $P4_t$     : 4th percentile

* $P96_t$    : 96th percentile
* $P97_t$    : 97th percentile
* $P98_t$    : 98th percentile
* $P99_t$    : 99th percentile

Observation
* $n_t$      : number of observations for which a valid value of $X$ is available in period $t$

In [None]:
# The objectives in analyzing the summary statistics are twofold. First, the summary statistics are intended to give a basic overview of the cross-sectional properties 
# of the variables that will be used in the study. This is useful for understanding the types of entities that comprise the sample. 
# Second, the summary statistics can be used to identify any potential issues that may arise when using these variables in statistical analyses.

First Stage

In [None]:
def cal_cs_stats(df, time_column, value_column, additional_percentiles=False):
    """
    Calculate cross-sectional statistics for each time period, handling NaN values and reporting them.

    For every distinct value of ``time_column`` the non-missing values of
    ``value_column`` are summarised by their mean, standard deviation
    (pandas ``Series.std``), skewness and kurtosis (``scipy.stats`` defaults),
    minimum, median, maximum, the 5th/25th/75th/95th percentiles, the number
    of valid observations and the number of missing observations.

    Rows whose ``value_column`` is NaN are printed before the calculation so
    that the caller can inspect them; they are excluded from every statistic
    except ``NaN_count``.

    Args:
        df (pd.DataFrame): The data frame containing the data.
        time_column (str): The name of the column representing time periods.
        value_column (str): The name of the column representing the values of X.
        additional_percentiles (bool or list/tuple of float): ``True`` adds the
            extreme percentiles 1-4 and 96-99. A list or tuple of fractions in
            [0, 1] adds exactly those percentiles. Any other value adds none.

    Returns:
        pd.DataFrame: One row per time period (sorted by period) with the
        columns ``Time``, ``Mean``, ``SD``, ``Skew``, ``Kurt``, ``Min``,
        ``Median``, ``Max``, ``P5``, ``P25``, ``P75``, ``P95``, ``N``,
        ``NaN_count`` followed by one ``P<k>`` column per additional
        percentile. An empty input yields an empty frame with these columns.
    """
    # Check for NaN values and report them
    nan_report = df[df[value_column].isna()]
    if not nan_report.empty:
        print("NaN values found in the following rows:")
        print(nan_report)

    # Define additional percentiles if required
    if additional_percentiles is True:
        extra_percentiles = [0.01, 0.02, 0.03, 0.04, 0.96, 0.97, 0.98, 0.99]
    elif isinstance(additional_percentiles, (list, tuple)):
        extra_percentiles = list(additional_percentiles)
    else:
        extra_percentiles = []

    def percentile_label(fraction):
        # 'g' formatting rounds away binary floating-point noise, so 0.29
        # (0.29 * 100 == 28.999999999999996) is labelled "P29" rather than
        # being truncated to "P28", and 0.995 becomes "P99.5" instead of
        # colliding with "P99".
        return f'P{fraction * 100:g}'

    def calc_period_stats(period, group):
        """Summarise the values of one time period as a plain dict."""
        values = group[value_column]
        clean = values.dropna()
        # scipy receives a NaN-free float array, so no nan_policy is needed.
        clean_array = clean.to_numpy(dtype=float)
        stats = {
            'Time': period,
            'Mean': clean.mean(),
            'SD': clean.std(),
            'Skew': skew(clean_array),
            'Kurt': kurtosis(clean_array),
            'Min': clean.min(),
            'Median': clean.median(),
            'Max': clean.max(),
            'P5': clean.quantile(0.05),
            'P25': clean.quantile(0.25),
            'P75': clean.quantile(0.75),
            'P95': clean.quantile(0.95),
            'N': clean.count(),
            'NaN_count': values.isna().sum(),  # Report the number of NaN values
        }
        for fraction in extra_percentiles:
            stats[percentile_label(fraction)] = clean.quantile(fraction)
        return stats

    base_columns = [
        'Time', 'Mean', 'SD', 'Skew', 'Kurt', 'Min', 'Median', 'Max',
        'P5', 'P25', 'P75', 'P95', 'N', 'NaN_count',
    ]
    extra_columns = [percentile_label(fraction) for fraction in extra_percentiles]
    # dict.fromkeys de-duplicates while keeping order (e.g. a requested 0.05
    # would otherwise repeat the built-in "P5" column).
    columns = list(dict.fromkeys(base_columns + extra_columns))

    # Iterate over the groups explicitly instead of GroupBy.apply: the period
    # is taken from the group key, so the code does not depend on the grouping
    # column being present inside each group (newer pandas stops passing it).
    rows = [
        calc_period_stats(period, group)
        for period, group in df.groupby(time_column)
    ]

    return pd.DataFrame(rows, columns=columns)

Second Stage

Calculate the time-series averages of the periodic cross-sectional values.
    $Mean$: time-series average of the values of $Mean_t$ over all periods $t$ in the sample.

Calculate the time-series means of the cross-sectional summary statistics.

In [None]:
def cal_ts_mean(stats_df):
    """
    Average each cross-sectional statistic over all time periods.

    Args:
        stats_df (pd.DataFrame): Per-period cross-sectional statistics, with
            one row per period and a 'Time' column identifying the period.

    Returns:
        pd.DataFrame: A single-row data frame (index 0) whose columns are the
        statistic columns of ``stats_df`` (everything except 'Time') and whose
        values are the column means across periods.
    """
    # The period identifier is not a statistic, so it is left out of the mean.
    period_stats = stats_df.drop(columns=['Time'])

    # Column-wise mean gives a Series; turn it into a one-row frame.
    return period_stats.mean().to_frame().T