In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, rankdata

## Ch2. Summary Statistics

The presentation of summary statistics represents a trade-off between showing enough results to give the reader a clear understanding of the data and not presenting so much that the reader is overwhelmed.
In this part, we show the typical summary statistics for the cross-sectional distribution of a variable.

In [None]:
# The summary statistics procedure consists of two steps:
# 1. For each time period $t$, certain characteristics of the cross-sectional distribution of the given variable, $X$, are calculated.
# 2. The time-series properties of the periodic cross-sectional characteristics are calculated.
#   The most important time-series property is the mean.
#   The mean is the average value of the cross-sectional characteristic over time.

# Statistics for univariate distributions of the variables used in a study

In [None]:
#The details of the first step are as follows. For each time period t, we calculate the cross-sectional mean, standard deviation, skewness, excess kurtosis, minimum value, median value, maximum value, and selected additional 
#percentiles of the distribution of the values of X, where each of these statistics is calculated over all available values of X in period t. 
#We let $Mean_t$ be the mean, $SD_t$ denote the sample standard deviation, $Skew_t$ represent the sample skewness, $Kurt_t$ be the sample excess kurtosis, $Min_t$ be the minimum value, $Median_t$ denote the median value, and $Max_t$ represent the 
#maximum value of $X$ in period $t$. In addition, we will record the fifth, 25th, 75th, and 95th percentiles of $X$ in period $t$, which we denote $P5_t$, $P25_t$, $P75_t$, and $P95_t$, respectively. 
#Depending on the data and the objective of the study, it may be desirable to include additional percentiles of the distribution. 
#For example, if the study focuses on extreme values of X, then it may be valuable to record the first, second, third, fourth, 96th, 97th, 98th, and 99th percentiles of the distribution as well.
#Alternatively, calculating the minimum, maximum, fifth percentile, and 95th percentile of the data may not be necessary if the data are reasonably well behaved. 
#Exactly which statistics to record and present is a decision made by the researcher, who, presumably, has a much deeper understanding of the data than could possibly be presented in a research article.
#In addition to these statistics describing the time $t$ cross-sectional distribution of $X$, we also record the number of entities for which a valid value of $X$ is available in period $t$ and denote this number $n_t$.

In [None]:
# The objectives in analyzing the summary statistics are twofold. First, the summary statistics are intended to give a basic overview of the cross-sectional properties 
# of the variables that will be used in the study. This is useful for understanding the types of entities that comprise the sample. 
# Second, the summary statistics can be used to identify any potential issues that may arise when using these variables in statistical analyses.

First Stage

In [None]:
def cal_cs_stats(df, time_column, value_column, additional_percentiles=False):
    """
    Calculate cross-sectional statistics for each time period.

    For every distinct value of ``time_column`` the non-missing values of
    ``value_column`` are summarised by their mean, sample standard deviation
    (pandas default, ddof=1), skewness and excess kurtosis (SciPy defaults:
    biased estimators, Fisher definition), minimum, median, maximum, the
    5th/25th/75th/95th percentiles and the number of valid observations.

    Args:
        df (pd.DataFrame): The data frame containing the data.
        time_column (str): The name of the column representing time periods.
        value_column (str): The name of the column representing the values of X.
        additional_percentiles (bool or iterable of float): Optional extra
            percentiles. ``True`` adds the extreme-tail set
            (1st-4th and 96th-99th percentiles); an iterable of quantiles in
            [0, 1] (e.g. ``[0.10, 0.90]``) adds exactly those; ``False``,
            ``None`` or an empty iterable adds nothing.

    Returns:
        pd.DataFrame: One row per time period (sorted by period) with columns
        ``Time, Mean, SD, Skew, Kurt, Min, Median, Max, P5, P25, P75, P95, N``
        followed by one ``P<percent>`` column per additional percentile.
        An empty input yields an empty frame with the same columns.
    """
    default_extra_percentiles = [0.01, 0.02, 0.03, 0.04, 0.96, 0.97, 0.98, 0.99]

    # Accept either a plain on/off flag or an explicit list of quantiles.
    try:
        requested = iter(additional_percentiles)
    except TypeError:
        # Not iterable: treat as a truthy/falsy flag.
        extra_percentiles = default_extra_percentiles if additional_percentiles else []
    else:
        extra_percentiles = [float(p) for p in requested]

    def percentile_label(quantile):
        # Round before formatting: int(0.29 * 100) would truncate
        # 28.999999999999996 to 28 and mislabel the column.
        return f'P{round(quantile * 100, 10):g}'

    extra_labels = {percentile_label(p): p for p in extra_percentiles}

    base_columns = ['Time', 'Mean', 'SD', 'Skew', 'Kurt', 'Min', 'Median',
                    'Max', 'P5', 'P25', 'P75', 'P95', 'N']
    # dict.fromkeys de-duplicates while preserving order (e.g. a requested
    # 0.05 maps onto the existing 'P5' column).
    columns = list(dict.fromkeys(base_columns + list(extra_labels)))

    # Function to calculate the required statistics for a given period
    def calc_period_stats(period, group):
        # Drop missing values once; every statistic below ignores them anyway.
        values = group[value_column].dropna()
        as_array = values.to_numpy(dtype=float)
        stats = {
            'Time': period,
            'Mean': values.mean(),
            'SD': values.std(),
            'Skew': float(skew(as_array)),
            'Kurt': float(kurtosis(as_array)),
            'Min': values.min(),
            'Median': values.median(),
            'Max': values.max(),
            'P5': values.quantile(0.05),
            'P25': values.quantile(0.25),
            'P75': values.quantile(0.75),
            'P95': values.quantile(0.95),
            'N': values.count(),
        }
        for label, quantile in extra_labels.items():
            stats[label] = values.quantile(quantile)
        return stats

    # Iterate over the groups instead of using groupby.apply: the group key is
    # taken directly, so the result does not depend on whether pandas passes
    # the grouping column to the applied function, and building the frame
    # from records lets pandas infer a proper dtype for each column.
    rows = [calc_period_stats(period, group)
            for period, group in df.groupby(time_column)]

    return pd.DataFrame(rows, columns=columns)

Second Stage

Calculate the time-series averages of the periodic cross-sectional values.
    $Mean$: time-series average of the values of $Mean_t$ over all periods $t$ in the sample.

Calculate the time-series means of the cross-sectional summary statistics

In [None]:
def cal_ts_mean(stats_df):
    """
    Average each cross-sectional statistic over all time periods.

    Args:
        stats_df (pd.DataFrame): Per-period cross-sectional statistics; must
            contain a 'Time' column, which is excluded from the averaging.

    Returns:
        pd.DataFrame: A single-row data frame (index 0) whose columns are the
        statistics and whose values are their time-series means.
    """
    # The period identifier is not a statistic, so leave it out.
    statistic_columns = stats_df.drop(columns=['Time'])

    # Column-wise mean gives a Series; turn it into one wide row.
    return statistic_columns.mean().to_frame().T