In [4]:
#importing all the necessary libraries
import numpy as np
import pandas as pd
from scipy.stats import chi2

In [5]:
df = pd.read_csv('/kaggle/input/120-years-olympics-history/athlete_events.csv')

In [6]:
# first few columns of dataset
df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [7]:
def little_mcar_test(data):
    """
    Perform Little's MCAR test on a pandas dataframe.

    Parameters:
    data (DataFrame): The dataset to test for MCAR (missing completely at random).

    Returns:
    dict: A dictionary containing the chi-square statistic, degrees of freedom, and p-value.
    """

    # Get the number of rows in the data
    n = len(data)

    # List to hold binary masks indicating missing values for each column
    groups = []

    # Loop through each column in the dataframe
    for col in data.columns:
        # Create a boolean mask where True indicates a missing value
        mask = data[col].isnull()

        # If there are any missing values in this column
        if mask.any():
            # Convert the mask to an integer (1 for missing, 0 for not missing)
            # and reshape it to make sure it's a column vector
            groups.append(mask.astype(int).values.reshape(-1, 1))

    # If no missing data was found in the dataset, raise an error
    if len(groups) == 0:
        raise ValueError("No missing data found.")

    # Concatenate all the binary masks column-wise to create a matrix
    # This matrix will indicate which values are missing across columns
    r = np.concatenate(groups, axis=1)

    # Compute the "group_stats" matrix, which is essentially the dot product of
    # the transpose of r with r itself. This captures how often missing values
    # coincide across columns.
    group_stats = r.T @ r

    # The number of groups (columns with missing data)
    m = len(groups)

    # Degrees of freedom is computed as (n - 1) * m, where n is the number of rows
    # and m is the number of columns with missing values
    df = (n - 1) * m

    # The chi-square statistic is the trace of the group_stats matrix
    # (i.e., the sum of the diagonal elements)
    chi2_stat = group_stats.trace()

    # Compute the p-value using the chi-square survival function (sf), which gives
    # the probability of observing a chi-square statistic at least as extreme as the
    # one calculated, under the null hypothesis that data is MCAR
    p_value = chi2.sf(chi2_stat, df)

    # Return the results as a dictionary
    return {"chi2_stat": chi2_stat, "degrees_of_freedom": df, "p_value": p_value}


In [8]:
result = little_mcar_test(df)


Printing the results using formatted strings

In [9]:
print(f"Chi-square statistic: {result['chi2_stat']}")
print(f"Degrees of freedom: {result['degrees_of_freedom']}")
print(f"P-value: {result['p_value']}")

Chi-square statistic: 363853
Degrees of freedom: 1084460
P-value: 1.0


**Interpretation:**
Chi-square statistic: The chi-square statistic measures how much the pattern of missing data deviates from what would be expected if the data were missing completely at random. A higher value indicates greater deviation.

**Degrees of freedom:** The degrees of freedom (1,084,460 in this case) reflect the complexity of the test, based on the number of variables and the size of the dataset.

**P-value:** The p-value is a key part of interpreting the test. In this case, the p-value is 1.0, which is extremely high. A p-value near 1 indicates strong evidence in favor of the null hypothesis, which in this case is that the data is missing completely at random (MCAR).

**Conclusion:**

*   Since the p-value is 1.0, this suggests that the missing data is very likely to be missing completely at random (MCAR). In other words, the pattern of missingness does not appear to be related to any other variables in the dataset
*   The chi-square statistic is high, but given the large degrees of freedom, this is not surprising. The test essentially compares observed vs. expected patterns of missingness, and the p-value indicates that any deviations from MCAR are statistically insignificant.

