In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

# Utils

In [None]:
def _fit_normal(values):
    """Estimate Normal parameters from the sample mean and standard deviation."""
    mean, std = values.mean(), values.std()
    # The QQ-plot is drawn against the standard Normal (no sparams); only the
    # KS-test needs the estimated location and scale.
    return (), (mean, std), f'Estimated parameters: Mean = {mean}, Std = {std}'


def _fit_t(values):
    """Fit a Student-t distribution by maximum likelihood."""
    # scipy's fit returns the shape parameter (degrees of freedom) first,
    # followed by loc and scale, so three values must be unpacked.
    dof, loc, scale = stats.t.fit(values)
    params = (dof, loc, scale)
    summary = (f'Estimated parameters: Degrees of freedom = {dof}, '
               f'Location = {loc}, Scale = {scale}')
    return params, params, summary


def _fit_expon(values):
    """Fit an Exponential distribution (loc, scale)."""
    loc, scale = stats.expon.fit(values)
    params = (loc, scale)
    return params, params, f'Estimated parameters: Location = {loc}, Scale = {scale}'


def _fit_poisson(values):
    """Use the sample mean as the Poisson rate."""
    lambda_ = np.mean(values)
    params = (lambda_,)
    return params, params, f'Estimated parameter: Lambda = {lambda_}'


def _fit_shape_loc_scale(dist, values, **fit_kwargs):
    """Fit a one-shape-parameter scipy distribution and describe the result."""
    shape, loc, scale = dist.fit(values, **fit_kwargs)
    params = (shape, loc, scale)
    summary = f'Estimated parameters: Shape = {shape}, Location = {loc}, Scale = {scale}'
    return params, params, summary


def _fit_gamma(values):
    """Fit a Gamma distribution."""
    return _fit_shape_loc_scale(stats.gamma, values)


def _fit_beta(values):
    """Fit a Beta distribution (a, b, loc, scale)."""
    a, b, loc, scale = stats.beta.fit(values)
    params = (a, b, loc, scale)
    summary = f'Estimated parameters: a = {a}, b = {b}, Location = {loc}, Scale = {scale}'
    return params, params, summary


def _fit_lognorm(values):
    """Fit a Log-Normal distribution with the location pinned at 0."""
    return _fit_shape_loc_scale(stats.lognorm, values, floc=0)


def _fit_weibull(values):
    """Fit a Weibull (minimum) distribution."""
    return _fit_shape_loc_scale(stats.weibull_min, values)


# (scipy distribution name, display name, indefinite article, fitter).
# Each fitter returns (QQ-plot sparams, KS-test args, parameter summary text).
_CANDIDATE_DISTRIBUTIONS = (
    ('norm', 'Normal', 'a', _fit_normal),
    ('t', 't-Student', 'a', _fit_t),
    ('expon', 'Exponential', 'an', _fit_expon),
    ('poisson', 'Poisson', 'a', _fit_poisson),
    ('gamma', 'Gamma', 'a', _fit_gamma),
    ('beta', 'Beta', 'a', _fit_beta),
    ('lognorm', 'Log-Normal', 'a', _fit_lognorm),
    ('weibull_min', 'Weibull', 'a', _fit_weibull),
)


def _check_candidate(values, feature, label, scipy_name, display_name, article, fitter, alpha):
    """Fit one candidate distribution, draw its QQ-plot and print the KS-test verdict."""
    tag = f'(Feature {feature}, Label {label})'
    fig = None
    try:
        # Fit once and reuse the parameters for both the QQ-plot and the KS-test.
        qq_sparams, ks_args, summary = fitter(values)

        fig = plt.figure(figsize=(12, 6))
        stats.probplot(values, dist=scipy_name, sparams=qq_sparams, plot=plt)
        plt.title(f'QQ-Plot of Feature {feature} for Label {label} with {display_name} Distribution')
        plt.show()

        stat, p_value = stats.kstest(values, scipy_name, args=ks_args)
    except (ValueError, RuntimeError) as err:
        # A fit can legitimately fail (e.g. Log-Normal with loc fixed at 0 on
        # non-positive data); report it and let the other candidates run.
        print(f'Skipping {display_name} distribution {tag}: {err}')
        return
    finally:
        if fig is not None:
            # Release the figure so repeated calls do not accumulate open figures.
            plt.close(fig)

    print(f'KS-test for {display_name} distribution {tag}: Statistic = {stat}, p-value = {p_value}')
    if p_value > alpha:
        print(f"Data seems to follow {article} {display_name} distribution {tag}.")
        print(summary)
    else:
        print(f"Data does not follow {article} {display_name} distribution {tag}.")


def analyze_distribution(data, feature, label, alpha=0.05):
    """Plot one feature and test it against several candidate distributions.

    Draws a histogram with a KDE overlay, then for each of Normal, t-Student,
    Exponential, Poisson, Gamma, Beta, Log-Normal and Weibull: estimates the
    parameters, shows a QQ-plot and prints a Kolmogorov-Smirnov test result.

    Args:
        data: DataFrame-like object; ``data[feature]`` must yield a pandas
            Series of numeric values.
        feature: Column name to analyze.
        label: Label value used only in plot titles and printed messages.
        alpha: Significance level; a KS p-value above it is reported as
            "seems to follow". Defaults to 0.05.

    Returns:
        None. Results are shown as figures and printed text.

    Notes:
        * Missing values are dropped before analysis.
        * The KS-test parameters are estimated from the same sample, which makes
          the p-values optimistic; treat the verdicts as a screening aid.
        * The KS-test assumes a continuous distribution, so the Poisson result
          is only indicative.
    """
    values = data[feature].dropna()
    if len(values) < 2:
        print(f'Skipping Feature {feature}, Label {label}: '
              f'need at least 2 non-missing values, got {len(values)}.')
        return

    # Histogram with KDE overlay.
    fig = plt.figure(figsize=(12, 6))
    sns.histplot(values, kde=True, label=f'Label {label}', bins=30)
    plt.title(f'Distribution of Feature {feature} for Label {label}')
    plt.legend()
    plt.show()
    plt.close(fig)

    # QQ-plot and KS-test for every candidate distribution, in a fixed order.
    for scipy_name, display_name, article, fitter in _CANDIDATE_DISTRIBUTIONS:
        _check_candidate(values, feature, label, scipy_name, display_name, article, fitter, alpha)

# Data

In [None]:
import os

# Directory holding the processed feature CSVs. Set THYROID_FNA_DATA_DIR to run
# the notebook on another machine; the default is the original author's path.
DATA_DIR = os.environ.get(
    'THYROID_FNA_DATA_DIR',
    '/home/haipn/Clone/Research_ThyroidFNA_ClassAI/phase2_280824/data/processed',
)

# Later cells read the CSV files by bare file name, so switch into that directory.
os.chdir(DATA_DIR)

In [None]:
# Load the train / validation / test feature tables from CSV
# (file names are resolved against the current working directory).
train_df, valid_df, test_df = map(
    pd.read_csv,
    ('train_features.csv', 'valid_features.csv', 'test_features.csv'),
)

# Feature columns and class labels to analyze.
features = ['dim_0', 'dim_1', 'dim_2']
labels = [0, 1, 2]

# Main

In [None]:
# Run the distribution analysis for every (dataset, feature, label) combination.
datasets = dict(Train=train_df, Valid=valid_df, Test=test_df)

for dataset_name in datasets:
    dataset = datasets[dataset_name]
    print(f'Analyzing dataset: {dataset_name}')
    for feature in features:
        for label in labels:
            # Restrict the split to the rows carrying the current label.
            label_mask = dataset['label'] == label
            data = dataset[label_mask]
            analyze_distribution(data, feature, label)