In [13]:
import pandas as pd

# 加載數據集
file_path = 'C:/Users/b0983/OneDrive/桌面/類別資料分析/Obesity Risk/train.csv'  # 請替換為你的文件路徑
data = pd.read_csv(file_path)

# 將目標變數映射到數值類別
target_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
data['NObeyesdad'] = data['NObeyesdad'].map(target_mapping)

# 計算每個類別變數的數量和百分比
def calculate_counts_and_percentages(df, column, target):
    counts = df.groupby([target, column]).size().unstack().fillna(0).astype(int)
    percentages = counts.div(counts.sum(axis=0), axis=1) * 100
    return counts, percentages

# 定義變數
categorical_vars = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

# 計算結果
results = {}
for var in categorical_vars:
    counts, percentages = calculate_counts_and_percentages(data, var, 'NObeyesdad')
    results[var] = {'counts': counts, 'percentages': percentages}

# 輸出結果
for var, result in results.items():
    print(f"變數名稱: {var}")
    print("數量:")
    print(result['counts'])
    print("百分比:")
    print(result['percentages'])
    print("\n")


變數名稱: Gender
數量:
Gender      Female  Male
NObeyesdad              
0             1621   902
1             1660  1422
2             1070  1357
3              755  1767
4             1267  1643
5                8  3240
6             4041     5
百分比:
Gender         Female       Male
NObeyesdad                      
0           15.553637   8.726780
1           15.927845  13.757740
2           10.266743  13.128870
3            7.244291  17.095588
4           12.156976  15.895898
5            0.076761  31.346749
6           38.773748   0.048375


變數名稱: family_history_with_overweight
數量:
family_history_with_overweight    no   yes
NObeyesdad                                
0                               1449  1074
1                               1292  1790
2                                756  1671
3                                193  2329
4                                 48  2862
5                                  5  3243
6                                  1  4045
百分比:
family_history_with_o

In [14]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, chi2_contingency, kruskal
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# 計算連續型變數之間的Pearson相關係數
continuous_vars = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
correlation_matrix = data[continuous_vars].corr(method='pearson')

# 計算VIF值
X = data[continuous_vars]
X = StandardScaler().fit_transform(X)
vif_data = pd.DataFrame()
vif_data["feature"] = continuous_vars
vif_data["VIF"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]

# 顯示連續變數的Pearson相關係數和VIF值
print("Pearson相關係數矩陣:")
print(correlation_matrix)
print("\nVIF值:")
print(vif_data)


Pearson相關係數矩陣:
             Age    Height    Weight      FCVC       NCP      CH2O       FAF  \
Age     1.000000 -0.011713  0.283381  0.034414 -0.048479 -0.016325 -0.192259   
Height -0.011713  1.000000  0.416677 -0.071546  0.191383  0.183706  0.295278   
Weight  0.283381  0.416677  1.000000  0.245682  0.095947  0.317914 -0.084845   
FCVC    0.034414 -0.071546  0.245682  1.000000  0.113349  0.101299 -0.089822   
NCP    -0.048479  0.191383  0.095947  0.113349  1.000000  0.080949  0.100871   
CH2O   -0.016325  0.183706  0.317914  0.101299  0.080949  1.000000  0.082932   
FAF    -0.192259  0.295278 -0.084845 -0.089822  0.100871  0.082932  1.000000   
TUE    -0.296154  0.076433 -0.086471 -0.147843  0.067459 -0.010654  0.021213   

             TUE  
Age    -0.296154  
Height  0.076433  
Weight -0.086471  
FCVC   -0.147843  
NCP     0.067459  
CH2O   -0.010654  
FAF     0.021213  
TUE     1.000000  

VIF值:
  feature       VIF
0     Age  1.251635
1  Height  1.504341
2  Weight  1.667031
3    F

In [15]:
# 計算類別型變數之間的卡方檢定和Cramer's V值
categorical_vars = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
chi2_results = []
cramers_v_results = []

for var1 in categorical_vars:
    for var2 in categorical_vars:
        if var1 != var2:
            contingency_table = pd.crosstab(data[var1], data[var2])
            chi2, p, dof, ex = chi2_contingency(contingency_table)
            n = contingency_table.sum().sum()
            phi2 = chi2/n
            r, k = contingency_table.shape
            cramers_v = np.sqrt(phi2 / min(k-1, r-1))
            chi2_results.append((var1, var2, chi2, p))
            cramers_v_results.append((var1, var2, cramers_v))

chi2_results_df = pd.DataFrame(chi2_results, columns=['Var1', 'Var2', 'Chi2', 'P'])
cramers_v_results_df = pd.DataFrame(cramers_v_results, columns=['Var1', 'Var2', 'Cramer\'s V'])

print("\n卡方檢定結果:")
print(chi2_results_df)
print("\nCramer's V值:")
print(cramers_v_results_df)


卡方檢定結果:
                              Var1                            Var2  \
0                           Gender  family_history_with_overweight   
1                           Gender                            FAVC   
2                           Gender                            CAEC   
3                           Gender                           SMOKE   
4                           Gender                             SCC   
5                           Gender                            CALC   
6                           Gender                          MTRANS   
7   family_history_with_overweight                          Gender   
8   family_history_with_overweight                            FAVC   
9   family_history_with_overweight                            CAEC   
10  family_history_with_overweight                           SMOKE   
11  family_history_with_overweight                             SCC   
12  family_history_with_overweight                            CALC   
13  family_

In [17]:
# 轉換二元類別型變數為數值型
binary_vars = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
data[binary_vars] = data[binary_vars].apply(lambda x: x.map({'yes': 1, 'no': 0, 'Male': 1, 'Female': 0}))

# 計算連續型變數與二元類別型變數之間的點二系列相關係數
continuous_vars = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
point_biserial_results = []

for binary_var in binary_vars:
    for continuous_var in continuous_vars:
        r, p = pearsonr(data[continuous_var], data[binary_var])
        point_biserial_results.append((binary_var, continuous_var, r, p))

point_biserial_df = pd.DataFrame(point_biserial_results, columns=['Binary Var', 'Continuous Var', 'Correlation', 'P'])

print("\n點二系列相關係數結果:")
print(point_biserial_df)


點二系列相關係數結果:
                        Binary Var Continuous Var  Correlation              P
0                           Gender            Age     0.066530   8.357109e-22
1                           Gender         Height     0.623507   0.000000e+00
2                           Gender         Weight     0.118613   6.410444e-66
3                           Gender           FCVC    -0.312766   0.000000e+00
4                           Gender            NCP     0.039468   1.283082e-08
5                           Gender           CH2O     0.053298   1.541628e-14
6                           Gender            FAF     0.238399  4.374810e-266
7                           Gender            TUE     0.058696   2.596156e-17
8   family_history_with_overweight            Age     0.261575  7.608611e-322
9   family_history_with_overweight         Height     0.228137  3.024488e-243
10  family_history_with_overweight         Weight     0.514147   0.000000e+00
11  family_history_with_overweight           FCVC  

In [18]:
# 計算連續型變數與三類別以上的類別型變數之間的Kruskal-Wallis檢定
multi_category_vars = ['MTRANS']
kruskal_results = []

for multi_var in multi_category_vars:
    for continuous_var in continuous_vars:
        groups = [group[continuous_var].values for name, group in data.groupby(multi_var)]
        stat, p = kruskal(*groups)
        kruskal_results.append((multi_var, continuous_var, stat, p))

kruskal_df = pd.DataFrame(kruskal_results, columns=['Multi Category Var', 'Continuous Var', 'Statistic', 'P'])

print("\nKruskal-Wallis檢定結果:")
print(kruskal_df)


Kruskal-Wallis檢定結果:
  Multi Category Var Continuous Var    Statistic              P
0             MTRANS            Age  4331.272509   0.000000e+00
1             MTRANS         Height   163.563359   2.515371e-34
2             MTRANS         Weight   224.631414   1.888854e-47
3             MTRANS           FCVC   254.912592   5.691009e-54
4             MTRANS            NCP    86.230125   8.316742e-18
5             MTRANS           CH2O    88.567223   2.653463e-18
6             MTRANS            FAF   224.991928   1.579807e-47
7             MTRANS            TUE   925.237855  5.666484e-199


In [19]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# 連續型變數
continuous_vars = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
# One-Hot Encoding後的變數
encoded_vars = pd.get_dummies(data['MTRANS'], prefix='MTRANS')

# 合併數據
data_vif = pd.concat([data[continuous_vars], encoded_vars], axis=1)

# 計算VIF值
vif_data = pd.DataFrame()
vif_data["feature"] = data_vif.columns
vif_data["VIF"] = [variance_inflation_factor(data_vif.values, i) for i in range(data_vif.shape[1])]

print(vif_data)


                         feature         VIF
0                            Age    2.081760
1                         Height    1.544147
2                         Weight    1.797170
3                           FCVC    1.161324
4                            NCP    1.070560
5                           CH2O    1.143642
6                            FAF    1.217090
7                            TUE    1.138149
8              MTRANS_Automobile  109.417421
9                    MTRANS_Bike    1.970972
10              MTRANS_Motorbike    2.147783
11  MTRANS_Public_Transportation  474.075085
12                MTRANS_Walking   14.775276
