In [None]:
# pip install pandas
# pip install torch==2.3.0 torchvision==0.18.0
# pip install openpyxl

In [32]:
#import torch
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, fisher_exact


In [None]:
df = pd.read_csv('data/FGF14_pb_may_22.csv', delimiter=';', header=0)
print(df.head())

In [None]:
column_names = pd.DataFrame(df.columns, columns = ['Names'])
print(column_names.to_string(index=False))

### **Features Type**

In [None]:
counts = []

for column in df.columns:
    num_unique_values = df[column].nunique()
    counts.append([column, num_unique_values])

counts = pd.DataFrame(counts, columns=['Variable', 'Number_Unique'])
print(counts.to_string(index=False))

In [None]:
def categorize_var(number_unique):
    if number_unique <= 2:
        return "Binary"
    elif number_unique == 3:
        return "Group"
    else:
        return "Continuous"
    
counts['Variable_Type'] = counts['Number_Unique'].apply(categorize_var)
counts.at[0, 'Variable_Type'] = 'ID'
print(counts.to_string(index=False))

In [None]:
continuous_vars = counts[counts['Variable_Type'] == 'Continuous']['Variable'].tolist()
binary_vars = counts[counts['Variable_Type'] == 'Binary']['Variable'].tolist()
print(df.shape, len(continuous_vars), len(binary_vars))  

### **Summary Continuous Features**

In [None]:
columns_to_subset = ['genetics_conclusion'] + continuous_vars
df_continuous = df[columns_to_subset].copy()
df_continuous[continuous_vars] = df_continuous[continuous_vars].apply(pd.to_numeric, errors='coerce')
df_continuous

In [132]:
def calculate_stats_continuous(df, subset_column, subset_value):
    subset_df = df[df[subset_column] == subset_value]
    summary_stats_data = []

    for column in subset_df.columns:
        if pd.api.types.is_numeric_dtype(subset_df[column]):  
            mean = subset_df[column].mean()
            sd = subset_df[column].std()
            summary_stats_data.append({'Column': column, 'Mean': mean, 'SD': sd})

    summary_stats_df = pd.DataFrame(summary_stats_data)
    summary_stats_df['Mean ± SD'] = summary_stats_df.apply(lambda row: f"{row['Mean']:.2f} ± {row['SD']:.2f}", axis=1)
    summary_stats_df.drop(['Mean', 'SD'], axis=1, inplace=True)
    return summary_stats_df

In [135]:
result_FGF = calculate_stats_continuous(df_continuous, 'genetics_conclusion', 'FGF')
result_Neg = calculate_stats_continuous(df_continuous, 'genetics_conclusion', 'Negative')
result_MSA = calculate_stats_continuous(df_continuous, 'genetics_conclusion', 'MSA')

result_FGF.to_csv('data/result_FGF_cont.csv')
result_Neg.to_csv('data/result_Neg_cont.csv')
result_MSA.to_csv('data/result_MSA_cont.csv')

### **Summary Binary Features**

In [None]:
columns_to_subset = ['genetics_conclusion'] + binary_vars
df_binary = df[columns_to_subset].copy()
df_binary[binary_vars] = df_binary[binary_vars].apply(pd.to_numeric, errors='coerce')
df_binary

In [137]:
def calculate_stats_binary(df, subset_column, subset_value):
    subset_df = df[df[subset_column] == subset_value]
    summary_stats_data = []

    for column in subset_df.columns:
        if pd.api.types.is_numeric_dtype(subset_df[column]) and subset_df[column].nunique() <= 2:
            sum_ = subset_df[column].sum()
            percentage = (sum_ / len(subset_df)) * 100
            summary_stats_data.append({'Column': column, 'Sum': sum_, 'Percentage': percentage})

    summary_stats_df = pd.DataFrame(summary_stats_data)
    summary_stats_df['n (%%)'] = summary_stats_df.apply(lambda row: f"{row['Sum']} ({row['Percentage']:.2f}%)", axis=1)
    summary_stats_df.drop(['Sum', 'Percentage'], axis=1, inplace=True)

    return summary_stats_df

In [146]:
result_FGF = calculate_stats_binary(df_binary, 'genetics_conclusion', 'FGF')
result_Neg = calculate_stats_binary(df_binary, 'genetics_conclusion', 'Negative')
result_MSA = calculate_stats_binary(df_binary, 'genetics_conclusion', 'MSA')

result_FGF.to_csv('data/result_FGF_bin.csv')
result_Neg.to_csv('data/result_Neg_bin.csv')
result_MSA.to_csv('data/result_MSA_bin.csv')

### **Mann Whitney U**

In [None]:
def perform_mann_whitney(df, continuous_columns, condition_column, g_1, g_2 ):
    results = []
    
    for col in continuous_columns:
        try:
            group_1 = df[df[condition_column] == g_1][col].dropna()
            group_2 = df[df[condition_column] == g_2][col].dropna()
            _, p_value = mannwhitneyu(group_1, group_2, alternative='two-sided')
            results.append((col, p_value.round(4)))

        except Exception as e:
            print(f"Error occurred for column '{col}': {e}")
            continue

    results_df = pd.DataFrame(results, columns=['Column', 'P-Value'])
    return results_df



mann_whit_fgf_neg = perform_mann_whitney(df_continuous, continuous_vars, 'genetics_conclusion', 'FGF', 'Negative')
mann_whit_fgf_msa = perform_mann_whitney(df_continuous, continuous_vars, 'genetics_conclusion', 'FGF', 'MSA')

mann_whit_fgf_neg.to_csv('data/mann_whit_fgf_neg.csv')
mann_whit_fgf_msa.to_csv('data/mann_whit_fgf_msa.csv')

### **Fisher's Exact**

In [48]:
def perform_fishers_exact(df, binary_columns, condition_column, g_1, g_2):
    results = []
    
    filtered_df = df[(df[condition_column] == g_1) | (df[condition_column] == g_2)]

    for col in binary_columns:
        try:
            group_1 = filtered_df[filtered_df[condition_column] == g_1][col]
            group_2 = filtered_df[filtered_df[condition_column] == g_2][col]
            
            if not (group_1.sum() == 0 and group_2.sum() == 0):
                contingency_table = pd.crosstab(filtered_df[condition_column], filtered_df[col])
                odds_ratio, p_value = fisher_exact(contingency_table)
                p_value = round(p_value, 4)
                results.append((col, p_value))
        except Exception as e:
            print(f"Error occurred for column '{col}': {e}")
            continue

    results_df = pd.DataFrame(results, columns=['Column', 'P-Value'])
    
    return results_df


fisher_fgf_neg = perform_fishers_exact(df_binary, binary_vars, 'genetics_conclusion', 'FGF', 'Negative')
fisher_fgf_msa = perform_fishers_exact(df_binary, binary_vars, 'genetics_conclusion', 'MSA', 'Negative')

fisher_fgf_neg = fisher_fgf_neg.rename(columns={"P-Value": "P-Value_Neg"})
fisher_fgf_msa = fisher_fgf_msa.rename(columns={"P-Value": "P-Value_MSA"})

merged_df = pd.merge(fisher_fgf_neg, fisher_fgf_msa, on='Column', how='left')
merged_df
merged_df.to_csv('data/merged_df.csv')


Error occurred for column 'dx_gait_balance_impair': The input `table` must be of shape (2, 2).
