<h2 style='font-family: Agency FB; font-weight: 600; font-size: 20px; text-align: left'>1.0. Import Required Libraries</h2>

In [26]:
from scipy.stats import  skew, kurtosis 
from scipy.stats import kruskal
import scipy.stats as stats  
import pandas as pd  
import numpy as np 
import math  

import warnings  
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by pandas and
# scipy — consider narrowing the filter to specific categories.
warnings.simplefilter("ignore")  
# Render at most 10 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 10) 
# Render floats with two decimal places (global pandas display setting).
pd.set_option('display.float_format', lambda x: '%.2f' % x)

<h2 style='font-family: Agency FB; font-weight: 600; font-size: 20px; text-align: left'>2.0. Load and Inspect Dataset</h2>

In [19]:
# Relative path to the CSV file that is loaded below.
# NOTE(review): "Fresistance" looks like a typo for "Resistance" — confirm against
# the actual file name on disk before renaming anything.
DATA_PATH = "../Datasets/Eggplant Fusarium Fresistance Data.csv"
# Read the CSV into the frame used by every later cell.
df = pd.read_csv(DATA_PATH)
# Rich display of the whole frame; pandas elides the middle rows in the rendering.
display(df)

Unnamed: 0,Variety,Resistance Level,Replication ID,Infection Severity (%),Wilt index,Plant height (cm),Days to wilt symptoms,Survival rate (%),Disease incidence (%)
0,EP-R1,Resistant,1,22.50,0.70,88.90,21,88.80,23.40
1,EP-R1,Resistant,2,27.90,1.20,82.20,19,87.70,21.70
2,EP-R1,Resistant,3,21.20,0.00,74.70,17,84.90,27.20
3,EP-R1,Resistant,4,15.50,0.10,93.80,18,90.30,15.00
4,EP-R1,Resistant,5,17.30,0.90,78.10,19,87.00,23.00
...,...,...,...,...,...,...,...,...,...
795,EP-S3,Susceptible,96,75.20,3.60,68.20,7,6.40,85.50
796,EP-S3,Susceptible,97,74.80,4.90,59.50,4,27.20,82.00
797,EP-S3,Susceptible,98,58.10,3.60,78.80,7,30.80,75.40
798,EP-S3,Susceptible,99,54.10,4.10,63.70,7,24.10,81.80


<h2 style='font-family: Agency FB; font-size: 20px; font-weight: 600'>3.0: Dataset Information Overview</h2>

In [20]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(800, 9)

In [21]:
# Print every column name on its own line, prefixed by a 15-dash marker.
for column in df.columns:
    print(f"{'-'*15} {column}")

--------------- Variety
--------------- Resistance Level
--------------- Replication ID
--------------- Infection Severity (%)
--------------- Wilt index
--------------- Plant height (cm)
--------------- Days to wilt symptoms
--------------- Survival rate (%)
--------------- Disease incidence (%)


In [22]:
def column_summary(df):
    """Profile every column of ``df`` and return one summary row per column.

    Each row holds the column name, its dtype, the number of null and non-null
    entries, the number of distinct (non-null) values, and a ``{value: count}``
    dict. The dict lists all values when the column has at most 10 distinct
    values, otherwise only the 10 most frequent ones, most frequent first.
    """
    def _profile(col_name):
        series = df[col_name]
        n_distinct = series.nunique()

        # value_counts() is already ordered by descending count, so truncating
        # it is enough to obtain the "top 10" for high-cardinality columns.
        counts = series.value_counts()
        if n_distinct > 10:
            counts = counts.head(10)

        return {
            'col_name': col_name,
            'col_dtype': series.dtype,
            'num_of_nulls': series.isnull().sum(),
            'num_of_non_nulls': series.notnull().sum(),
            'num_of_distinct_values': n_distinct,
            'distinct_values_counts': counts.to_dict(),
        }

    return pd.DataFrame([_profile(name) for name in df.columns])

# Profile every column of the loaded frame and render the overview table.
summary_df = column_summary(df)
display(summary_df)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,Variety,object,0,800,8,"{'EP-R1': 100, 'EP-R2': 100, 'EP-R3': 100, 'EP..."
1,Resistance Level,object,0,800,3,"{'Resistant': 300, 'Susceptible': 300, 'Modera..."
2,Replication ID,int64,0,800,100,"{1: 8, 2: 8, 3: 8, 4: 8, 5: 8, 6: 8, 7: 8, 8: ..."
3,Infection Severity (%),float64,0,800,473,"{21.4: 6, 22.6: 6, 21.2: 6, 19.1: 6, 21.3: 5, ..."
4,Wilt index,float64,0,800,51,"{5.0: 31, 4.3: 26, 4.0: 25, 0.5: 24, 0.9: 24, ..."
5,Plant height (cm),float64,0,800,345,"{72.0: 9, 85.6: 6, 72.8: 6, 85.1: 6, 79.1: 6, ..."
6,Days to wilt symptoms,int64,0,800,20,"{7: 115, 8: 66, 6: 64, 19: 60, 12: 58, 18: 53,..."
7,Survival rate (%),float64,0,800,488,"{100.0: 7, 90.4: 6, 85.0: 6, 86.9: 5, 29.0: 5,..."
8,Disease incidence (%),float64,0,800,492,"{82.7: 6, 78.9: 5, 84.0: 5, 24.0: 4, 83.5: 4, ..."


<h2 style='font-family: Agency FB; font-size: 20px;  font-weight: 600'>4.0: Grouped Summary Statistics</h2>

In [23]:
def summary_statistics(df, group_col=None):
    """Compute descriptive statistics for every numeric column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    group_col : optional
        Column passed to ``df.groupby``. When falsy (the default), the
        statistics are computed once over the whole frame and the ``Group``
        value is ``'Overall'``.

    Returns
    -------
    pd.DataFrame
        One row per (group, numeric column) with the columns ``Group``,
        ``Parameter``, ``Mean``, ``Median``, ``Mode``, ``Std. Deviation``,
        ``Variance``, ``Range``, ``Skewness`` and ``Kurtosis``. These columns
        are present even when no rows are produced (e.g. empty input or no
        numeric columns), so downstream code can rely on the schema.
    """
    output_columns = [
        'Group', 'Parameter', 'Mean', 'Median', 'Mode', 'Std. Deviation',
        'Variance', 'Range', 'Skewness', 'Kurtosis',
    ]

    if group_col:
        grouped = df.groupby(group_col)
    else:
        # Mimic the (name, frame) pairs yielded by groupby for the ungrouped case.
        grouped = [(None, df)]

    results = []
    for group_name, group_df in grouped:
        for col in group_df.select_dtypes(include=[np.number]).columns:
            series = group_df[col]
            # Compute the mode once; it can be empty (e.g. all-NaN column).
            modes = series.mode()

            results.append({
                'Group': group_name if group_col else 'Overall',
                'Parameter': col,
                'Mean': series.mean(),
                'Median': series.median(),
                # With several modes, the first one returned by mode() is reported.
                'Mode': modes.iloc[0] if not modes.empty else np.nan,
                'Std. Deviation': series.std(),
                'Variance': series.var(),
                'Range': series.max() - series.min(),
                'Skewness': skew(series, nan_policy='omit'),
                # scipy's default is Fisher's definition (normal distribution -> 0).
                'Kurtosis': kurtosis(series, nan_policy='omit'),
            })

    # Passing the column list keeps the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=output_columns)

# Descriptive statistics per variety; only the first 10 rows are rendered.
# NOTE(review): 'Replication ID' is numeric, so it is summarised as well; it looks
# like an identifier rather than a measurement — confirm whether it belongs here.
results = summary_statistics(df, group_col='Variety')
display(results.head(10))

Unnamed: 0,Group,Parameter,Mean,Median,Mode,Std. Deviation,Variance,Range,Skewness,Kurtosis
0,EP-M1,Replication ID,50.5,50.5,1.0,29.01,841.67,99.0,0.0,-1.2
1,EP-M1,Infection Severity (%),44.16,44.75,53.9,10.58,111.87,45.8,-0.36,-0.42
2,EP-M1,Wilt index,2.56,2.55,2.5,0.72,0.52,3.8,0.44,0.57
3,EP-M1,Plant height (cm),74.44,74.2,78.4,5.06,25.56,28.3,-0.05,0.4
4,EP-M1,Days to wilt symptoms,11.99,12.0,12.0,1.45,2.11,7.0,-0.04,-0.3
5,EP-M1,Survival rate (%),54.57,54.3,33.5,9.98,99.63,42.5,0.08,-0.53
6,EP-M1,Disease incidence (%),50.54,51.05,53.6,7.93,62.82,35.2,-0.25,-0.45
7,EP-M2,Replication ID,50.5,50.5,1.0,29.01,841.67,99.0,0.0,-1.2
8,EP-M2,Infection Severity (%),45.44,45.15,48.9,10.08,101.53,52.6,0.09,0.07
9,EP-M2,Wilt index,2.51,2.5,2.5,0.7,0.49,4.2,-0.09,0.43


In [24]:
def grouped_summary_stats(df: pd.DataFrame, group: str):
    """Tabulate ``mean ± SEM`` per group for every numeric column.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    group : str
        Name of the column to group by. It may itself be numeric; it is never
        treated as a metric.

    Returns
    -------
    pd.DataFrame
        First column is ``group``; the remaining columns are the numeric
        metrics. There is one row per group whose cells are strings formatted
        as ``"<mean> ± <sem>"`` (two decimals), followed by three rows labelled
        ``'Grand Mean'``, ``'SEM'`` and ``'%CV'`` holding the unformatted
        overall mean, standard error and coefficient of variation (percent).
    """
    # Leave the grouping column out of the metrics: a numeric grouping column
    # would otherwise be selected here and break the per-metric computations.
    metrics = [
        col for col in df.select_dtypes(include=np.number).columns if col != group
    ]

    # Overall (ungrouped) statistics for the three footer rows.
    grand_mean = df[metrics].mean()
    sem = df[metrics].sem()
    cv = df[metrics].std() / grand_mean * 100

    # Per-group mean and SEM; columns are a (metric, statistic) MultiIndex.
    per_group = df.groupby(group)[metrics].agg(['mean', 'sem'])

    two_decimals = '{:.2f}'.format
    formatted = {
        col: per_group[(col, 'mean')].map(two_decimals)
        + ' ± '
        + per_group[(col, 'sem')].map(two_decimals)
        for col in metrics
    }
    # The group labels live in the index; reset_index turns them into the
    # first column, named after `group`.
    summary_df = pd.DataFrame(formatted, index=per_group.index).reset_index()

    # Append the footer rows after the per-group rows.
    summary_df.loc[len(summary_df)] = ['Grand Mean'] + grand_mean.tolist()
    summary_df.loc[len(summary_df)] = ['SEM'] + sem.tolist()
    summary_df.loc[len(summary_df)] = ['%CV'] + cv.tolist()

    return summary_df

# Mean ± SEM per variety, followed by the overall Grand Mean / SEM / %CV rows.
results = grouped_summary_stats(df, group='Variety')
# Bare last expression so the notebook renders the table.
results

Unnamed: 0,Variety,Replication ID,Infection Severity (%),Wilt index,Plant height (cm),Days to wilt symptoms,Survival rate (%),Disease incidence (%)
0,EP-M1,50.50 ± 2.90,44.16 ± 1.06,2.56 ± 0.07,74.44 ± 0.51,11.99 ± 0.15,54.57 ± 1.00,50.54 ± 0.79
1,EP-M2,50.50 ± 2.90,45.44 ± 1.01,2.51 ± 0.07,75.18 ± 0.44,11.85 ± 0.16,55.77 ± 1.00,51.75 ± 1.02
2,EP-R1,50.50 ± 2.90,20.69 ± 0.43,0.73 ± 0.05,84.80 ± 0.59,17.97 ± 0.19,89.25 ± 0.52,25.84 ± 0.72
3,EP-R2,50.50 ± 2.90,20.81 ± 0.46,0.83 ± 0.05,85.33 ± 0.60,17.98 ± 0.20,90.49 ± 0.46,26.17 ± 0.78
4,EP-R3,50.50 ± 2.90,20.89 ± 0.42,0.82 ± 0.05,84.84 ± 0.58,18.51 ± 0.20,89.61 ± 0.47,25.25 ± 0.70
5,EP-S1,50.50 ± 2.90,75.26 ± 1.08,4.20 ± 0.06,65.22 ± 0.50,6.82 ± 0.12,24.14 ± 0.80,81.24 ± 0.75
6,EP-S2,50.50 ± 2.90,73.99 ± 0.91,4.11 ± 0.05,65.47 ± 0.54,6.91 ± 0.10,24.66 ± 0.89,79.57 ± 0.65
7,EP-S3,50.50 ± 2.90,73.91 ± 0.89,4.18 ± 0.06,64.71 ± 0.53,6.76 ± 0.12,24.44 ± 0.75,80.79 ± 0.61
8,Grand Mean,50.50,46.89,2.49,75.00,12.35,56.62,52.64
9,SEM,1.02,0.87,0.06,0.36,0.18,1.04,0.88


<h1 style='font-family: Agency FB; font-size: 20px; font-weight: 600'>5.0: Kruskal-Wallis Test</h1>

In [25]:
def kruskall_wallis(df, group_columns: str, numerical_columns: list = None):
    """Run a Kruskal-Wallis H-test for every (grouping column, variable) pair.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    group_columns : str or iterable of str
        Grouping column name(s). A single name may be passed as a plain
        string; it is treated as one column rather than iterated character by
        character.
    numerical_columns : str or iterable of str, optional
        Variable(s) to test. When ``None``, every numeric column of ``df`` that
        is not one of ``group_columns`` is used.

    Returns
    -------
    pd.DataFrame
        One row per pair with the columns ``Group``, ``Variables``,
        ``Kruskal-Wallis Statistic``, ``P-value`` and ``Significant (α<0.05)``
        (``'Significant'`` when the p-value is below 0.05, otherwise
        ``'Not Significant'``).

    Raises
    ------
    Whatever ``scipy.stats.kruskal`` raises for unusable samples is propagated
    unchanged.
    """
    # Normalise a lone column name to a one-element list.
    if isinstance(group_columns, str):
        group_columns = [group_columns]
    else:
        group_columns = list(group_columns)

    if numerical_columns is None:
        numerical_columns = [
            col
            for col in df.select_dtypes(include=[np.number]).columns
            if col not in group_columns
        ]
    elif isinstance(numerical_columns, str):
        numerical_columns = [numerical_columns]

    results = []
    for group_column in group_columns:
        for column in numerical_columns:
            # One sample per level of the grouping column, NaNs removed.
            samples = [
                members[column].dropna().values
                for _, members in df.groupby(group_column)
            ]
            # Named `statistic` so the file's `scipy.stats as stats` alias is
            # not shadowed inside this function.
            statistic, p_value = kruskal(*samples)
            interpretation = 'Significant' if p_value < 0.05 else 'Not Significant'
            results.append({
                'Group': group_column,
                'Variables': column,
                'Kruskal-Wallis Statistic': statistic,
                'P-value': p_value,
                'Significant (α<0.05)': interpretation
            })
    return pd.DataFrame(results)

# NOTE(review): inside a notebook cell __name__ is "__main__", so this guard only
# has an effect if the code is exported to a module and imported.
if __name__ == "__main__":
    group_columns = ['Variety', 'Resistance Level']                     # Categorical factors to test against
    # NOTE(review): this selection includes 'Replication ID', which looks like an
    # identifier rather than a measurement — confirm it should be tested.
    numerical_columns = df.select_dtypes(include=[np.number]).columns   # Every numeric column of df (a pandas Index)
    results = kruskall_wallis(df, group_columns, numerical_columns)     # One Kruskal-Wallis test per (factor, variable) pair
    # Global pandas setting: it replaces the 2-decimal format set at the top of the
    # notebook and stays in effect for anything rendered afterwards.
    pd.set_option('display.float_format', lambda x: '%.4f' % x)         # Show floats with 4 decimals
    display(results)

Unnamed: 0,Group,Variables,Kruskal-Wallis Statistic,P-value,Significant (α<0.05)
0,Variety,Replication ID,0.0,1.0,Not Significant
1,Variety,Infection Severity (%),682.4852,0.0,Significant
2,Variety,Wilt index,669.3773,0.0,Significant
3,Variety,Plant height (cm),597.986,0.0,Significant
4,Variety,Days to wilt symptoms,701.1451,0.0,Significant
5,Variety,Survival rate (%),698.1517,0.0,Significant
6,Variety,Disease incidence (%),692.2784,0.0,Significant
7,Resistance Level,Replication ID,0.0,1.0,Not Significant
8,Resistance Level,Infection Severity (%),682.1712,0.0,Significant
9,Resistance Level,Wilt index,668.7051,0.0,Significant


---

This analysis was performed by **Jabulente**, a passionate and dedicated data analyst with a strong commitment to using data to drive meaningful insights and solutions. For inquiries, collaborations, or further discussions, please feel free to reach out via the links below.  

----

<div align="center">  
    
[![GitHub](https://img.shields.io/badge/GitHub-Jabulente-black?logo=github)](https://github.com/Jabulente)  [![LinkedIn](https://img.shields.io/badge/LinkedIn-Jabulente-blue?logo=linkedin)](https://linkedin.com/in/jabulente-208019349)  [![Email](https://img.shields.io/badge/Email-jabulente@hotmail.com-red?logo=gmail)](mailto:Jabulente@hotmail.com)  

</div>


<h1 style='font-size: 20px; color: red; font-family: French Script MT; font-weight: 700; text-align: center'>Data to Drive Meaningful Insights and Solutions</h1>