<h1 style='font-family: Dalton White; font-weight: 600; font-size: 20px; text-align: left'>1.0. Import Required Libraries</h1>

In [1]:
from scipy.stats import pearsonr
import scipy.stats as stats
import pandas as pd
import numpy as np

import warnings
# Silence every warning category for the rest of the session; this also hides
# deprecation and numerical warnings emitted by pandas/scipy in later cells.
warnings.simplefilter("ignore")
# Cap rendered DataFrames at 10 columns (wider frames are shown truncated).
pd.set_option('display.max_columns', 10)
# Render floats with two decimals; anything below 0.005 (e.g. a small p-value)
# is therefore displayed as 0.00 even though the stored value is unchanged.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Confirmation banner printed once the setup statements above have run.
print("\n----- Libraries Loaded Successfully -----\n")


----- Libraries Loaded Successfully -----



<h1 style='font-family: Dalton White; font-weight: 600; font-size: 20px; text-align: left'>2.0. Load and Inspect Dataset</h1>

In [2]:
# Relative path to the survey CSV (resolved against the current working directory).
FILEPATH = "./Datasets/Soil Survey Dataset 2025.csv"
# Read the CSV into the module-level frame `df` that the later cells operate on.
df = pd.read_csv(FILEPATH)
# Render the loaded frame as the cell output.
display(df)

Unnamed: 0,Site,Ph,Organic Matter %,Nitrogen %,Phosphorus Mg/Kg,Potassium Cmol(+)/Kg,Cec Cmol(+)/Kg,Bulk Density G/Cm3,Electrical Conductivity Ds/M,Yield T/Ha
0,Lowland,6.35,2.45,0.27,20.07,0.47,12.04,1.17,0.18,5.64
1,Lowland,6.44,2.23,0.22,4.31,0.30,6.17,1.24,0.22,4.65
2,Lowland,6.33,2.51,0.15,20.35,0.25,13.03,1.21,0.19,4.40
3,Lowland,6.35,1.97,0.11,5.49,0.35,2.61,1.11,0.21,3.53
4,Lowland,5.86,2.38,0.24,10.12,0.46,11.52,1.20,0.09,4.76
...,...,...,...,...,...,...,...,...,...,...
315,Hillside,6.76,2.30,0.11,6.38,0.28,11.70,1.33,0.25,3.64
316,Hillside,5.54,2.53,0.20,8.42,0.22,10.23,1.31,0.18,4.10
317,Hillside,6.16,1.82,0.18,16.55,0.19,8.71,1.29,0.14,4.20
318,Hillside,6.14,1.48,0.16,1.68,0.28,7.01,1.31,0.32,3.14


In [5]:
def column_summaries(df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to profile.
    top_n : int, default 10
        Maximum number of distinct values reported per column in
        ``distinct_values_counts``. Columns with at most ``top_n`` distinct
        values report all of them; others report the ``top_n`` most frequent.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with the fields ``col_name``,
        ``col_dtype``, ``num_of_nulls``, ``num_of_non_nulls``,
        ``num_of_distinct_values`` and ``distinct_values_counts`` (a dict
        mapping value -> occurrence count, most frequent first; missing
        values are not counted). A frame without columns yields an empty
        result that still carries these six columns.
    """
    summary_columns = [
        'col_name',
        'col_dtype',
        'num_of_nulls',
        'num_of_non_nulls',
        'num_of_distinct_values',
        'distinct_values_counts',
    ]

    summary_data = []
    for col_name in df.columns:
        column = df[col_name]
        summary_data.append({
            'col_name': col_name,
            'col_dtype': column.dtype,
            'num_of_nulls': column.isnull().sum(),
            'num_of_non_nulls': column.notnull().sum(),
            'num_of_distinct_values': column.nunique(),
            # value_counts() is already ordered by descending count, so
            # head(top_n) returns every value for low-cardinality columns and
            # the top_n most frequent ones otherwise -- no branch or re-sort
            # is needed.
            'distinct_values_counts': column.value_counts().head(top_n).to_dict(),
        })

    # Passing `columns` keeps the schema stable even when `df` has no columns.
    return pd.DataFrame(summary_data, columns=summary_columns)

if __name__ == "__main__":
    # Profile every column of the loaded dataset and render the summary table.
    summary_df = column_summaries(df)
    display(summary_df)

Unnamed: 0,col_name,col_dtype,num_of_nulls,num_of_non_nulls,num_of_distinct_values,distinct_values_counts
0,Site,object,0,320,4,"{'Lowland': 80, 'Upland': 80, 'Riverine': 80, ..."
1,Ph,float64,0,320,320,"{5.933228114814507: 1, 6.020721227425298: 1, 5..."
2,Organic Matter %,float64,0,320,320,"{1.709687739036262: 1, 1.017733123757243: 1, 2..."
3,Nitrogen %,float64,0,320,320,"{0.1338622988365825: 1, 0.1556925471573393: 1,..."
4,Phosphorus Mg/Kg,float64,0,320,317,"{0.0: 4, 5.49154959069914: 1, 10.1209017459529..."
5,Potassium Cmol(+)/Kg,float64,0,320,320,"{0.4650672467557676: 1, 0.3842455089419334: 1,..."
6,Cec Cmol(+)/Kg,float64,0,320,320,"{6.3439350334633176: 1, 4.446053838740792: 1, ..."
7,Bulk Density G/Cm3,float64,0,320,320,"{1.1795347491202997: 1, 1.3487566896868544: 1,..."
8,Electrical Conductivity Ds/M,float64,0,320,320,"{0.1490615324175119: 1, 0.1151092847315608: 1,..."
9,Yield T/Ha,float64,0,320,320,"{3.771501912265703: 1, 2.919546106533629: 1, 3..."


<h1 style='font-family: Dalton White; font-weight: 600; font-size: 20px; text-align: left'>2.1. Descriptive Statistics of the Dataset</h1>

In [4]:
if __name__ == "__main__":
    # Summary statistics for every column (numeric and non-numeric), transposed
    # so that each dataset column becomes one row of the rendered table.
    results = df.describe(include='all').transpose()
    display(results)

Unnamed: 0,count,unique,top,freq,mean,...,min,25%,50%,75%,max
Site,320.0,4.0,Lowland,80.0,,...,,,,,
Ph,320.0,,,,6.22,...,4.86,5.96,6.2,6.5,7.18
Organic Matter %,320.0,,,,2.36,...,0.54,1.91,2.3,2.83,4.46
Nitrogen %,320.0,,,,0.18,...,0.03,0.15,0.18,0.22,0.4
Phosphorus Mg/Kg,320.0,,,,12.7,...,0.0,8.23,12.35,17.03,27.49
Potassium Cmol(+)/Kg,320.0,,,,0.36,...,0.02,0.27,0.36,0.45,0.76
Cec Cmol(+)/Kg,320.0,,,,9.99,...,0.78,7.96,10.03,11.99,18.27
Bulk Density G/Cm3,320.0,,,,1.25,...,0.97,1.19,1.24,1.31,1.51
Electrical Conductivity Ds/M,320.0,,,,0.22,...,0.02,0.16,0.21,0.27,0.48
Yield T/Ha,320.0,,,,4.43,...,2.56,3.88,4.41,5.0,6.79


<h1 style='font-family: Dalton White; font-weight: 600; font-size: 20px; text-align: left'>3.0. Calculate Pearson Correlation Coefficients</h1>

In [7]:
def correlation_matrix(df, method="pearson", visualize=True):
    """Return the pairwise correlation matrix of the numeric columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; non-numeric columns are dropped before correlating.
    method : str, default "pearson"
        One of 'pearson', 'spearman' or 'kendall'.
    visualize : bool, default True
        Accepted for interface compatibility; the function body does not use it.

    Returns
    -------
    pd.DataFrame
        Square matrix indexed and labelled by the numeric column names.

    Raises
    ------
    ValueError
        If ``method`` is not one of the three supported names.
    """
    supported_methods = ('pearson', 'spearman', 'kendall')
    if method in supported_methods:
        numeric_only = df.select_dtypes(include=['number'])
        return numeric_only.corr(method=method)
    raise ValueError("Invalid method. Choose from 'pearson', 'spearman', or 'kendall'.")

if __name__ == "__main__":
    # Pearson correlation between every pair of numeric columns in the dataset.
    corr_matrix = correlation_matrix(df, method="pearson")
    display(corr_matrix)

Unnamed: 0,Ph,Organic Matter %,Nitrogen %,Phosphorus Mg/Kg,Potassium Cmol(+)/Kg,Cec Cmol(+)/Kg,Bulk Density G/Cm3,Electrical Conductivity Ds/M,Yield T/Ha
Ph,1.0,0.2,0.16,0.27,0.2,0.08,-0.21,0.21,0.5
Organic Matter %,0.2,1.0,0.2,0.34,0.19,0.07,-0.25,0.22,0.6
Nitrogen %,0.16,0.2,1.0,0.23,0.17,-0.0,-0.29,0.2,0.73
Phosphorus Mg/Kg,0.27,0.34,0.23,1.0,0.3,0.11,-0.25,0.24,0.51
Potassium Cmol(+)/Kg,0.2,0.19,0.17,0.3,1.0,0.07,-0.27,0.2,0.27
Cec Cmol(+)/Kg,0.08,0.07,-0.0,0.11,0.07,1.0,-0.05,-0.0,0.08
Bulk Density G/Cm3,-0.21,-0.25,-0.29,-0.25,-0.27,-0.05,1.0,-0.17,-0.43
Electrical Conductivity Ds/M,0.21,0.22,0.2,0.24,0.2,-0.0,-0.17,1.0,0.28
Yield T/Ha,0.5,0.6,0.73,0.51,0.27,0.08,-0.43,0.28,1.0


In [6]:
from scipy.stats import pearsonr

def compute_pearson_r(
    df: pd.DataFrame,
    numerical_columns: list,
    strong_threshold: float = 0.7,
    moderate_threshold: float = 0.3,
) -> pd.DataFrame:
    """Compute Pearson's r and its p-value for every unordered column pair.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns to correlate.
    numerical_columns : list
        Column names; each pair (earlier, later) in list order is tested once.
    strong_threshold : float, default 0.7
        ``|r|`` at or above this value is labelled "Strong".
    moderate_threshold : float, default 0.3
        ``|r|`` at or above this value (and below ``strong_threshold``) is
        labelled "Moderate"; anything smaller is "Weak".

    Returns
    -------
    pd.DataFrame
        One row per pair with the columns 'Variable 1', 'Variable 2',
        "Pearson's r", 'P-value', 'Direction' and 'Strength'. The columns are
        present even when fewer than two names are supplied (zero rows).

    Notes
    -----
    Rows where either member of a pair is missing are dropped for that pair
    only, matching the pairwise NA handling of ``DataFrame.corr``. When the
    coefficient cannot be computed (fewer than two complete rows, or one of
    the columns is constant) r and the p-value are NaN and both labels are
    "Undefined" rather than a misleading "No correlation" / "Weak".
    """
    result_columns = [
        'Variable 1', 'Variable 2',
        'Pearson\'s r', 'P-value',
        'Direction', 'Strength',
    ]
    results = []

    for i, col1 in enumerate(numerical_columns):
        for col2 in numerical_columns[i+1:]:
            # Keep only the observations where both variables are present.
            complete = df[col1].notna() & df[col2].notna()
            x = df.loc[complete, col1]
            y = df.loc[complete, col2]

            # Fewer than two distinct values means either too few rows or a
            # constant column; the coefficient is undefined in both cases.
            if x.nunique() < 2 or y.nunique() < 2:
                r_value, p_value = float("nan"), float("nan")
            else:
                r_value, p_value = pearsonr(x, y)

            if pd.isna(r_value):
                direction = strength = "Undefined"
            else:
                direction = ("Positive" if r_value > 0 else "Negative" if r_value < 0 else "No correlation")
                magnitude = abs(r_value)
                strength = (
                    "Strong" if magnitude >= strong_threshold
                    else "Moderate" if magnitude >= moderate_threshold
                    else "Weak"
                )

            results.append({
                'Variable 1': col1, 'Variable 2': col2,
                'Pearson\'s r': r_value, 'P-value': p_value,
                'Direction': direction, 'Strength': strength
            })

    # Passing `columns` keeps the schema stable when no pairs were produced.
    return pd.DataFrame(results, columns=result_columns)

if __name__ == "__main__":
    # Columns to correlate pairwise, in the order the result rows are emitted.
    # NOTE(review): 'Ph' is a numeric column of the dataset but is not listed
    # here -- confirm whether the omission is intentional.
    variables = [
        'Organic Matter %',
        'Nitrogen %',
        'Phosphorus Mg/Kg',
        'Potassium Cmol(+)/Kg',
        'Cec Cmol(+)/Kg',
        'Bulk Density G/Cm3',
        'Electrical Conductivity Ds/M',
        'Yield T/Ha',
    ]
    results = compute_pearson_r(df, numerical_columns=variables)
    display(results)


Unnamed: 0,Variable 1,Variable 2,Pearson's r,P-value,Direction,Strength
0,Organic Matter %,Nitrogen %,0.2,0.0,Positive,Weak
1,Organic Matter %,Phosphorus Mg/Kg,0.34,0.0,Positive,Moderate
2,Organic Matter %,Potassium Cmol(+)/Kg,0.19,0.0,Positive,Weak
3,Organic Matter %,Cec Cmol(+)/Kg,0.07,0.24,Positive,Weak
4,Organic Matter %,Bulk Density G/Cm3,-0.25,0.0,Negative,Weak
5,Organic Matter %,Electrical Conductivity Ds/M,0.22,0.0,Positive,Weak
6,Organic Matter %,Yield T/Ha,0.6,0.0,Positive,Moderate
7,Nitrogen %,Phosphorus Mg/Kg,0.23,0.0,Positive,Weak
8,Nitrogen %,Potassium Cmol(+)/Kg,0.17,0.0,Positive,Weak
9,Nitrogen %,Cec Cmol(+)/Kg,-0.0,0.99,Negative,Weak
