Code by Dr. Martin Ross(1), modified from previous versions that were written with the assistance of Cindy Liu(2) <br>
ChatGPT (GPT-4) was also used to fix some issues and improve a few things <br>
(1) Associate Professor, Earth and Environmental Sciences <br>
(2) Coop student, Physics & Astronomy <br>
University of Waterloo, Canada

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import math
import pandas as pd
from scipy.stats import norm

In [None]:
#The default structure is samples (rows) and weight frequencies (cols) from coarse (left) to fine (right)
#The grain size headers and grain size values are in millimetres (see note below for laser data in microns)
#A minimum of two samples/rows are expected
#Rename file and adjust the skiprows and column range as needed

#NOTES:
#For laser data in microns, there is an option below [cell 9] to adjust phival calculations

#For sieve data, the wt. freq. of the pan fraction in the input file should be extrapolated and distributed across the full range of silt and clay in a decreasing fashion
#This can be improved by analyzing the fine fraction with another method such as laser diffractometry

# Read the entire file into a list of lines
with open("Normal_grain_size_data.txt", 'r') as file:
    lines = file.readlines()

# Layout: the first row is not read, the second row holds the grain sizes and
# every following row is one sample (sample ID followed by its wt. frequencies).
# Blank lines (e.g. trailing newlines at the end of the file) are ignored so
# they cannot produce empty rows or a missing sample ID.
data_rows = [line.split() for line in lines[2:] if line.strip()]
if not data_rows:
    raise ValueError("No sample rows found after the two header rows")

# Number of whitespace-separated columns in the first sample row
# (this count INCLUDES the leading sample-ID column)
first_data_row = data_rows[0]
num_columns = len(first_data_row)

# Weight frequencies: everything after the sample ID, one list per sample
allFreq = [row[1:num_columns] for row in data_rows]

# Grain sizes come from the second row of the file
grainSizesData = lines[1].strip().split()[0:num_columns]

# Convert `allFreq` to a numpy array and transpose it
# (rows become grain-size classes, columns become samples)
allFreq = np.array(allFreq, dtype=float).transpose()

# Desired number of decimal places for rounding
decimal_places = 4

# Round grain size labels in mm
grainSizes = [round(float(size), decimal_places) for size in grainSizesData]

# Sample IDs are the first column of every sample row
allLabels = np.array([row[0] for row in data_rows], dtype='str')

#print("allFreq:\n", allFreq)
#print("grainSizes:", grainSizes)
#print("allLabels:", allLabels)


In [None]:
# Shape of the transposed frequency array: (grain-size classes, samples)
np.shape(allFreq)

In [None]:
# Grain size labels read from the second row of the file, rounded to decimal_places
print(grainSizes)

In [None]:
def cumulative(allSamples, start, end):
    """Return the running totals of the sample columns start..end-1.

    inputs: (2-D numpy array, int, int)

    Each column of `allSamples` in the half-open range [start, end) is
    accumulated from its first row to its last with np.cumsum.

    returns: list of 1-D arrays, one per selected column, in column order
    """
    return [np.cumsum(allSamples[:, column]) for column in range(start, end)]

def cumulativeCurve(allSamples, start, end, xSize, ySize):
    """Draw one cumulative wt. frequency figure per sample column in [start, end).

    inputs: (2-D numpy array, int, int, figure width, figure height)

    The x axis uses one evenly spaced position per entry of the module-level
    `grainSizes`, labelled with the module-level `phiVals`; titles use
    `allLabels`. Prints a message and returns early when the array is empty.
    """
    if allSamples.size == 0:
        print("The dataset is empty")
        return

    # Alternate the two line colours from one figure to the next
    palette = ('#465775', '#ef6f6c')

    for offset, curve in enumerate(cumulative(allSamples, start, end)):
        fig, ax = plt.subplots(1, 1, figsize=(xSize, ySize))
        ax.plot(range(len(grainSizes)), curve,
                color=palette[offset % 2], linewidth=2.5, marker='o')

        # One tick per size class, labelled in phi units
        ax.set_xticks(np.arange(0, len(phiVals)))
        ax.set_xticklabels(phiVals, fontsize=14)
        ax.set_ylim((-0.5, 105))

        ax.set_xlabel('phi', fontsize=14)
        ax.set_ylabel('Cumulative wt. Frequency %', fontsize=14)
        ax.tick_params(axis='y', labelsize=14)
        ax.set_title('Cumulative wt. Frequency of Sample ' + str(allLabels[start + offset]),
                     pad=20, fontsize=18)
        plt.show()


In [None]:
#This gives the option of presenting the cumulative curve on a normal probability scale (y axis)

def cumulativeCurveProb(allSamples, start, end, xSize, ySize):
    """Draw one cumulative curve per sample column in [start, end) on a probability scale.

    inputs: (2-D numpy array, int, int, figure width, figure height)

    Cumulative percentages are mapped through the inverse normal CDF
    (norm.ppf) and plotted on a linear axis whose ticks are placed at the
    transformed positions of fixed percentiles.
    """
    curves = cumulative(allSamples, start, end)

    # Fixed percentile ticks, positioned at their normal quantiles
    fixed_percentiles = np.array([0.1, 1, 5, 10, 25, 50, 75, 90, 95, 99])
    y_ticks = norm.ppf(fixed_percentiles / 100.0)
    y_tick_labels = [f'{percentile:.1f}%' for percentile in fixed_percentiles]

    # Fixed axis limits: quantiles of 0.01% and 99.99%
    y_min, y_max = norm.ppf(0.0001), norm.ppf(0.9999)

    for offset, curve in enumerate(curves):
        colour = '#465775' if offset % 2 == 0 else '#ef6f6c'

        fig, ax = plt.subplots(1, 1, figsize=(xSize, ySize))

        # Percent -> fraction -> normal quantile
        prob_scale_values = norm.ppf(np.array(curve) / 100.0)

        ax.plot(range(len(grainSizes)), prob_scale_values,
                color=colour, linewidth=2.5, marker='o')
        ax.set_xticks(np.arange(0, len(phiVals)))
        ax.set_xticklabels(phiVals, fontsize=14)

        # The axis itself stays linear; the data were already transformed
        ax.set_yscale('linear')
        ax.set_yticks(y_ticks)
        ax.set_yticklabels(y_tick_labels)
        ax.set_ylim(y_min, y_max)

        ax.set_xlabel('phi', fontsize=14)
        ax.set_ylabel('Cumulative Probability (%)', fontsize=14)
        ax.tick_params(axis='y', labelsize=14)
        ax.set_title('Cumulative Probability of Sample ' + str(allLabels[start + offset]),
                     pad=20, fontsize=18)
        ax.grid(True, which='both', axis='both', linestyle='--', linewidth=0.5)
        plt.show()

In [None]:
def cumulativeCurve2(allSamples, start, end, xSize, ySize):
    """Overlay the cumulative curves of sample columns [start, end) on one figure.

    inputs: (2-D numpy array, int, int, figure width, figure height)

    Every requested sample is drawn. The first five curves use the five
    predefined colour / line style / marker combinations; for further curves
    the colours and markers repeat while the line style is shifted by one on
    each pass, so curves beyond the fifth are still drawn and distinguishable
    instead of being silently dropped.
    """
    cumulativeVals = cumulative(allSamples, start, end)

    # Define a list of colors and line styles
    colors = ['#A0522D', '#708090', '#2E8B57', '#000000', '#FFA500']
    line_styles = ['-', '--', '-.', ':', (0, (3, 5, 1, 5))]
    markers = ['.', 'o', 'x', '+', '^']
    nStyles = len(colors)

    # Create a single figure and axis
    fig, ax = plt.subplots(1, 1, figsize=(xSize, ySize))

    for i, curve in enumerate(cumulativeVals):
        # i // nStyles is 0 for the first pass, so curves 0-4 keep their
        # original look; later passes pair each colour with a new line style
        ax.plot(range(len(grainSizes)), curve,
                color=colors[i % nStyles],
                linestyle=line_styles[(i + i // nStyles) % nStyles],
                linewidth=2.5,
                marker=markers[i % nStyles],
                markersize=8,  # Adjust marker size here
                label=f'{allLabels[start+i]}')

    plt.xticks(np.arange(0, len(phiVals)), labels=phiVals, fontsize=14)
    plt.ylim((-0.5, 105))
    ax.set_xlabel('phi', fontsize=14)
    ax.set_ylabel('Cumulative wt. Frequency %', fontsize=14)
    ax.tick_params(axis='y', labelsize=14)
    ax.set_title('Cumulative wt. Frequency of Selected Samples',
                 pad=20, fontsize=18)
    ax.legend()  # Show legend with sample labels

    plt.show()


In [None]:
# calculating phi values: phi = -log2(grain size), rounded to 2 decimals
grainSizesFloat = [float(size) for size in grainSizes]

phiVals = []

for size in grainSizesFloat:
    phi = -float(round(math.log(size, 2), 2))
    # Negating a zero gives -0.0; store it as plain 0.0 so labels read "0.0"
    phiVals.append(0.0 if phi == -0.0 else phi)

#Use this one for laser data in microns
#for size in grainSizesFloat:
    #phi = -float(round(math.log(size/1000, 2), 2))
    #phiVals.append(0.0 if phi == -0.0 else phi)

print(phiVals)

In [None]:
# functions for verbal representations of skew and standard deviation
def verbalSkew(skewVal):
    """Return the verbal skewness class for a numeric skewness value.

    inputs: (float)

    Class limits: > 0.3 strongly fine, >= 0.1 fine, >= -0.1 near symmetrical,
    >= -0.3 coarse, otherwise strongly coarse.

    A NaN skewness would fail every comparison and fall through to the last
    class, so it is reported as "Undefined" instead.
    """
    if math.isnan(skewVal):
        return "Undefined"

    if skewVal > 0.3:
        skewText = "Strongly fine-skewed"
    elif skewVal >= 0.1:
        skewText = "Fine skewed"
    elif skewVal >= -0.1:
        skewText = "Near symmetrical"
    elif skewVal >= -0.3:
        skewText = "Coarse skewed"
    else:
        skewText = "Strongly coarse skewed"
    return skewText

def verbalStdev(stdev):
    """Return the verbal sorting class for a numeric standard deviation (phi).

    inputs: (float)

    Values strictly above 4.00 are "Extremely poorly sorted"; below that the
    first class whose lower limit is reached (inclusive) is returned, and
    anything under 0.35 is "Very well sorted".
    """
    # The top class uses a strict comparison, unlike the inclusive limits below
    if stdev > 4.00:
        return "Extremely poorly sorted"

    # (inclusive lower limit, label), from coarsest to finest sorting class
    sortingClasses = (
        (2.0, "Very poorly sorted"),
        (1.00, "Poorly sorted"),
        (0.71, "Moderately sorted"),
        (0.50, "Moderately well sorted"),
        (0.35, "Well sorted"),
    )
    for lowerLimit, label in sortingClasses:
        if stdev >= lowerLimit:
            return label

    return "Very well sorted"

In [None]:
# function for skewness, mean, and stdev
def skewMeanStdev(cumulativeVals):
    """Compute graphic skewness, mean and standard deviation for each curve.

    inputs: (list of cumulative-percentage sequences)

    For every curve the phi values at the 5th, 16th, 50th, 84th and 95th
    percentiles are linearly interpolated (np.interp) against the
    module-level `phiVals`, then combined into the three statistics.

    returns: [skews, means, stdevs], three lists in input order
    """
    skews, means, stdevs = [], [], []

    for curve in cumulativeVals:
        p5, p16, p50, p84, p95 = (
            np.interp(percentile, curve, phiVals)
            for percentile in (5, 16, 50, 84, 95)
        )

        # Central (16-84) and tail (5-95) terms of the skewness
        centralTerm = (p84 + p16 - (2 * p50)) / (2 * (p84 - p16))
        tailTerm = (p95 + p5 - (2 * p50)) / (2 * (p95 - p5))

        skews.append(centralTerm + tailTerm)
        means.append((p16 + p50 + p84) / 3)
        stdevs.append(((p84 - p16) / 4) + ((p95 - p5) / 6.6))

    return [skews, means, stdevs]

In [None]:
def createDataFrame(allSamples, allLabels, start, end):
    """Build a summary table of statistics for sample columns [start, end).

    inputs: (2-D numpy array, sequence of sample IDs, int, int)

    returns: DataFrame with columns 'Sample ID', 'Mean', 'Skewness',
    'Standard Deviation', 'Verbal Skewness' and 'Verbal Stdev'
    """
    skews, means, stdevs = skewMeanStdev(cumulative(allSamples, start, end))

    numericColumns = {
        'Sample ID': allLabels[start:end],
        'Mean': means,
        'Skewness': skews,
        'Standard Deviation': stdevs,
    }
    df = pd.DataFrame(numericColumns)

    # Text classes derived from the numeric columns
    df['Verbal Skewness'] = df['Skewness'].apply(verbalSkew)
    df['Verbal Stdev'] = df['Standard Deviation'].apply(verbalStdev)

    return df

In [None]:
# Summary statistics table covering every sample column of allFreq
createDataFrame(allFreq,allLabels, 0, allFreq.shape[1])

In [None]:
# function to generate weight frequency distribution graphs 
def wtHist(allSamples, start, end, xSize, ySize):
    '''
    Draw one weight frequency bar chart per sample column in [start, end).

    inputs: (2-D numpy array, int, int, figure width, figure height)

    start <= end; columns start..end-1 are plotted, one figure each.
    Tick labels come from the module-level `grainSizes` and titles from
    the module-level `allLabels`.
    '''
    for sampleCount in range(start, end):
        frequencies = allSamples[:, sampleCount]

        # Alternate bar colours by absolute sample index
        if (sampleCount % 2) == 0:
            colour = '#4464ad'
        else:
            colour = '#a4b0f5'

        positions = np.arange(len(frequencies))
        fig, ax = plt.subplots(1, 1, figsize=(xSize, ySize))
        ax.bar(positions, frequencies, color=colour, alpha=0.8)
        # Overlay the values as unconnected markers
        ax.plot(positions, frequencies, color='#f58f29',
                marker='o', linewidth=0)
        plt.xticks(np.arange(0, len(grainSizes)) + 0.5, labels=grainSizes)
        ax.set_xlabel('Grain Size (mm)', fontsize=14)
        ax.set_ylabel('Frequency (wt. %)', fontsize=14)
        ax.tick_params(axis='y', labelsize=14)
        ax.set_title('Weight Frequency Distribution of Sample '
                     + str(allLabels[sampleCount]), pad=20, fontsize=18)
        plt.show()

In [None]:
# weight frequency distribution of the first two samples (column indexes 0 and 1);
# change the start/end arguments to plot a different range
wtHist(allFreq,0,2, 14,8)


In [None]:
# Call the function to create and display one figure per sample (figures are shown, not saved to a file)
cumulativeCurve(allFreq, 0, 2, 10, 7) #cumulative curves of the first two samples (column indexes 0 and 1)

In [None]:
# Call the function to create and display one probability-scale figure per sample (figures are shown, not saved to a file)
cumulativeCurveProb(allFreq, 0, 2, 10, 10) #probability-scale cumulative curves of the first two samples (column indexes 0 and 1)

In [None]:
# Call the function to create and display a single figure overlaying the selected samples (the figure is shown, not saved to a file)
cumulativeCurve2(allFreq, 0, 2, 10, 7) #overlaid cumulative curves of the first two samples (column indexes 0 and 1)