In [31]:
import torch
import random
import os
import pandas as pd
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import balanced_accuracy_score
import numpy as np
def wavelengthFilter(string):
    '''
    This function removes the um suffix from the wavelength data.

    Inputs
    ------
    string: A string to remove um from.

    Returns
    -------
    string: A float
    '''
    string=string.removesuffix(" um")
    return float(string)
def oneHotEncoding(combination):
    '''
    This function will turn molecule combinations into a one-hot encoded vector

    Inputs
    ------
    combination: Tuple containing the abundances of the molecules in this order: "O2","N2","H2","CO2","H2O","CH4","NH3"

    Returns
    -------
    vector: One-hot encoded vector of 1's and 0's
    '''

    #The order of the molecules are: "O2","N2","H2","CO2","H2O","CH4","NH3"
    vector=[0.]*7
    for i,abundance in enumerate(combination):
        #At what point should a molecule be considered present? I don't know need to think about that
        if abundance>0.001:
            vector[i]=1.0
    return torch.tensor(vector) 
def getLabel(filePath,specialMolecules=False):
    configFolder=r"C:\Users\Tristan\Downloads\HyPCAR3\configFiles"
    # configFolder="/home/tristanb/scratch/configFiles"
    filePath=filePath.removesuffix(".csv")
    configFilePath=os.path.join(configFolder,filePath)
    configFilePath+=".txt"

    lines=[]
    with open(configFilePath) as f:
        for line in f:
            lines.append(line)


    abundances=lines[54]
    abundances=abundances.removeprefix("<ATMOSPHERE-LAYER-1>")
    abundances=abundances.split(",")
    
    if not specialMolecules:
        abundances=list(map(float,abundances[2:]))#Remove temperature profile information
        label=oneHotEncoding(abundances)
        return label
    else:
        abundances=list(map(float,abundances[2:9]))#Only gets target values, not background moolecules or 
        label=oneHotEncoding(abundances)
        return label

class customDataset(Dataset):
    def __init__(self,samples):#samples contain a listt of all file paths
        self.samples=samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self,index):
        filePath,label=self.samples[index]



        #Extract data from file
        data=pd.read_csv(filePath)
        wavelength=list(map(wavelengthFilter,data.iloc[:,0]))#Removes um from wavelength data
        transmittance=list(data.iloc[:,1])
        # if len(wavelength)!=784:
        #     print(filePath)
        # print(len(wavelength))
        combinedData=torch.tensor(list(zip(wavelength, transmittance)), dtype=torch.float32)
        # if torch.isnan(combinedData).any():
        #     print(filePath)
        return combinedData,label
class detectionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1=nn.Conv1d(in_channels=2, out_channels=128, kernel_size=5, stride=2)
        self.bn1=nn.BatchNorm1d(128)
        self.pool1=nn.MaxPool1d(2)

        self.conv2=nn.Conv1d(in_channels=128,out_channels=128,kernel_size=5,stride=2)
        self.bn2=nn.BatchNorm1d(128)
        self.pool2=nn.MaxPool1d(2)

        self.conv3=nn.Conv1d(in_channels=128,out_channels=64,kernel_size=3,stride=2)
        self.bn3=nn.BatchNorm1d(64)
        self.pool3=nn.MaxPool1d(2)

        self.conv4=nn.Conv1d(in_channels=64,out_channels=32,kernel_size=2,stride=2)
        self.bn4=nn.BatchNorm1d(32)
        self.pool4=nn.MaxPool1d(2)

        self.dropout1=nn.Dropout(0.4)

        self.flatten=nn.Flatten()

        self.fc1=nn.Linear(64,128)
        self.dropout2=nn.Dropout(0.75)
        self.fc2=nn.Linear(128,64)
        self.fc3=nn.Linear(64,7)#7 molecule present

    def forward(self,x):
        # Permute dimensions to [batch_size, channels, sequence_length]
        x=x.permute(0, 2, 1)
        x=F.relu(self.bn1(self.conv1(x)))
        x=self.pool1(x)

        x=F.relu(self.bn2(self.conv2(x)))
        x=self.pool2(x)

        x=F.relu(self.bn3(self.conv3(x)))
        x=self.pool3(x)

        x=F.relu(self.bn4(self.conv4(x)))
        x=self.pool4(x)

        x=self.dropout1(x)

        x=torch.flatten(x, 1)

        x=F.relu(self.fc1(x))
        x=self.dropout2(x)
        
        x=F.relu(self.fc2(x))
        x=torch.sigmoid(self.fc3(x))
        
        return x

random.seed(42)


testingData=[]


allSamples=[]
allLabels=[]
#

testSplit=0.15
for atmosphereType in ["A","B","C","None"]:
    print(atmosphereType)
    curFolderPath=r"C:\Users\Tristan\Downloads\HyPCAR3\data"
    curFolderPath+="\\"+atmosphereType
    files=[]
    for path in os.listdir(curFolderPath):

        if atmosphereType=="None":
            #Gett labels in a special way
            label=getLabel(path,True)
        else:

            label=getLabel(path)
        files.append((os.path.join(curFolderPath,path),label))
        

    random.shuffle(files)

    testingSamples=[]
    for i,data in enumerate(files):
        path,label=data[0],data[1]
        if i<(len(files)*testSplit):#Adds testing data
            testingSamples.append((path,label))
        else:
            break
            #Don't need more
    testingData.extend(testingSamples)

print("Data loaded")


random.shuffle(testingData)
testingDataset=customDataset(testingData)
testingDataloader=DataLoader(testingDataset,batch_size=32,shuffle=True)#Testing data loader

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
detect=detectionModel()
detect=detect.to(device)
detect.load_state_dict(torch.load(r"C:\Users\Tristan\Downloads\HyPCAR3\detectionModel.pt",weights_only=True))

allPred=[]
allLabel=[]
for batch in testingDataloader:
    data,labels=batch
    data,labels=data.to(device),labels.to(device)

    with torch.no_grad():
        outputs=detect(data)
    preds = (outputs > 0.5)
    preds=preds.to(device)
    allLabel.append(labels)
    allPred.append(preds)

allLabel=np.concatenate([a.cpu().numpy() if hasattr(a, "cpu") else a for a in allLabel],axis=0)
allPred=np.concatenate([a.cpu().numpy() if hasattr(a, "cpu") else a for a in allPred],axis=0)


numMolecules=allLabel.shape[1]
balancedAcc=[]

for i in range(numMolecules):
    ba=balanced_accuracy_score(allLabel[:, i], allPred[:, i])
    balancedAcc.append(ba)

avgBalancedAcc=np.mean(balancedAcc)

print("Balanced accuracies per molecule:", balancedAcc)
print("Average balanced accuracy:", avgBalancedAcc)


A
B
C
None
Data loaded
Balanced accuracies per molecule: [0.9993334726337428, 0.9241394889370538, 0.8921204284672583, 0.9981716463516261, 0.9956473829201102, 0.9971436785919647, 0.9990546181853248]
Average balanced accuracy: 0.9722301022981544


In [26]:
import os
import random
import matplotlib.pyplot as plt
import pandas as pd


def getAbundances(fileName):
    '''
    This function grabs the molecule abundances from the config files and returns them as a vector

    Inputs
    ------
    fileName: The name of the config file.

    Returns
    -------
    abundances: A vector containing the abundance information
    '''
    abundances=[0.0]*7
    moleculeNames=["O2", "N2", "CO2", "H2O", "N2O", "CH4", "H2S"]
    lines=[]
    with open(fileName) as f:
        for line in f:
            lines.append(line)

    abundances=lines[54]
    abundances=abundances.removeprefix("<ATMOSPHERE-LAYER-1>")
    abundances=abundances.split(",")

    if "None" in os.path.basename(fileName):
        #Special case
        abundances=list(map(float,abundances[2:9]))#Only gets target values, not background moolecules or 

    else:
        abundances=list(map(float,abundances[2:]))#Remove temperature profile information
    return abundances

configs=[file for file in os.listdir(r"C:\Users\Tristan\Downloads\HyPCAR3\configFiles")]

sample=random.sample(configs,1)
dataFolder=r"C:\Users\Tristan\Downloads\HyPCAR3\data"
configFolder=r"C:\Users\Tristan\Downloads\HyPCAR3\configFiles"
for file in sample:
    baseName=os.path.basename(file)
    baseName=baseName.removesuffix(".txt")
    baseName+=".csv"
    folder=baseName.split("_")[0]
    if len(folder)==2:
        folder="A"
    elif len(folder)>1:
        folder="None"

    newPath=dataFolder+f"\\{folder}"
    newPath+=f"\\{baseName}"
    data=pd.read_csv(newPath)
    w,t=list(data.iloc[0:,0]),list(data.iloc[0:,1])
  
    with open("tempTransmittance.csv","w") as f:
        for i in range(len(w)):
            f.write(str(w[i][:-3])+","+str(t[i])+"\n")
    
    print(["O2","N2","H2","CO2","H2O","CH4","NH3"])
    print(getAbundances(os.path.join(configFolder,file)))
    print(newPath)



['O2', 'N2', 'H2', 'CO2', 'H2O', 'CH4', 'NH3']
[0.0, 0.3819293813457363, 0.0, 0.0, 0.33511101651338593, 0.2576119023030262, 0.0253476998378517]
C:\Users\Tristan\Downloads\HyPCAR3\data\A\A2_9933.csv


In [1]:
import numpy as np

# Data in percentages from the table
molecules = ["O2", "N2", "CO2", "H2O", "N2O", "CH4", "H2S"]

# True values (using 0 for H2S as "Not Present")
true = np.array([20.9, 78.1, 0.03795, 0.35004, 0.000032, 0.00017, 0.0])

# Predictions from the normal model
normal = np.array([54.74, 7.28, 4.65, 26.18, 4.97, 0.08, 2.11])

# Predictions from the fine-tuned model
fine_tuned = np.array([38.7, 33.65, 27.64, 0.00097, 0.0096, 0.00019, 0.0032])

# Normalize distributions so that they sum to 1; note that the predicted values
# for both models already sum approximately to 100, but we do it explicitly.
p_true = true / true.sum()
p_normal = normal / normal.sum()
p_finetuned = fine_tuned / fine_tuned.sum()

def kl_divergence(p, q):
    """
    Computes KL divergence:  KL(p||q) = sum_i p[i] * log(p[i]/q[i])
    We add a small epsilon to avoid division or log of zero errors.
    """
    epsilon = 1e-12
    p_safe = np.maximum(p, epsilon)
    q_safe = np.maximum(q, epsilon)
    return np.sum(p_safe * np.log(p_safe / q_safe))

def cross_entropy(p, q):
    """
    Computes the cross entropy H(p,q) = -sum_i p[i] * log(q[i]).
    """
    epsilon = 1e-12
    q_safe = np.maximum(q, epsilon)
    return -np.sum(p * np.log(q_safe))

def mse(pred, true):
    """
    Computes the mean squared error between prediction and true values.
    """
    return np.mean((pred - true)**2)

# Compute KL divergence for both models (using the true distribution as p)
kl_normal = kl_divergence(p_true, p_normal)
kl_finetuned = kl_divergence(p_true, p_finetuned)

# Compute cross entropy for both models
ce_normal = cross_entropy(p_true, p_normal)
ce_finetuned = cross_entropy(p_true, p_finetuned)

# Compute MSE for both models using the raw percentages
mse_normal = mse(normal, true)
mse_finetuned = mse(fine_tuned, true)

print("KL Divergence:")
print(f"  Normal Model:     {kl_normal:.4f}")
print(f"  Fine-tuned Model: {kl_finetuned:.4f}\n")

print("Cross Entropy:")
print(f"  Normal Model:     {ce_normal:.4f}")
print(f"  Fine-tuned Model: {ce_finetuned:.4f}\n")

print("Mean Squared Error (MSE):")
print(f"  Normal Model:     {mse_normal:.4f}")
print(f"  Fine-tuned Model: {mse_finetuned:.4f}")

KL Divergence:
  Normal Model:     1.6513
  Fine-tuned Model: 0.5565

Cross Entropy:
  Normal Model:     2.1916
  Fine-tuned Model: 1.0967

Mean Squared Error (MSE):
  Normal Model:     982.6050
  Fine-tuned Model: 436.3768


In [28]:
import numpy as np

# Data in percentages from the table
molecules = ["O2", "N2", "CO2", "H2O", "N2O", "CH4", "H2S"]

# True values (using 0 for H2S as "Not Present")
true = np.array([20.9, 78.1, 0.03795, 0.35004, 0.000032, 0.00017, 0.0])

# Predictions from the normal model
normal = np.array([57.269322872161865, 7.579978555440903, 20.22359073162079, 12.640121579170227, 1.5301313251256943, 0.039025183650664985, 0.717834709212184])

# Predictions from the fine-tuned model
fine_tuned = np.array([52.26215124130249, 8.5650734603405, 22.9727640748024, 13.6198952794075, 01.8039532005786896, 0.04164810525253415, 0.7345139980316162])

# Normalize distributions so that they sum to 1; note that the predicted values
# for both models already sum approximately to 100, but we do it explicitly.
p_true = true / true.sum()
p_normal = normal / normal.sum()
p_finetuned = fine_tuned / fine_tuned.sum()

def kl_divergence(p, q):
    """
    Computes KL divergence:  KL(p||q) = sum_i p[i] * log(p[i]/q[i])
    We add a small epsilon to avoid division or log of zero errors.
    """
    epsilon = 1e-12
    p_safe = np.maximum(p, epsilon)
    q_safe = np.maximum(q, epsilon)
    return np.sum(p_safe * np.log(p_safe / q_safe))

def cross_entropy(p, q):
    """
    Computes the cross entropy H(p,q) = -sum_i p[i] * log(q[i]).
    """
    epsilon = 1e-12
    q_safe = np.maximum(q, epsilon)
    return -np.sum(p * np.log(q_safe))

def mse(pred, true):
    """
    Computes the mean squared error between prediction and true values.
    """
    return np.mean((pred - true)**2)

# Compute KL divergence for both models (using the true distribution as p)
kl_normal = kl_divergence(p_true, p_normal)
kl_finetuned = kl_divergence(p_true, p_finetuned)

# Compute cross entropy for both models
ce_normal = cross_entropy(p_true, p_normal)
ce_finetuned = cross_entropy(p_true, p_finetuned)

# Compute MSE for both models using the raw percentages
mse_normal = mse(normal, true)
mse_finetuned = mse(fine_tuned, true)

print("KL Divergence:")
print(f"  Normal Model:     {kl_normal:.4f}")
print(f"  Fine-tuned Model: {kl_finetuned:.4f}\n")

print("Cross Entropy:")
print(f"  Normal Model:     {ce_normal:.4f}")
print(f"  Fine-tuned Model: {ce_finetuned:.4f}\n")

print("Mean Squared Error (MSE):")
print(f"  Normal Model:     {mse_normal:.4f}")
print(f"  Fine-tuned Model: {mse_finetuned:.4f}")

KL Divergence:
  Normal Model:     1.6120
  Fine-tuned Model: 1.5349

Cross Entropy:
  Normal Model:     2.1522
  Fine-tuned Model: 2.0752

Mean Squared Error (MSE):
  Normal Model:     979.5950
  Fine-tuned Model: 932.0830


In [27]:
temp=[]
for val in [0.5726932287216187, 0.07579978555440903, 0.20223590731620789, 0.12640121579170227, 0.015301313251256943, 0.00039025183650664985, 0.0071783470921218395]:
    temp.append(val*100)
print(temp)


[57.269322872161865, 7.579978555440903, 20.22359073162079, 12.640121579170227, 1.5301313251256943, 0.039025183650664985, 0.717834709212184]
