In [1]:
import pandas as pd
import re
from transformers import BertTokenizer, AutoModelForSequenceClassification, PreTrainedTokenizerFast, LineByLineTextDataset, pipeline, BertConfig, AutoTokenizer, BertModel,BertForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from tokenizers import Tokenizer
from tqdm import tqdm
import evaluate
from sklearn.preprocessing import LabelEncoder
import numpy as np
from datasets import Dataset
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoConfig
import os
import json

from script.tokenizer import KmerTokenizer

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.current_device(), device

(0, device(type='cuda'))

### Import data

In [3]:
train_df = pd.read_csv(r"C:\Users\Auguste Verdier\Desktop\ADNe\BouillaClip\Data\TeleoSplit\train_aug_nodupl_300.csv")
val_df = pd.read_csv(r"C:\Users\Auguste Verdier\Desktop\ADNe\BouillaClip\Data\TeleoSplit\val_genera.csv")
test_df  = pd.read_csv(r"C:\Users\Auguste Verdier\Desktop\ADNe\BouillaClip\Data\TeleoSplit\test_genera.csv")

In [4]:
train_df

Unnamed: 0,sequence,phylum,class,order,family,genus,species
0,CCCCAAACAATAAACACACGAAACTAACTAAAATGCTTCGAACCGT...,Chordata,Actinopteri,Scombriformes,Gempylidae,Promethichthys,Promethichthysprometheus
1,CCCCAAGTTCAATATATCCTTCTAACTAAGAAGTTAGCCGAACAAA...,Chordata,Actinopteri,Salmoniformes,Salmonidae,Oncorhynchus,Oncorhynchusmasou
2,CCCCGAACTTAACCCACGAACCTTACCTAAACTGTTTTACATGAAA...,Chordata,Actinopteri,Holocentriformes,Holocentridae,Neoniphon,Neoniphonargenteus
3,CCCCTGTTAAACAGCAACCAATGTAAATAACACAAAAGCACCAACG...,Chordata,Actinopteri,Cypriniformes,Gastromyzontidae,Yaoshania,Yaoshaniapachychilus
4,CCCCAAGCTTCCGGCCCTAATTAATTAAAACCCTACAACTGCAAAG...,Chordata,Actinopteri,Istiophoriformes,Istiophoridae,Istiophorus,Istiophorusplatypterus
...,...,...,...,...,...,...,...
91010,CCCCAAACCATTAGAATAAGTAATTAGACCTCCATACGACAAAGGG...,Chordata,Actinopteri,Lophiiformes,Diceratiidae,Synthetic,Diceratiaspileatus
91011,CACCAAACCAATAGAATGAGTATTTTAACGCCGTTAGGACAAAGGG...,Chordata,Actinopteri,Lophiiformes,Diceratiidae,Synthetic,Diceratiaspileatus
91012,CCCCAAATCATTAGACTTGGGAATTAAATCATCAGAATACAAAGGG...,Chordata,Actinopteri,Lophiiformes,Diceratiidae,Synthetic,Diceratiaspileatus
91013,CCCCACACCAATAAACTACGTCATTAAACCACCAGGAAACAAAGGG...,Chordata,Actinopteri,Lophiiformes,Diceratiidae,Synthetic,Diceratiaspileatus


## Configure the model

In [5]:
model_path_save=r"C:\Users\Auguste Verdier\Desktop\ADNe\BouillaClip\Model\genera_300_medium_3_mer\checkpoint-85335"

In [6]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the label encoder on the combined data
label_encoder.fit(pd.concat([train_df['family'], val_df['family']]))

# Encode the labels
train_labels = label_encoder.transform(train_df['family'])
val_labels = label_encoder.transform(val_df['family'])

In [7]:
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}

## TOKENIZER
tokenizer=KmerTokenizer(3, trust_remote_code=True,add_special_tokens=False)
#tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(model_path_save, trust_remote_code=True)

if not tokenizer.pad_token:
    print("add one")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [20]:
seq="GCCCCCGTGAGTAATGCCCTGACAGTTTTATATCCTAAAACGAGGA"
print(tokenizer.tokenize(seq))

['GCC', 'CCC', 'CCC', 'CCC', 'CCG', 'CGT', 'GTG', 'TGA', 'GAG', 'AGT', 'GTA', 'TAA', 'AAT', 'ATG', 'TGC', 'GCC', 'CCC', 'CCT', 'CTG', 'TGA', 'GAC', 'ACA', 'CAG', 'AGT', 'GTT', 'TTT', 'TTT', 'TTA', 'TAT', 'ATA', 'TAT', 'ATC', 'TCC', 'CCT', 'CTA', 'TAA', 'AAA', 'AAA', 'AAC', 'ACG', 'CGA', 'GAG', 'AGG', 'GGA']


In [18]:
config = BertConfig.from_pretrained(os.path.join(model_path_save,"config.json"), 
                                    num_labels=len(id2label), 
                                    max_position_embeddings=510,
                                    id2label=id2label,
                                    label2id=label2id
)

model = AutoModelForSequenceClassification.from_pretrained(model_path_save, trust_remote_code=True, ignore_mismatched_sizes=True, config=config).to(device)

model.config.id2label = id2label
model.config.label2id = label2id



In [19]:
train_encodings = tokenizer(train_df['sequence'].tolist(), truncation=True, max_length=510)
val_encodings = tokenizer(val_df['sequence'].tolist(), truncation=True, max_length=510)

train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels
})

In [21]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Test the model

In [23]:
text = "CCCCTGTCAAAATGCAATAAAGATATTTAATACCAAAGCGCTGACAAGGGGAGGCAAGTCGTAA" # Leuciscidae
classifier = pipeline("sentiment-analysis", model=trainer.model, tokenizer=tokenizer,device=0)
results = classifier(text)

labels = [result['label'] for result in results]
scores = [result['score'] for result in results]

label_scores = dict(zip(labels, scores))
label_scores

torch.Size([1, 64, 768]) torch.Size([1, 64, 768])
tensor([[[ 0.0287,  0.0005,  0.1000,  ...,  0.0044, -0.0054, -0.0020],
         [ 0.0287,  0.0005,  0.1000,  ...,  0.0044, -0.0054, -0.0020],
         [ 0.0287,  0.0005,  0.1000,  ...,  0.0044, -0.0054, -0.0020],
         ...,
         [ 0.0287,  0.0005,  0.1000,  ...,  0.0044, -0.0054, -0.0020],
         [ 0.0287,  0.0005,  0.1000,  ...,  0.0044, -0.0054, -0.0020],
         [ 0.0287,  0.0005,  0.1000,  ...,  0.0044, -0.0054, -0.0020]]],
       device='cuda:0')
tensor([[[-0.0423, -0.0375,  0.0128,  ..., -0.0286, -0.0231, -0.0274],
         [-0.0106, -0.0621, -0.0159,  ...,  0.0545,  0.0286, -0.0168],
         [-0.0106, -0.0621, -0.0159,  ...,  0.0545,  0.0286, -0.0168],
         ...,
         [ 0.0496,  0.0425, -0.0914,  ...,  0.0228, -0.0616, -0.0226],
         [-0.0428, -0.0500, -0.0098,  ..., -0.0225,  0.0258,  0.0499],
         [-0.0269, -0.0381, -0.0309,  ..., -0.0306, -0.0243, -0.0291]]],
       device='cuda:0')


{'Leuciscidae': 0.7230259776115417}

In [26]:
def calculate_predictions(file_path):

    
    # Initialize lists to store the data
    predictions = []
    labels = []

    dict_score=dict()



    testfile = pd.read_csv(file_path)

    #sp=read_list('test_1_spe.json')
    sp=testfile['species'].unique()
    # for f in testfile['family'].unique():
    #     dict_score[f]=(0,0)

    

    for index, row in tqdm(testfile.iterrows(),total=testfile.shape[0]):
        #print(row['sequence'])
        lab=row['family']
        

        if row['species'] in sp:
        
            
            if lab not in dict_score :
                dict_score[lab]=[0,0]
            
            print(row['sequence'])
            #se="CCCCTGTCAAAATGCAATAAAGATATTTAATACCAAAGCGCTGACAAGGGGAGGCAAGTCGTAA"
            pred = classifier(row['sequence'])[0]['label']
            # pred=classifier(se)[0]['label'] 
            #print(pred,lab,1*(lab==pred))


            predictions.append(pred)

            labels.append(row['family'])

            succes,total=dict_score[lab]


            dict_score[lab]=succes+1*(lab==pred),total+1
        #print(pred,lab,pred==lab)


    # Calculate accuracy
    #accuracy = np.mean(np.array(predictions) == np.array(labels)) * 100

    #labs = set(predictions + labels)
    
    # Create a confusion matrix
    #cm = confusion_matrix(labels, predictions, labs)
    
    return labels, predictions, dict_score

# Call the function with the file path
labels, predictions, dict_score = calculate_predictions(r"C:\Users\Auguste Verdier\Desktop\ADNe\DATASET\LAST 12S\train.csv")

  0%|          | 0/92463 [00:00<?, ?it/s]

GCCCCCGTGAGTAATGCCCTGACAGTTTTATATCCTAAAACGAGGAGCTGGCATCAGGCACAACCCCCCGTTAGCCCACGACGCCTCGCTTAGCCACACCCCCAAGGGAATTTCAGCAGTGATAAACCTTAAGCCATAAGTGAAAACTAGACTTAGTAACAGCTAATAAGGGCTGGTAAAACTCGTGCCAGCCGCCGCGGTTATACGAGTGGCCCAAGTTGATAAAAACCGGCGTAAAGCGTGGTTAAGGTCATACTACAAACTAAAGCCGAACCTCCTCACAGCAGTTATACGCTTATGAAGAAACTGAAGCTCCCCCACGAAAGTGGCTTTACTACCCCACCTGACCCCACGAAAGCTATGGCCCAAACTGGGATTAGAGACCCCACTATGCATAGCTGTAAACCCTGACAGATTTTTACATCCCCTGTCCGCCCGGGTACTACGCGCGTCAGCGTAAAACCCAAAGGACTTGGCGGTTCTTTAGACCCCCTAGAGGAGCCTGTTCTATAACCGATAATCCCCGTTAAACCTCACCCTCTCTTGCCTATCCCGCCTATATACCGCCGTCGTCAGCTACCCCTGTGAAGGATGAACAGCTAGCAAGATTGGTACCACCCAAAACGTCAGGTCCAGGTGTAGCGTATGAGAGGGGCAGAAATGGGCTACATTCGCTAATTTAGCGAACACGAACGATGTACTGAAAAAATACATCCGAAGGAGGATTTAGCAGTAAGTAGGAAGCAGAGCGTCCCACTGAAGCCGGCTCTGAAGTGCGTACACACCGCCCGTCACTTTCCCCAAACAATAAACACACGAAACTAACTAAAATGCTTCGAACCGTTAAGGGGAAACAAGTCGTAACATGGTAAGTGTACCGGAAGGTGTACTTGGCAATATCC
torch.Size([1, 904, 768]) torch.Size([1, 902, 768])
tensor([[[ 0.0287,  0.0005,  0.1000,  ...,  0




RuntimeError: The size of tensor a (904) must match the size of tensor b (902) at non-singleton dimension 1

In [53]:
np.mean(np.array(predictions) == np.array(labels)) * 100

1.6323357947920714

In [54]:
print(len(predictions))

2573


In [35]:
dict_score

{'Carangidae': (7, 9),
 'Cyprinidae': (38, 39),
 'Salmonidae': (6, 6),
 'Nemipteridae': (2, 2),
 'Rajidae': (6, 6),
 'Carcharhinidae': (10, 10),
 'Blenniidae': (2, 7),
 'Cyclopsettidae': (1, 1),
 'Bovichtidae': (1, 1),
 'Botiidae': (5, 6),
 'Gobiidae': (44, 55),
 'Serranidae': (11, 14),
 'Odontobutidae': (1, 3),
 'Esocidae': (2, 2),
 'Apogonidae': (18, 20),
 'Nemacheilidae': (13, 13),
 'Anguillidae': (4, 4),
 'Syngnathidae': (5, 7),
 'Acheilognathidae': (6, 8),
 'Retropinnidae': (1, 1),
 'Pseudochromidae': (0, 1),
 'Lotidae': (1, 1),
 'Aphaniidae': (4, 4),
 'Leuciscidae': (40, 42),
 'Cottidae': (18, 18),
 'Macrouridae': (8, 8),
 'Scombridae': (5, 6),
 'Badidae': (0, 1),
 'Gobionidae': (17, 19),
 'Liparidae': (3, 3),
 'Etmopteridae': (5, 5),
 'Rivulidae': (8, 9),
 'Acipenseridae': (6, 6),
 'Zoarcidae': (3, 3),
 'Aulorhynchidae': (0, 1),
 'Gastromyzontidae': (7, 7),
 'Synanceiidae': (0, 1),
 'Xenocyprididae': (9, 10),
 'Labridae': (17, 23),
 'Zenionidae': (1, 1),
 'Grammicolepididae': (2

In [18]:
#np.mean(np.array(predictions_train) == np.array(labels_train)) * 100

In [55]:
count_val=test_df['family'].value_counts()
family=count_val.keys()

macro_no_augment=(0,0)
macro_augment=(0,0)

micro_acc_no_augment=[]
micro_acc_augment=[]

for f in family:

    succes,total=dict_score[f]

    # if count_val[f] > 97:

        

    #     macro_no_augment=macro_no_augment[0]+succes,macro_no_augment[1]+total
    #     micro_acc_no_augment.append(succes/total)

    # else :
    #     macro_augment=macro_augment[0]+succes,macro_augment[1]+total
    #     micro_acc_augment.append(succes/total)

    
    macro_augment=macro_augment[0]+succes,macro_augment[1]+total
    micro_acc_augment.append(succes/total)

    
    

# print(f'Macro accuracy sur famille non augmenté ( {len(micro_acc_no_augment)} familles ) : {100*macro_no_augment[0]/macro_no_augment[1]}')
# print(f'Micro accuracy sur famille non augmenté ( {len(micro_acc_no_augment)} familles ) : {100*np.mean(micro_acc_no_augment)}')


print(f'Macro accuracy sur famille augmenté ( {len(micro_acc_augment)} familles ) : {100*macro_augment[0]/macro_augment[1]}')
print(f'Micro accuracy sur famille augmenté ( {len(micro_acc_augment)} familles ) : {100*np.mean(micro_acc_augment)}')




Macro accuracy sur famille augmenté ( 302 familles ) : 1.6323357947920716
Micro accuracy sur famille augmenté ( 302 familles ) : 0.46156933574152115


In [20]:
count_val=val_df['family'].value_counts()
family=count_val.keys()

macro_no_augment=(0,0)
macro_augment=(0,0)

micro_acc_no_augment=[]
micro_acc_augment=[]

for f in family:

    succes,total=dict_score_train[f]

    if count_val[f] > 97:

        

        macro_no_augment=macro_no_augment[0]+succes,macro_no_augment[1]+total
        micro_acc_no_augment.append(succes/total)

    else :
        macro_augment=macro_augment[0]+succes,macro_augment[1]+total
        micro_acc_augment.append(succes/total)

print(f'Macro accuracy sur famille non augmenté ( {len(micro_acc_no_augment)} familles ) : {100*macro_no_augment[0]/macro_no_augment[1]}')
print(f'Micro accuracy sur famille non augmenté ( {len(micro_acc_no_augment)} familles ) : {100*np.mean(micro_acc_no_augment)}')


print(f'Macro accuracy sur famille augmenté ( {len(micro_acc_augment)} familles ) : {100*macro_augment[0]/macro_augment[1]}')
print(f'Micro accuracy sur famille augmenté ( {len(micro_acc_augment)} familles ) : {100*np.mean(micro_acc_augment)}')

NameError: name 'dict_score_train' is not defined

In [None]:

count_val=val_df['family'].value_counts()
family=count_val.keys()

macro=(0,0)

micro=[]

for f in family:

    succes,total=dict_score[f]

    

        

    macro=macro[0]+succes,macro[1]+total
    micro.append(succes/total)

    

    

print(f'accuracy  : {100*macro[0]/macro[1]}')
print(f'accurcy moyenne sur famille ( {len(micro)} familles ) : {100*np.mean(micro)}')







accuracy  : 95.5397442326981
accurcy moyenne sur famille ( 1283 familles ) : 95.45409594360271


In [None]:
labels_train,predictions_train, dict_score_train = calculate_predictions(r"C:\Users\Auguste Verdier\Desktop\ADNe\BerTeleo\data\Data12S\FinetuningData\RawData\full_dataset_teleo.csv")

100%|██████████| 114567/114567 [24:08<00:00, 79.10it/s]


In [None]:

count_val=val_df['family'].value_counts()
family=count_val.keys()

macro=(0,0)

micro=[]

for f in family:

    succes,total=dict_score_train[f]

    

        

    macro=macro[0]+succes,macro[1]+total
    micro.append(succes/total)

    

    

print(f'accuracy  : {100*macro[0]/macro[1]}')
print(f'accurcy moyenne sur famille ( {len(micro)} familles ) : {100*np.mean(micro)}')

accuracy  : 2.2320875855604094
accurcy moyenne sur famille ( 1283 familles ) : 4.0808801283304215
