# Complete workflow using Decision Tree Prediction

In [1]:
import os
import pandas as pd
import math 
from collections import defaultdict 
import matplotlib.pyplot as plt
import numpy as np
import pickle
import sentencepiece as spm
import json

DATA_DIR="../data"

In [2]:
# Access &read the file - Currated
file_path = os.path.join(DATA_DIR, "SPARCLE_IDS_curated_simplified.csv")
df_cur = pd.read_csv(file_path)

In [3]:
# Access &read the file - Currated
file_path = os.path.join(DATA_DIR, "SPARCLE_IDS_UNcurated_TITLES.csv.gz")
df_UNcur = pd.read_csv(file_path, compression='gzip')

## Preprocessing

### Keep essential columns 

In [4]:
df_cur = df_cur[['ArchId', 'CurName_simplified', 'superfamilyarch', 'SpecificArch', 'TitleStrings']]
df_UNcur = df_UNcur[['ArchId', 'CurName', 'superfamilyarch', 'SpecificArch', 'TitleStrings']]

### Lowercase

In [5]:
df_cur.loc[:, ['superfamilyarch', 'SpecificArch', 'TitleStrings']] = df_cur.loc[:, ['CurName_simplified', 'superfamilyarch', 'SpecificArch', 'TitleStrings']].apply(lambda x: x.str.lower())
df_UNcur.loc[:, ['superfamilyarch', 'SpecificArch', 'TitleStrings']] = df_UNcur.loc[:, ['CurName', 'superfamilyarch', 'SpecificArch', 'TitleStrings']].apply(lambda x: x.str.lower())

### Filter specific words in TitleStrings

Integration of data obtained from file "CM_XXX"

In [6]:
words_to_remove = ['domain', 'domains', ' of ', ' and ', ' the ', ' a ', 'found', 'type', 'protein', 'proteins', 'N/A', 'n/a']

In [7]:
# Creating a regex pattern to match any word in the list of words to remove
pattern = '|'.join(r"\b{}\b".format(word) for word in words_to_remove)

# Replacing the words in the 'TitleStrings' column with an empty string for df_cur
df_cur['TitleStrings'] = df_cur['TitleStrings'].str.replace(pattern, ' ', regex=True)
df_cur['TitleStrings'] = df_cur['TitleStrings'].str.replace('  +', ' ', regex=True)  # Consolidates multiple spaces into one
df_cur['TitleStrings'] = df_cur['TitleStrings'].str.replace(' , ', ', ', regex=True)

# Repeating the process for df_UNcur
df_UNcur['TitleStrings'] = df_UNcur['TitleStrings'].str.replace(pattern, ' ', regex=True)
df_UNcur['TitleStrings'] = df_UNcur['TitleStrings'].str.replace('  +', ' ', regex=True)  # Consolidates multiple spaces into one
df_UNcur['TitleStrings'] = df_UNcur['TitleStrings'].str.replace(' , ', ', ', regex=True)
df_UNcur = df_UNcur[:-1]

#### Test for overrepresented words in TitleStrings

In [8]:
from collections import Counter

In [9]:
split_TitleStrings = df_cur['TitleStrings'].str.split('|').explode()

frequency_df = pd.DataFrame(Counter(split_TitleStrings).items(), columns=['String', 'Frequency'])

print(frequency_df.sort_values(by='String').reset_index(drop=True)[19:21]['String'])
frequency_df.sort_values(by='String').reset_index(drop=True)[19:21]

19     -l-isoaspartate(d-aspartate) o-methyltransferase
20     -l-isoaspartate(d-aspartate) o-methyltransfer...
Name: String, dtype: object


Unnamed: 0,String,Frequency
19,-l-isoaspartate(d-aspartate) o-methyltransferase,3
20,-l-isoaspartate(d-aspartate) o-methyltransfer...,1


In [10]:
#removal of spaces completely to avoid this? 

In [11]:
import collections

In [12]:
strings = list(set(df_cur.loc[~df_cur['TitleStrings'].isna()]['TitleStrings']))
all_strings = []
for string in strings:
    # string = strings[0]
    string = string.replace("| ","")
    string = string.replace(", ","")
    words = string.split()
    all_strings += words


counter = collections.Counter(all_strings)
counter.most_common(20)

[('family', 3714),
 ('in', 3713),
 ('function', 3251),
 ('unknown', 2646),
 ('similar', 2383),
 ('the', 2005),
 ('and', 1629),
 ('transport', 1578),
 ('kinase', 1382),
 ('1', 1272),
 ('catalytic', 1109),
 ('subunit', 1036),
 ('metabolism]', 1022),
 ('synthase', 1017),
 ('binding', 975),
 ('2', 968),
 ('uncharacterized', 945),
 ('motif', 945),
 ('superfamily', 941),
 ('rna', 901)]

### Save Preprocessed Dataframes

In [13]:
df_cur.to_csv('../data/SPARCLE_IDS_curated_simplified_modTitleStrings.csv', index=False)
df_UNcur.to_csv('../data/SPARCLE_IDS_UNcurated_TITLES_modTitleStrings.csv', index=False)

### Extract Unique SpecificArch to Dictionary

In [14]:
# Filter rows where SpecArchs has exactly one string & Find unique SpecArchs values
df_filtered = df_cur[df_cur['SpecificArch'].map(lambda x: len(str(x).split()) == 1)]
unique_specarchs = df_filtered['SpecificArch'].drop_duplicates(keep=False)

In [15]:
# Create dictionary with SpecArchs as keys and [superfam, CurLabel] as values
result_dict = {}
for specarch in unique_specarchs:
    row = df_filtered[df_filtered['SpecificArch'] == specarch]
    result_dict[specarch] = [row['superfamilyarch'].iloc[0], row['CurName_simplified'].iloc[0]]

In [16]:
len(result_dict.keys())

17417

In [17]:
with open('../DTresults/uniqueSpecificArchFromCur_dict.json', 'w') as json_file:
    json.dump(result_dict, json_file)

In [18]:
with open('../DTresults/uniqueSpecificArchFromCur_dict.json', 'r') as json_file:
    result_dict = json.load(json_file)

## Removal of structures with 0 known architectures/superfam

### Handling duplicated ArchIds

In [19]:
import string

In [20]:
df_UNcur.shape

(178285, 5)

In [21]:
df_cur['ArchId'] = df_cur['ArchId'].fillna(0).astype(int)
df_UNcur['ArchId'] = df_UNcur['ArchId'].fillna(0).astype(int)

In [22]:
# Function to add suffixes to duplicates
def add_suffix_to_duplicates(series):
    # Count the occurrences of each value
    counts = series.value_counts()
    # Filter to get duplicates only
    duplicates = counts[counts > 1].index.tolist()
    
    # Initialize an empty column for adjusted values
    adjusted = pd.Series(index=series.index, dtype="object")
    for value in duplicates:
        # Find the indices for duplicates
        indices = series[series == value].index
        # Generate suffixes ('a', 'b', 'c', ...)
        suffixes = [chr(97 + i) for i in range(len(indices))]
        # Assign adjusted values with suffixes
        adjusted.loc[indices] = [f"{value}{suffix}" for suffix in suffixes]
    
    # Fill in unique values without changes
    adjusted = adjusted.fillna(series.astype(str))
    return adjusted

df_UNcur['ArchId_adj'] = add_suffix_to_duplicates(df_UNcur['ArchId'])

In [23]:
value_counts = df_UNcur['ArchId_adj'].value_counts()
# Filter the counts to only those values that appear more than once =duplicates
value_counts[value_counts > 1]

Series([], Name: count, dtype: int64)

### Remove unknown structures from prediction set

In [24]:
df_UNcur.shape

(178285, 6)

In [25]:
allArchs = set()  # Use a set for faster lookup
df_cur['SpecificArch'].dropna().apply(lambda x: allArchs.update(x.split(' ')))
print(len(allArchs))
df_cur['superfamilyarch'].dropna().apply(lambda x: allArchs.update(x.split(' ')))
len(allArchs)

28054


39484

In [26]:
structuresWOanyknownArchs = []

for _, row in df_UNcur.iterrows():
    specific_arch_list = str(row['SpecificArch']).split(' ') 
    specific_superfam_list = str(row['superfamilyarch']).split(' ') 
    specific_arch_list.extend(specific_superfam_list)
    # Check if at least one of the strings is in allArchs
    if not any(arch in allArchs for arch in specific_arch_list):
        # If no match is found, add the ArchId to unknownArchs
        structuresWOanyknownArchs.append(row['ArchId_adj'])
len(structuresWOanyknownArchs)

19854

#### Export Unknown

In [27]:
# Export list to a text file
with open('../DTresults/ArchIds_adj_withUntrainedSpecificArchSuperfamilyarch.txt', 'w') as file:
    for ArchId_adj in structuresWOanyknownArchs:
        file.write(f"{ArchId_adj}\n")

In [28]:
#Export rows with ArchId_adj in structuresWOanyknownArchs to a CSV file
df_notTrained = df_UNcur[df_UNcur['ArchId_adj'].isin(structuresWOanyknownArchs)]
df_notTrained.to_csv('../DTresults/ArchIds_adj_withUntrainedSpecificArchSuperfamilyarch.csv', index=False)
df_notTrained.shape

(19854, 6)

In [29]:
#Export rows with ArchId_adj in structuresWOanyknownArchs to a CSV file
df_UNcur2 = df_UNcur[-df_UNcur['ArchId_adj'].isin(structuresWOanyknownArchs)]
df_UNcur2.shape

(158431, 6)

## Prediction of Unique Uncurated Structures

In [30]:
df_filtered = df_UNcur[df_UNcur['SpecificArch'].map(lambda x: len(str(x).split()) == 1)]

unique_specarchs_UNcur = df_filtered['SpecificArch'].drop_duplicates(keep=False)

In [31]:
unique_specarchs_UNcur

20110      chl00027
20118      chl00041
20125      chl00045
20343       cog0161
20378       cog0211
            ...    
178198    pfam19904
178201    pfam19956
178206    pfam20024
178222    pfam20149
178226    pfam20155
Name: SpecificArch, Length: 20931, dtype: object

In [32]:
keys_unique = [key for key in unique_specarchs_UNcur if key in result_dict]

In [33]:
UNcur_pred1 = []
for key in keys_unique:
    # Find the row in df_UNcur corresponding to the current key
    row = df_UNcur[df_UNcur['SpecificArch'] == key]
    
    # Ensure there's exactly one matching row and the key exists in result_dict
    #if not row.empty and key in result_dict:
    superfamily_un_cur = row['superfamilyarch'].iloc[0]  # The superfamily value from df_UNcur
    # Compare with the value in result_dict
    UNcur_pred1.append({'ArchId': row['ArchId'].iloc[0], 'curName-pred': result_dict[key][1], 'type-pred': 'HashMap-prediction'})

In [34]:
df_UNcur_unique_pred = pd.DataFrame(UNcur_pred1)

In [35]:
df_UNcur_unique_pred

Unnamed: 0,ArchId,curName-pred,type-pred
0,10004105,glycoside hydrolase family protein,HashMap-prediction
1,11479260,FAD-dependent thymidylate synthase,HashMap-prediction
2,11484309,cation-efflux pump,HashMap-prediction
3,11484760,alkaline phosphatase,HashMap-prediction
4,11484649,metal-binding protein,HashMap-prediction
5,11492224,"2,3-bisphosphoglycerate-independent phosphogly...",HashMap-prediction
6,11493850,FAD-dependent thymidylate synthase,HashMap-prediction
7,11573863,VOC family protein,HashMap-prediction
8,12026142,transposase,HashMap-prediction
9,11996742,fatty acid desaturase,HashMap-prediction


In [36]:
UNcur_predFut = df_UNcur_unique_pred.to_dict('records')

## Label Encoding

Integrate generate_input_and_output_matrix_alternative.ipynb (MY) 

## Initial Testing Decision Tree

In [39]:
with open(os.path.join(DATA_DIR, 'input_output_list.pickle'), 'rb') as f:
    a = pickle.load(f)

In [40]:
X_train = np.array([d[0] for d in a])
y_train = np.array(df_cur['CurName_simplified'])[:-1]

In [41]:
from sklearn.model_selection import train_test_split
X = np.array([d[0] for d in a])
y = np.array(df_cur['CurName_simplified'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (34212, 450)
Testing set size: (8554, 450)


### Decision Tree Classifier

In [42]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [43]:
# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=30)

In [44]:
# Fit the model on the training data
clf.fit(X_train, y_train)

In [45]:
# Make predictions and evaluate the model
predictions = clf.predict(X_test)

In [46]:
print(clf.tree_.node_count)
print(clf.get_n_leaves())
print(clf.get_depth())

36139
18070
135


### Comparison of real output and prediction - decoded

In [47]:
sp = spm.SentencePieceProcessor(model_file='curnames.model')

In [48]:
y_test_decoded = []
for row in y_test:
    # Decode each token in the row, skipping 0 and -1, and concatenate them
    decoded_row = ''.join(sp.decode(token) for token in row if token not in [0, -1])
    # Append the decoded and concatenated string to the list
    y_test_decoded.append(decoded_row)
y_test_decoded = np.array(y_test_decoded)

In [49]:
predictions_decoded = []
for row in predictions:
    # Decode each token in the row, skipping 0 and -1, and concatenate them
    decoded_row = ''.join(sp.decode(token) for token in row if token not in [0, -1])
    # Append the decoded and concatenated string to the list
    predictions_decoded.append(decoded_row)
predictions_decoded = np.array(predictions_decoded)

In [50]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test_decoded, predictions_decoded)

print("Accuracy:", accuracy)

Accuracy: 0.3241758241758242


In [51]:
comparison_df = pd.DataFrame({'Actual': y_test_decoded, 'Predicted': predictions_decoded, 'Equal': predictions_decoded==y_test_decoded})#.sort_values(by='Equal')
#print(len(comparison_df), len(comparison_df[comparison_df["Equal"] == True]))
comparison_df.head(25)

Unnamed: 0,Actual,Predicted,Equal
0,transposase,transposase,True
1,YccV-like domain-containing F-box protein,YccV-like domain-containing F-box protein,True
2,FAD-binding and (Fe-S)-binding domain-containi...,FAD-binding and (Fe-S)-binding domain-containi...,True
3,23S rRNA uridine -methyltransferase,23S rRNA guanine methyltransferase,False
4,non-ribosomal peptide synthetase,long-chain fatty acid--CoA ligase,False
5,translation initiation factor IF-2,translation initiation factor IF-2,True
6,LacI family DNA-binding transcriptional regulator,LacI family DNA-binding transcriptional regulator,True
7,potassium-transporting ATPase subunit,type IV pilus assembly protein,False
8,signal recognition particle protein,signal recognition particle protein,True
9,heterocycloanthracin/sonorensin family bacteri...,LamB/YcsF family protein,False


### Fuzzy Matching to evaluate results

In [52]:
import sys, os, json
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
from pathlib import Path
import Levenshtein

In [53]:
# Define a threshold for considering a prediction as correct
similarity_threshold = 80

def calculate_similarity_scores(predictions, actuals, threshold=similarity_threshold):
    correct_predictions = 0
    total_predictions = len(predictions)
    
    # Scores for different types of comparisons
    similarity_scores = []
    partial_scores = []
    token_sort_scores = []
    token_set_scores = []
    fuzzyOut = []
    
    for predicted, actual in zip(predictions, actuals):
        # Convert to lower case for case-insensitive comparison
        predicted_lower = predicted.lower()
        actual_lower = actual.lower()
        
        # Calculate similarity score and other types of scores
        similarity = fuzz.ratio(predicted_lower, actual_lower)
        partial = fuzz.partial_ratio(predicted_lower, actual_lower)
        token_sort = fuzz.token_sort_ratio(predicted_lower, actual_lower)
        token_set = fuzz.token_set_ratio(predicted_lower, actual_lower)
        
        # Record the scores
        similarity_scores.append(similarity)
        partial_scores.append(partial)
        token_sort_scores.append(token_sort)
        token_set_scores.append(token_set)
        
        # Check if the similarity score is above the threshold
        if (similarity >= threshold): #& (similarity < 98):
            correct_predictions += 1
            #print(actual, predicted)
            #fuzzyOut.append({'Actual': actual, 'Predicted': predicted})

    #df_fuzzy = pd.DataFrame(fuzzyOut)
    
    # Calculate adjusted accuracy
    adjusted_accuracy = correct_predictions / total_predictions
    
    return adjusted_accuracy, similarity_scores, partial_scores, token_sort_scores, token_set_scores#, df_fuzzy

In [54]:
print(len(comparison_df[comparison_df["Equal"] == True])/len(comparison_df)*100)

32.417582417582416


In [55]:
# Calculate scores and adjusted accuracy
adjusted_accuracy, similarity_scores, partial_scores, token_sort_scores, token_set_scores = calculate_similarity_scores(predictions, y_test)

print(f"Adjusted Accuracy: {adjusted_accuracy*100:.2f}%")
#print("Similarity Scores:", similarity_scores)
#print("Partial Match Scores:", partial_scores)
#print("Token Sort Scores:", token_sort_scores)
#print("Token Set Scores:", token_set_scores)
#fuzzy
#fuzzy.tail(14)#[~fuzzy.map(lambda x: 'DUF' in str(x)).any(axis=1)].tail(22)


Adjusted Accuracy: 38.86%


## Training Decision Tree on all curated Structures

In [37]:
with open(os.path.join(DATA_DIR, 'input_output_list.pickle'), 'rb') as f:
    pk_cur = pickle.load(f)

In [38]:
X_train = np.array([d[0] for d in pk_cur])
y_train = np.array(df_cur['CurName_simplified'])#[:-1]

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [40]:
# Initialize the Decision Tree Classifier
clfFull = DecisionTreeClassifier(random_state=30)

In [41]:
# Fit the model on the training data
clfFull.fit(X_train, y_train)

## Prediction of Uncurated

### Load dict with decoded uncurated structure

In [42]:
import gzip

In [43]:
#would be good to re-do with ArchId_adj
with gzip.open(os.path.join(DATA_DIR, 'encoded_features_uncurated.pickle.gz'), 'rb') as f:
    UNcur_dict = pickle.load(f)

In [44]:
len(UNcur_dict)

173287

In [45]:
#remove those Ids that have been previously predicted already! (Unique pred)
values_pred =  df_UNcur_unique_pred['ArchId'].apply(lambda x: str(int(x)) if pd.notnull(x) else x).tolist()
values_pred

for value in values_pred:
    UNcur_dict.pop(value, None) 

In [46]:
len(UNcur_dict)

173274

In [47]:
#remove those Ids that do not have a single arch/superfam training was done with
for value in structuresWOanyknownArchs:
    UNcur_dict.pop(value, None) 

In [48]:
len(UNcur_dict)

153793

In [49]:
#remove duplications (just keep those with ArchId_adj that are equal to ArchId in UNcur_dict)
nonUnique_archid_adj = set(df_UNcur['ArchId_adj'])
# Filter the dictionary to only include keys that are present in 'ArchId_adj'(filtered)
filtered_dict = {key: value for key, value in UNcur_dict.items() if key in nonUnique_archid_adj}

In [50]:
len(filtered_dict)

148988

### Predict and de-code

In [51]:
sp = spm.SentencePieceProcessor(model_file='curnames.model')

In [52]:
UNcur_pred2 = UNcur_pred1
for key, value in filtered_dict.items():
    #some len(UNcur_dict['234922']) == 462
    if (len(UNcur_dict[str(key)]) == 450):
        pred_wArchId = clfFull.predict(np.array([UNcur_dict[str(key)]]))
        decoded_pred = ''.join(sp.decode(token) for token in pred_wArchId[0] if token not in [0, -1])
        # Append the key and prediction to the DataFrame
        UNcur_pred2.append({'ArchId': key, 'curName-pred': decoded_pred, 'type-pred': 'DT-pred'})

In [53]:
df_UNcur_pred2 = pd.DataFrame(UNcur_pred2)

In [54]:
df_UNcur_pred2

Unnamed: 0,ArchId,curName-pred,type-pred
0,10004105,glycoside hydrolase family protein,HashMap-prediction
1,11479260,FAD-dependent thymidylate synthase,HashMap-prediction
2,11484309,cation-efflux pump,HashMap-prediction
3,11484760,alkaline phosphatase,HashMap-prediction
4,11484649,metal-binding protein,HashMap-prediction
...,...,...,...
148977,18385239,GATA-type transcription factor,DT-pred
148978,18386111,nitrogen regulation protein NR(II),DT-pred
148979,18386208,importin-beta N-terminal domain-containing pro...,DT-pred
148980,18386403,glycosyltransferase,DT-pred


### Merge with df_UNcur_pred, structuresWOanyknownArchs

In [55]:
df_UNcur['ArchId'] = df_UNcur['ArchId'].astype(str)
df_UNcur_pred2['ArchId'] = df_UNcur_pred2['ArchId'].astype(str)

In [56]:
df_UNcur_predFull = pd.merge(df_UNcur, df_UNcur_pred2, on='ArchId', how='left')
# Replace NaN values in 'type-pred' column with 'not predicted'
df_UNcur_predFull['type-pred'] = df_UNcur_predFull['type-pred'].fillna('not predicted')

In [57]:
df_UNcur_predFull

Unnamed: 0,ArchId,CurName,superfamilyarch,SpecificArch,TitleStrings,ArchId_adj,curName-pred,type-pred
0,48,Gag_p24 domain-containing protein,gag_p24,,,48,NADH dehydrogenase subunit,DT-pred
1,39,Gag_p17 domain-containing protein,gag_p17,,,39,NADH dehydrogenase subunit,DT-pred
2,28,Ribosomal_S18 domain-containing protein,ribosomal_s18,,,28,NADH dehydrogenase subunit,DT-pred
3,52,VPR domain-containing protein,vpr,,,52,NADH dehydrogenase subunit,DT-pred
4,35,TOPRIM domain-containing protein,toprim,,,35,NADH dehydrogenase subunit,DT-pred
...,...,...,...,...,...,...,...,...
178280,18385239,zinc-finger associated domain-containing protein,flywch zf-ad,smart00868 cl04548 pfam04500 cl04548,zinc-finger associated (zf-ad) | flywch | flyw...,18385239,GATA-type transcription factor,DT-pred
178281,18386111,Haemagg_act domain-containing protein,haemagg_act duf2345,smart00912 cl44593,haemagglutination activity,18386111,nitrogen regulation protein NR(II),DT-pred
178282,18386208,importin-beta N-terminal domain-containing pro...,ibn_n exportin-5,smart00913 cl44708,importin-beta n-terminal,18386208,importin-beta N-terminal domain-containing pro...,DT-pred
178283,18386403,S53 family peptidase; S8/S53 family peptidase,pro-peptidase_s53 peptidases_s8_s53 choice_anch_j,smart00944 cd04056 cl45621,"pro-kumamolisin, activation | peptidase in s53...",18386403,glycosyltransferase,DT-pred


In [58]:
#export
df_UNcur_predFull.to_csv('../DTresults/SPARCLE_IDS_UNcurated_DTprediction.csv', index=False)

In [59]:
#test by hand - prediction of specific 
clfFull.predict(np.array([UNcur_dict[str(10019828)]]))

array(['serine dehydratase subunit alpha family protein'], dtype=object)

## Training Decision Tree on all curated Structures - SpecificArch and Superfamily only

In [60]:
with open(os.path.join(DATA_DIR, 'input_output_list.pickle'), 'rb') as f:
    pk_cur = pickle.load(f)

In [61]:
X_train = np.array([d[0] for d in pk_cur])[:, :-400]
y_train = np.array(df_cur['CurName_simplified'])#[:-1]

In [62]:
X_train.shape

(42766, 50)

In [63]:
# Initialize the Decision Tree Classifier
clfArchFam = DecisionTreeClassifier(random_state=30)

In [64]:
# Fit the model on the training data
clfArchFam.fit(X_train, y_train)

## Prediction of Uncurated

### Predict and de-code

In [65]:
sp = spm.SentencePieceProcessor(model_file='curnames.model')

In [66]:
UNcur_pred3 = UNcur_predFut
print(len(UNcur_predFut))
for key, value in filtered_dict.items():
    #some len(UNcur_dict['234922']) == 462
    if (len(UNcur_dict[str(key)]) == 450):
        pred_wArchId = clfArchFam.predict(np.array([UNcur_dict[str(key)][:-400]]))
        decoded_pred = ''.join(sp.decode(token) for token in pred_wArchId[0] if token not in [0, -1])
        # Append the key and prediction to the DataFrame
        UNcur_pred3.append({'ArchId': key, 'curName-pred': decoded_pred, 'type-pred': 'DT-pred'})

13


In [67]:
df_UNcur_pred3 = pd.DataFrame(UNcur_pred3)

In [68]:
df_UNcur_pred3['ArchId'] = df_UNcur_pred3['ArchId'].astype(str)

In [69]:
df_UNcur_predArchFam = pd.merge(df_UNcur, df_UNcur_pred3, on='ArchId', how='left')
# Replace NaN values in 'type-pred' column with 'not predicted'
df_UNcur_predArchFam['type-pred'] = df_UNcur_predArchFam['type-pred'].fillna('not predicted')

In [70]:
#export
df_UNcur_predArchFam.to_csv('../DTresults/SPARCLE_IDS_UNcurated_DTpredictionArchFamOnly.csv', index=False)