## Load the libraries

In [98]:
import os
import pandas as pd
import math 
from collections import defaultdict 
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
import pickle 
# Directory (relative to the notebook) that holds the input data file read below.
DATA_DIR="../data"

## Read the data 

In [2]:
# Locate the gzip-compressed, tab-separated input file under DATA_DIR and load it.
file_path = os.path.join(DATA_DIR, "SPARCLE_IDS_curated_simplified.mtx4ml.tsv.gz")
# read_table uses a tab separator by default.
df_one_hot = pd.read_table(file_path, compression='gzip')

## Data filtering 

1. Exclude "superfamilyarch" features
2. Exclude the classes in which there are fewer than 50 entries

In [15]:
# Drop every column whose label contains 'superfamilyarch_'.
df_filter = df_one_hot.drop(columns=df_one_hot.filter(like='superfamilyarch_').columns)

In [16]:
# Shape after dropping the 'superfamilyarch_' columns.
df_filter.shape

(38167, 28055)

In [17]:
# Shape of the unfiltered frame, for comparison with the filtered one.
df_one_hot.shape

(38167, 48297)

In [21]:
# For every row, the number of rows that share its 'CurName_simplified' value.
count_series = (
    df_filter
    .groupby('CurName_simplified')['CurName_simplified']
    .transform('count')
)
# Keep only the rows whose class has at least 50 members.
df_filter50 = df_filter.loc[count_series.ge(50)]


In [22]:
# Shape after keeping only the classes with at least 50 rows.
df_filter50.shape

(7881, 28055)

In [23]:
# Keep only the columns that contain at least one value different from 0.
df_filter50 = df_filter50.loc[:, df_filter50.ne(0).any(axis=0)]

In [24]:
# Shape after removing the columns that were 0 in every remaining row.
df_filter50.shape

(7881, 4591)

In [99]:
# Serialize the filtered frame to a pickle file under DATA_DIR.
# The path is built from DATA_DIR (as for the input file) instead of repeating the directory literal.
file_path = os.path.join(DATA_DIR, "df_filter50.pickle")
with open(file_path, 'wb') as file:
    pickle.dump(df_filter50, file)

In [120]:
# Number of distinct class labels left after filtering (missing labels are not counted).
count50 = df_filter50['CurName_simplified'].nunique(dropna=True)
count50

75

In [32]:
# Target variable: the class label column.
y = df_filter50['CurName_simplified']
# Features: every column except the label.
X = df_filter50.drop(columns=['CurName_simplified'])

In [33]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (6304, 4590)
Testing set size: (1577, 4590)


In [34]:
# Test access: positional lookup of row 29 in the training features and labels.
# Only the last expression of a cell is rendered, so just the label row is displayed.
X_train.iloc[[29]]
y_train.iloc[[29]]

28872    RING finger protein
Name: CurName_simplified, dtype: object

## Fit decision tree model to the data 

In [37]:
# Initialize the Decision Tree Classifier with a fixed seed; it is fitted in the next cell.
clf = DecisionTreeClassifier(random_state=42)

In [38]:
%%timeit
# Fit the model on the training data
clf.fit(X_train, y_train)

22.9 s ± 370 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
# Predict a class label for every test row with the fitted decision tree.
predictions = clf.predict(X_test)

In [45]:
# Display the decision tree's predicted labels for the test set.
predictions

array(['protein kinase family protein',
       'glycoside hydrolase family protein', 'type I polyketide synthase',
       ..., 'GNAT family N-acetyltransferase',
       'protein kinase family protein',
       'zinc finger and BTB domain-containing protein'], dtype=object)

In [46]:
# Evaluate the decision tree: compare its predicted labels with the true test labels.
# accuracy_score comes from the import cell at the top of the notebook.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.5840202916930881


## Fit RandomForest model to the data 

In [64]:
%%time
# Initialize the Random Forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the classifier on the training set
random_forest.fit(X_train, y_train)

CPU times: user 23 s, sys: 239 ms, total: 23.2 s
Wall time: 23.3 s


In [65]:
# Predict a class label for every test row with the fitted random forest.
y_pred = random_forest.predict(X_test)

In [66]:
# Score the random forest predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.62


In [110]:
# Serialize the fitted random forest to a pickle file under DATA_DIR.
# The path is built from DATA_DIR (as for the input file) instead of repeating the directory literal.
file_path = os.path.join(DATA_DIR, "df_filter50.rf.pickle")
with open(file_path, 'wb') as file:
    pickle.dump(random_forest, file)

## Fit C-Support Vector Classification model to the data 

In [71]:
%%time
svm_clf_poly = SVC(kernel='poly', random_state=42)
mod_svm_poly = svm_clf_poly.fit(X_train, y_train)

CPU times: user 55.4 s, sys: 745 ms, total: 56.1 s
Wall time: 56.4 s


In [75]:
# Predict a class label for every test row with the fitted SVC model.
y_pred_svc = mod_svm_poly.predict(X_test)

In [76]:
# Score the SVC predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svc)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.49


In [100]:
# Column labels of the feature matrix; saved below so generate_name can rebuild a matching input row.
column_names = X_test.columns

In [101]:
# Serialize the feature column labels to a pickle file under DATA_DIR.
# The path is built from DATA_DIR (as for the input file) instead of repeating the directory literal.
file_path = os.path.join(DATA_DIR, "df_filter50.X.column.pickle")
with open(file_path, 'wb') as file:
    pickle.dump(column_names, file)

## How to use the model 

In [121]:
def generate_name(model2use, column_names, elements_to_check):
    """Predict a name for a set of identifiers using a pickled model.

    Parameters
    ----------
    model2use : str
        Path to a pickle file containing a fitted model that exposes ``predict``.
    column_names : str
        Path to a pickle file containing the feature column labels the model
        was trained on.
    elements_to_check : iterable of str, or str
        Identifiers without the ``"SpecificArch_"`` prefix. A single string is
        treated as one identifier rather than being iterated character by
        character.

    Returns
    -------
    The string ``"Unknown"`` when no feature column matches any identifier;
    otherwise the first element of ``model.predict`` applied to a one-row
    frame in which matching columns are 1 and all others are 0.

    Notes
    -----
    Matching is by substring: a column is set to 1 when any prefixed
    identifier occurs anywhere inside the column label.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so only pass paths to files you created or otherwise trust.
    # The `with` blocks close both files as soon as their content is loaded.
    with open(model2use, 'rb') as model_file:
        model = pickle.load(model_file)
    with open(column_names, 'rb') as columns_file:
        feature_columns = pickle.load(columns_file)

    # Iterating a bare string would yield single characters, not one identifier.
    if isinstance(elements_to_check, str):
        elements_to_check = [elements_to_check]
    prefixed = ["SpecificArch_" + element for element in elements_to_check]

    result_list = [
        1 if any(tag in col for tag in prefixed) else 0
        for col in feature_columns
    ]
    # Nothing matched: skip building the frame and calling the model.
    if sum(result_list) == 0:
        return "Unknown"

    df = pd.DataFrame([result_list], columns=feature_columns)
    return model.predict(df)[0]

In [122]:
# Paths to the pickled random forest and feature column labels.
# Built from DATA_DIR (as for the input file) instead of repeating the directory literal.
random_forest_file = os.path.join(DATA_DIR, "df_filter50.rf.pickle")
feature_name = os.path.join(DATA_DIR, "df_filter50.X.column.pickle")

In [123]:
# Example: two identifiers (given without the "SpecificArch_" prefix) passed to the saved random forest.
ele2check = ["CHL00001", "CHL00023"]
generate_name(random_forest_file, feature_name, ele2check)

'protein kinase family protein'

In [124]:
# Example: one identifier from the previous call plus the placeholder "XX"; the recorded output is still a prediction.
ele2check = ["CHL00001", "XX"]
generate_name(random_forest_file, feature_name, ele2check)

'protein kinase family protein'

In [125]:
# Example: only placeholder identifiers; the recorded output is the "Unknown" fallback.
ele2check = ["YY", "XX"]
generate_name(random_forest_file, feature_name, ele2check)

'Unknown'